commit dd8f4b760d0f4f883031e7a3c6926240b43046d1
Author: MSVSphere Packaging Team
Date:   Fri Sep 22 17:43:45 2023 +0300

    import gmp-6.2.0-13.el9

diff --git a/.gitignore b/.gitignore
new file mode 100644
index 0000000..024739e
--- /dev/null
+++ b/.gitignore
@@ -0,0 +1 @@
+SOURCES/gmp-6.2.0.tar.bz2
diff --git a/.gmp.metadata b/.gmp.metadata
new file mode 100644
index 0000000..062ceb6
--- /dev/null
+++ b/.gmp.metadata
@@ -0,0 +1 @@
+5e9341d3807bc7505376f9ed9f5c1c6c57050aa6 SOURCES/gmp-6.2.0.tar.bz2
diff --git a/SOURCES/cve-2021-43618.patch b/SOURCES/cve-2021-43618.patch
new file mode 100644
index 0000000..f741972
--- /dev/null
+++ b/SOURCES/cve-2021-43618.patch
@@ -0,0 +1,25 @@
+
+# HG changeset patch
+# User Marco Bodrato
+# Date 1634836009 -7200
+# Node ID 561a9c25298e17bb01896801ff353546c6923dbd
+# Parent  e1fd9db13b475209a864577237ea4b9105b3e96e
+mpz/inp_raw.c: Avoid bit size overflows
+
+diff -r e1fd9db13b47 -r 561a9c25298e mpz/inp_raw.c
+--- a/mpz/inp_raw.c	Tue Dec 22 23:49:51 2020 +0100
++++ b/mpz/inp_raw.c	Thu Oct 21 19:06:49 2021 +0200
+@@ -88,8 +88,11 @@
+ 
+   abs_csize = ABS (csize);
+ 
++  if (UNLIKELY (abs_csize > ~(mp_bitcnt_t) 0 / 8))
++    return 0; /* Bit size overflows */
++
+   /* round up to a multiple of limbs */
+-  abs_xsize = BITS_TO_LIMBS (abs_csize*8);
++  abs_xsize = BITS_TO_LIMBS ((mp_bitcnt_t) abs_csize * 8);
+ 
+   if (abs_xsize != 0)
+     {
+
diff --git a/SOURCES/gmp-6.0.0-debuginfo.patch b/SOURCES/gmp-6.0.0-debuginfo.patch
new file mode 100644
index 0000000..bb72839
--- /dev/null
+++ b/SOURCES/gmp-6.0.0-debuginfo.patch
@@ -0,0 +1,21 @@
+diff -up wrk/mpn/m4-ccas.wrk wrk/mpn/m4-ccas
+--- wrk/mpn/m4-ccas.wrk	2015-04-02 16:44:03.645305407 +0200
++++ wrk/mpn/m4-ccas	2015-04-02 16:21:57.893870969 +0200
+@@ -104,4 +104,4 @@ echo "$CC"
+ $CC || exit
+ 
+ # Comment this out to preserve .s intermediates
+-rm -f $TMP
++#rm -f $TMP
+diff -up wrk/mpn/Makeasm.am.wrk wrk/mpn/Makeasm.am
+--- wrk/mpn/Makeasm.am.wrk	2015-04-02 16:42:41.692278742 +0200
++++ wrk/mpn/Makeasm.am	2015-04-02 16:21:57.891870945 +0200
+@@ -66,7 +66,7 @@ SUFFIXES = .s .S .asm
+ 
+ 
+ # can be overridden during development, eg. "make RM_TMP=: mul_1.lo"
+-RM_TMP = rm -f
++RM_TMP = true
+ 
+ 
+ # .S assembler, preprocessed with cpp.
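The mpz/inp_raw.c change above (CVE-2021-43618) guards the byte-count-to-bit-count conversion: in the old code, a crafted stream could declare a size large enough that abs_csize*8 wrapped around in mp_bitcnt_t, so BITS_TO_LIMBS computed too few limbs. The standalone C sketch below is illustration only, not part of the patch; the bytes_to_bits helper is hypothetical and unsigned long stands in for GMP's mp_bitcnt_t, but the guard is the same one the patch adds before the multiplication.

#include <stdio.h>

/* Stand-in for GMP's mp_bitcnt_t (an unsigned integer type wide enough
   to count the bits of an operand). */
typedef unsigned long bitcnt_t;

/* Hypothetical helper mirroring the patched check in mpz/inp_raw.c:
   refuse byte counts whose bit count would not fit in bitcnt_t. */
static bitcnt_t
bytes_to_bits (size_t nbytes)
{
  if (nbytes > ~(bitcnt_t) 0 / 8)   /* nbytes * 8 would wrap around */
    return 0;                       /* caller treats 0 as "reject input" */
  return (bitcnt_t) nbytes * 8;     /* cast first, then multiply */
}

int
main (void)
{
  printf ("%lu\n", bytes_to_bits (1000));         /* 8000 */
  printf ("%lu\n", bytes_to_bits ((size_t) -1));  /* huge count: 0 (rejected) on LP64 */
  return 0;
}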
diff --git a/SOURCES/gmp-intel-cet.patch b/SOURCES/gmp-intel-cet.patch new file mode 100644 index 0000000..62e790d --- /dev/null +++ b/SOURCES/gmp-intel-cet.patch @@ -0,0 +1,3518 @@ +From 2db44789f76e93fb641fc0ced30f35c163ab89ba Mon Sep 17 00:00:00 2001 +From: rpm-build +Date: Tue, 17 Aug 2021 15:33:21 +0200 +Subject: [PATCH] Intel CET: Define the base macros + +--- + acinclude.m4 | 100 +++++++++++++++++++++++++ + configure.ac | 1 + + mpn/x86/aors_n.asm | 5 +- + mpn/x86/aorsmul_1.asm | 1 + + mpn/x86/atom/sse2/aorsmul_1.asm | 1 + + mpn/x86/atom/sse2/mul_basecase.asm | 1 + + mpn/x86/atom/sse2/sqr_basecase.asm | 1 + + mpn/x86/bdiv_dbm1c.asm | 1 + + mpn/x86/copyd.asm | 1 + + mpn/x86/copyi.asm | 1 + + mpn/x86/divrem_1.asm | 1 + + mpn/x86/divrem_2.asm | 1 + + mpn/x86/k6/aors_n.asm | 1 + + mpn/x86/k6/aorsmul_1.asm | 1 + + mpn/x86/k6/divrem_1.asm | 1 + + mpn/x86/k6/k62mmx/copyd.asm | 1 + + mpn/x86/k6/k62mmx/lshift.asm | 1 + + mpn/x86/k6/k62mmx/rshift.asm | 1 + + mpn/x86/k6/mmx/com.asm | 1 + + mpn/x86/k6/mmx/logops_n.asm | 1 + + mpn/x86/k6/mmx/lshift.asm | 1 + + mpn/x86/k6/mmx/popham.asm | 1 + + mpn/x86/k6/mmx/rshift.asm | 1 + + mpn/x86/k6/mod_34lsub1.asm | 1 + + mpn/x86/k6/mul_1.asm | 1 + + mpn/x86/k6/mul_basecase.asm | 1 + + mpn/x86/k6/pre_mod_1.asm | 1 + + mpn/x86/k6/sqr_basecase.asm | 1 + + mpn/x86/k7/aors_n.asm | 1 + + mpn/x86/k7/mmx/com.asm | 1 + + mpn/x86/k7/mmx/copyd.asm | 1 + + mpn/x86/k7/mmx/copyi.asm | 1 + + mpn/x86/k7/mmx/divrem_1.asm | 1 + + mpn/x86/k7/mmx/lshift.asm | 1 + + mpn/x86/k7/mmx/popham.asm | 1 + + mpn/x86/k7/mmx/rshift.asm | 1 + + mpn/x86/k7/mod_1_1.asm | 1 + + mpn/x86/k7/mod_1_4.asm | 1 + + mpn/x86/k7/mod_34lsub1.asm | 1 + + mpn/x86/k7/mul_basecase.asm | 1 + + mpn/x86/k7/sqr_basecase.asm | 1 + + mpn/x86/lshift.asm | 1 + + mpn/x86/mmx/sec_tabselect.asm | 1 + + mpn/x86/mod_34lsub1.asm | 1 + + mpn/x86/mul_1.asm | 1 + + mpn/x86/mul_basecase.asm | 1 + + mpn/x86/p6/aors_n.asm | 3 +- + mpn/x86/p6/aorsmul_1.asm | 3 +- + mpn/x86/p6/copyd.asm | 1 + + mpn/x86/p6/gcd_11.asm | 1 + + mpn/x86/p6/lshsub_n.asm | 3 +- + mpn/x86/p6/mmx/divrem_1.asm | 1 + + mpn/x86/p6/mod_34lsub1.asm | 1 + + mpn/x86/p6/mul_basecase.asm | 3 +- + mpn/x86/p6/sqr_basecase.asm | 3 +- + mpn/x86/pentium/aors_n.asm | 1 + + mpn/x86/pentium/aorsmul_1.asm | 1 + + mpn/x86/pentium/com.asm | 1 + + mpn/x86/pentium/copyd.asm | 1 + + mpn/x86/pentium/copyi.asm | 1 + + mpn/x86/pentium/logops_n.asm | 1 + + mpn/x86/pentium/lshift.asm | 1 + + mpn/x86/pentium/mmx/lshift.asm | 1 + + mpn/x86/pentium/mmx/mul_1.asm | 1 + + mpn/x86/pentium/mmx/rshift.asm | 1 + + mpn/x86/pentium/mod_34lsub1.asm | 1 + + mpn/x86/pentium/mul_1.asm | 1 + + mpn/x86/pentium/mul_2.asm | 1 + + mpn/x86/pentium/mul_basecase.asm | 1 + + mpn/x86/pentium/rshift.asm | 1 + + mpn/x86/pentium/sqr_basecase.asm | 1 + + mpn/x86/pentium4/copyd.asm | 1 + + mpn/x86/pentium4/copyi.asm | 1 + + mpn/x86/pentium4/mmx/popham.asm | 1 + + mpn/x86/pentium4/sse2/add_n.asm | 1 + + mpn/x86/pentium4/sse2/addlsh1_n.asm | 1 + + mpn/x86/pentium4/sse2/addmul_1.asm | 1 + + mpn/x86/pentium4/sse2/cnd_add_n.asm | 1 + + mpn/x86/pentium4/sse2/cnd_sub_n.asm | 1 + + mpn/x86/pentium4/sse2/divrem_1.asm | 1 + + mpn/x86/pentium4/sse2/mod_1_1.asm | 1 + + mpn/x86/pentium4/sse2/mod_1_4.asm | 1 + + mpn/x86/pentium4/sse2/mod_34lsub1.asm | 1 + + mpn/x86/pentium4/sse2/mul_1.asm | 1 + + mpn/x86/pentium4/sse2/mul_basecase.asm | 1 + + mpn/x86/pentium4/sse2/rsh1add_n.asm | 1 + + mpn/x86/pentium4/sse2/sqr_basecase.asm | 1 + + mpn/x86/pentium4/sse2/sub_n.asm | 1 + + mpn/x86/pentium4/sse2/submul_1.asm | 1 + + 
mpn/x86/rshift.asm | 1 + + mpn/x86/sec_tabselect.asm | 1 + + mpn/x86/sqr_basecase.asm | 1 + + mpn/x86/udiv.asm | 1 + + mpn/x86/umul.asm | 1 + + mpn/x86/x86-defs.m4 | 7 +- + mpn/x86_64/addaddmul_1msb0.asm | 1 + + mpn/x86_64/aorrlsh1_n.asm | 1 + + mpn/x86_64/aorrlshC_n.asm | 1 + + mpn/x86_64/aorrlsh_n.asm | 1 + + mpn/x86_64/aors_err1_n.asm | 1 + + mpn/x86_64/aors_err2_n.asm | 1 + + mpn/x86_64/aors_err3_n.asm | 1 + + mpn/x86_64/aors_n.asm | 1 + + mpn/x86_64/aorsmul_1.asm | 1 + + mpn/x86_64/atom/addmul_2.asm | 1 + + mpn/x86_64/atom/aorrlsh1_n.asm | 1 + + mpn/x86_64/atom/aorrlsh2_n.asm | 1 + + mpn/x86_64/atom/lshift.asm | 1 + + mpn/x86_64/atom/lshiftc.asm | 1 + + mpn/x86_64/atom/mul_2.asm | 1 + + mpn/x86_64/atom/rsh1aors_n.asm | 1 + + mpn/x86_64/atom/rshift.asm | 1 + + mpn/x86_64/atom/sublsh1_n.asm | 1 + + mpn/x86_64/bd1/addmul_2.asm | 1 + + mpn/x86_64/bd1/hamdist.asm | 1 + + mpn/x86_64/bd1/mul_2.asm | 1 + + mpn/x86_64/bd1/mul_basecase.asm | 1 + + mpn/x86_64/bd1/popcount.asm | 1 + + mpn/x86_64/bd2/gcd_11.asm | 1 + + mpn/x86_64/bd2/gcd_22.asm | 1 + + mpn/x86_64/bd4/gcd_11.asm | 1 + + mpn/x86_64/bdiv_dbm1c.asm | 1 + + mpn/x86_64/bdiv_q_1.asm | 1 + + mpn/x86_64/bt1/aors_n.asm | 1 + + mpn/x86_64/bt1/aorsmul_1.asm | 1 + + mpn/x86_64/bt1/copyd.asm | 1 + + mpn/x86_64/bt1/copyi.asm | 1 + + mpn/x86_64/bt1/gcd_11.asm | 1 + + mpn/x86_64/bt1/mul_1.asm | 1 + + mpn/x86_64/bt1/mul_basecase.asm | 1 + + mpn/x86_64/bt1/sqr_basecase.asm | 1 + + mpn/x86_64/cnd_aors_n.asm | 1 + + mpn/x86_64/com.asm | 1 + + mpn/x86_64/copyd.asm | 1 + + mpn/x86_64/copyi.asm | 1 + + mpn/x86_64/core2/aors_err1_n.asm | 1 + + mpn/x86_64/core2/aors_n.asm | 1 + + mpn/x86_64/core2/aorsmul_1.asm | 1 + + mpn/x86_64/core2/divrem_1.asm | 1 + + mpn/x86_64/core2/gcd_11.asm | 1 + + mpn/x86_64/core2/gcd_22.asm | 1 + + mpn/x86_64/core2/hamdist.asm | 1 + + mpn/x86_64/core2/logops_n.asm | 1 + + mpn/x86_64/core2/lshift.asm | 1 + + mpn/x86_64/core2/lshiftc.asm | 1 + + mpn/x86_64/core2/mul_basecase.asm | 5 ++ + mpn/x86_64/core2/mullo_basecase.asm | 1 + + mpn/x86_64/core2/popcount.asm | 1 + + mpn/x86_64/core2/rsh1aors_n.asm | 1 + + mpn/x86_64/core2/rshift.asm | 1 + + mpn/x86_64/core2/sqr_basecase.asm | 1 + + mpn/x86_64/core2/sublshC_n.asm | 1 + + mpn/x86_64/coreibwl/addmul_1.asm | 24 ++++-- + mpn/x86_64/coreibwl/mul_1.asm | 24 ++++-- + mpn/x86_64/coreibwl/mul_basecase.asm | 47 ++++++++---- + mpn/x86_64/coreibwl/mullo_basecase.asm | 1 + + mpn/x86_64/coreibwl/sqr_basecase.asm | 49 ++++++++---- + mpn/x86_64/coreihwl/addmul_2.asm | 1 + + mpn/x86_64/coreihwl/aors_n.asm | 1 + + mpn/x86_64/coreihwl/aorsmul_1.asm | 1 + + mpn/x86_64/coreihwl/gcd_22.asm | 1 + + mpn/x86_64/coreihwl/mul_2.asm | 1 + + mpn/x86_64/coreihwl/mul_basecase.asm | 1 + + mpn/x86_64/coreihwl/mullo_basecase.asm | 1 + + mpn/x86_64/coreihwl/redc_1.asm | 1 + + mpn/x86_64/coreihwl/sqr_basecase.asm | 1 + + mpn/x86_64/coreinhm/aorrlsh_n.asm | 1 + + mpn/x86_64/coreinhm/hamdist.asm | 1 + + mpn/x86_64/coreinhm/popcount.asm | 1 + + mpn/x86_64/coreisbr/addmul_2.asm | 1 + + mpn/x86_64/coreisbr/aorrlshC_n.asm | 1 + + mpn/x86_64/coreisbr/aorrlsh_n.asm | 1 + + mpn/x86_64/coreisbr/aors_n.asm | 1 + + mpn/x86_64/coreisbr/cnd_add_n.asm | 1 + + mpn/x86_64/coreisbr/cnd_sub_n.asm | 1 + + mpn/x86_64/coreisbr/mul_1.asm | 1 + + mpn/x86_64/coreisbr/mul_2.asm | 1 + + mpn/x86_64/coreisbr/mul_basecase.asm | 1 + + mpn/x86_64/coreisbr/mullo_basecase.asm | 1 + + mpn/x86_64/coreisbr/rsh1aors_n.asm | 1 + + mpn/x86_64/coreisbr/sqr_basecase.asm | 1 + + mpn/x86_64/div_qr_1n_pi1.asm | 1 + + mpn/x86_64/div_qr_2n_pi1.asm | 1 + + 
mpn/x86_64/div_qr_2u_pi1.asm | 1 + + mpn/x86_64/dive_1.asm | 1 + + mpn/x86_64/divrem_1.asm | 1 + + mpn/x86_64/divrem_2.asm | 1 + + mpn/x86_64/fastavx/copyd.asm | 1 + + mpn/x86_64/fastavx/copyi.asm | 1 + + mpn/x86_64/fastsse/com-palignr.asm | 1 + + mpn/x86_64/fastsse/com.asm | 1 + + mpn/x86_64/fastsse/copyd-palignr.asm | 1 + + mpn/x86_64/fastsse/copyd.asm | 1 + + mpn/x86_64/fastsse/copyi-palignr.asm | 1 + + mpn/x86_64/fastsse/copyi.asm | 1 + + mpn/x86_64/fastsse/lshift-movdqu2.asm | 1 + + mpn/x86_64/fastsse/lshift.asm | 1 + + mpn/x86_64/fastsse/lshiftc-movdqu2.asm | 1 + + mpn/x86_64/fastsse/lshiftc.asm | 1 + + mpn/x86_64/fastsse/rshift-movdqu2.asm | 1 + + mpn/x86_64/fastsse/sec_tabselect.asm | 1 + + mpn/x86_64/fat/fat_entry.asm | 1 + + mpn/x86_64/gcd_11.asm | 1 + + mpn/x86_64/gcd_22.asm | 1 + + mpn/x86_64/k10/gcd_22.asm | 1 + + mpn/x86_64/k10/hamdist.asm | 1 + + mpn/x86_64/k10/popcount.asm | 5 +- + mpn/x86_64/k8/addmul_2.asm | 1 + + mpn/x86_64/k8/aorrlsh_n.asm | 1 + + mpn/x86_64/k8/bdiv_q_1.asm | 1 + + mpn/x86_64/k8/div_qr_1n_pi1.asm | 1 + + mpn/x86_64/k8/mul_basecase.asm | 8 ++ + mpn/x86_64/k8/mullo_basecase.asm | 12 ++- + mpn/x86_64/k8/mulmid_basecase.asm | 9 +++ + mpn/x86_64/k8/redc_1.asm | 18 +++-- + mpn/x86_64/k8/sqr_basecase.asm | 18 +++-- + mpn/x86_64/logops_n.asm | 1 + + mpn/x86_64/lshift.asm | 1 + + mpn/x86_64/lshiftc.asm | 1 + + mpn/x86_64/lshsub_n.asm | 1 + + mpn/x86_64/missing.asm | 1 + + mpn/x86_64/mod_1_2.asm | 1 + + mpn/x86_64/mod_1_4.asm | 1 + + mpn/x86_64/mod_34lsub1.asm | 28 ++++--- + mpn/x86_64/mode1o.asm | 1 + + mpn/x86_64/mul_1.asm | 1 + + mpn/x86_64/mul_2.asm | 1 + + mpn/x86_64/nano/dive_1.asm | 1 + + mpn/x86_64/pentium4/aors_n.asm | 1 + + mpn/x86_64/pentium4/mod_34lsub1.asm | 1 + + mpn/x86_64/pentium4/rsh1aors_n.asm | 1 + + mpn/x86_64/pentium4/rshift.asm | 1 + + mpn/x86_64/popham.asm | 1 + + mpn/x86_64/rsh1aors_n.asm | 1 + + mpn/x86_64/rshift.asm | 1 + + mpn/x86_64/sec_tabselect.asm | 1 + + mpn/x86_64/sqr_diag_addlsh1.asm | 1 + + mpn/x86_64/sublsh1_n.asm | 1 + + mpn/x86_64/x86_64-defs.m4 | 6 +- + mpn/x86_64/zen/aorrlsh_n.asm | 25 +++++-- + mpn/x86_64/zen/mul_basecase.asm | 1 + + mpn/x86_64/zen/mullo_basecase.asm | 1 + + mpn/x86_64/zen/sbpi1_bdiv_r.asm | 1 + + mpn/x86_64/zen/sqr_basecase.asm | 1 + + 244 files changed, 537 insertions(+), 90 deletions(-) + +diff --git a/acinclude.m4 b/acinclude.m4 +index 86175ce..84e880b 100644 +--- a/acinclude.m4 ++++ b/acinclude.m4 +@@ -3135,6 +3135,106 @@ __sparc_get_pc_thunk.l7: + GMP_DEFINE_RAW(["define(,<$gmp_cv_asm_sparc_shared_thunks>)"]) + ]) + ++dnl GMP_ASM_X86_CET_MACROS(ABI) ++dnl ------------ ++dnl Define ++dnl 1. X86_ENDBR for endbr32/endbr64. ++dnl 2. X86_NOTRACK for notrack prefix. ++dnl 3. X86_GNU_PROPERTY to add a .note.gnu.property section to mark ++dnl Intel CET support if needed. 
++dnl .section ".note.gnu.property", "a" ++dnl .p2align POINTER-ALIGN ++dnl .long 1f - 0f ++dnl .long 4f - 1f ++dnl .long 5 ++dnl 0: ++dnl .asciz "GNU" ++dnl 1: ++dnl .p2align POINTER-ALIGN ++dnl .long 0xc0000002 ++dnl .long 3f - 2f ++dnl 2: ++dnl .long 3 ++dnl 3: ++dnl .p2align POINTER-ALIGN ++dnl 4: ++AC_DEFUN([GMP_ASM_X86_CET_MACROS],[ ++dnl AC_REQUIRE([AC_PROG_CC]) GMP uses something else ++AC_CACHE_CHECK([if Intel CET is enabled], ++ gmp_cv_asm_x86_intel_cet, [dnl ++ cat > conftest.c </dev/null]) ++ then ++ gmp_cv_asm_x86_intel_cet=yes ++ else ++ gmp_cv_asm_x86_intel_cet=no ++ fi ++ rm -f conftest*]) ++ if test "$gmp_cv_asm_x86_intel_cet" = yes; then ++ case $1 in ++ 32) ++ endbr=endbr32 ++ p2align=2 ++ ;; ++ 64) ++ endbr=endbr64 ++ p2align=3 ++ ;; ++ x32) ++ endbr=endbr64 ++ p2align=2 ++ ;; ++ esac ++ AC_CACHE_CHECK([if .note.gnu.property section is needed], ++ gmp_cv_asm_x86_gnu_property, [dnl ++ cat > conftest.c </dev/null]) ++ then ++ gmp_cv_asm_x86_gnu_property=yes ++ else ++ gmp_cv_asm_x86_gnu_property=no ++ fi ++ rm -f conftest*]) ++ echo ["define(,<$endbr>)"] >> $gmp_tmpconfigm4 ++ echo ["define(,)"] >> $gmp_tmpconfigm4 ++ else ++ gmp_cv_asm_x86_gnu_property=no ++ echo ["define(,<>)"] >> $gmp_tmpconfigm4 ++ echo ["define(,<>)"] >> $gmp_tmpconfigm4 ++ fi ++ if test "$gmp_cv_asm_x86_gnu_property" = yes; then ++ echo ["define(, < ++ .section \".note.gnu.property\", \"a\" ++ .p2align $p2align ++ .long 1f - 0f ++ .long 4f - 1f ++ .long 5 ++0: ++ .asciz \"GNU\" ++1: ++ .p2align $p2align ++ .long 0xc0000002 ++ .long 3f - 2f ++2: ++ .long 3 ++3: ++ .p2align $p2align ++4:>)"] >> $gmp_tmpconfigm4 ++ else ++ echo ["define(,<>)"] >> $gmp_tmpconfigm4 ++ fi ++]) ++ + + dnl GMP_C_ATTRIBUTE_CONST + dnl --------------------- +diff --git a/configure.ac b/configure.ac +index 024cacb..be314a6 100644 +--- a/configure.ac ++++ b/configure.ac +@@ -3804,6 +3804,7 @@ yes + esac + ;; + esac ++ GMP_ASM_X86_CET_MACROS($ABI) + ;; + esac + fi +diff --git a/mpn/x86/aors_n.asm b/mpn/x86/aors_n.asm +index 5d359f5..7ea7814 100644 +--- a/mpn/x86/aors_n.asm ++++ b/mpn/x86/aors_n.asm +@@ -112,7 +112,7 @@ L(0a): leal (%eax,%eax,8),%eax + shrl %ebp C shift bit 0 into carry + popl %ebp FRAME_popl() + +- jmp *%eax C jump into loop ++ X86_NOTRACK jmp *%eax C jump into loop + + EPILOGUE() + +@@ -153,7 +153,7 @@ L(0b): leal (%eax,%eax,8),%eax + C Calculate start address in loop for non-PIC. 
+ leal L(oop)-3(%eax,%eax,8),%eax + ') +- jmp *%eax C jump into loop ++ X86_NOTRACK jmp *%eax C jump into loop + + L(oopgo): + pushl %ebp FRAME_pushl() +@@ -200,3 +200,4 @@ L(oop): movl (%esi),%eax + ret + + EPILOGUE() ++ASM_END() +diff --git a/mpn/x86/aorsmul_1.asm b/mpn/x86/aorsmul_1.asm +index 54a8905..0ab1e01 100644 +--- a/mpn/x86/aorsmul_1.asm ++++ b/mpn/x86/aorsmul_1.asm +@@ -154,3 +154,4 @@ L(end): movl %ebx,%eax + ret + + EPILOGUE() ++ASM_END() +diff --git a/mpn/x86/atom/sse2/aorsmul_1.asm b/mpn/x86/atom/sse2/aorsmul_1.asm +index 969a14a..20658e1 100644 +--- a/mpn/x86/atom/sse2/aorsmul_1.asm ++++ b/mpn/x86/atom/sse2/aorsmul_1.asm +@@ -172,3 +172,4 @@ PROLOGUE(func_1c) + mov 20(%esp), %edx C carry + jmp L(ent) + EPILOGUE() ++ASM_END() +diff --git a/mpn/x86/atom/sse2/mul_basecase.asm b/mpn/x86/atom/sse2/mul_basecase.asm +index 97d3aeb..74171aa 100644 +--- a/mpn/x86/atom/sse2/mul_basecase.asm ++++ b/mpn/x86/atom/sse2/mul_basecase.asm +@@ -499,3 +499,4 @@ L(done): + pop %edi + ret + EPILOGUE() ++ASM_END() +diff --git a/mpn/x86/atom/sse2/sqr_basecase.asm b/mpn/x86/atom/sse2/sqr_basecase.asm +index af19ed8..0031812 100644 +--- a/mpn/x86/atom/sse2/sqr_basecase.asm ++++ b/mpn/x86/atom/sse2/sqr_basecase.asm +@@ -632,3 +632,4 @@ L(one): pmuludq %mm7, %mm7 + pop %edi + ret + EPILOGUE() ++ASM_END() +diff --git a/mpn/x86/bdiv_dbm1c.asm b/mpn/x86/bdiv_dbm1c.asm +index 0288c47..7a3b1a6 100644 +--- a/mpn/x86/bdiv_dbm1c.asm ++++ b/mpn/x86/bdiv_dbm1c.asm +@@ -127,3 +127,4 @@ L(b1): add $-4, %ebp + pop %esi + ret + EPILOGUE() ++ASM_END() +diff --git a/mpn/x86/copyd.asm b/mpn/x86/copyd.asm +index 51fa195..0e588d9 100644 +--- a/mpn/x86/copyd.asm ++++ b/mpn/x86/copyd.asm +@@ -89,3 +89,4 @@ PROLOGUE(mpn_copyd) + ret + + EPILOGUE() ++ASM_END() +diff --git a/mpn/x86/copyi.asm b/mpn/x86/copyi.asm +index f6b0354..6efbb90 100644 +--- a/mpn/x86/copyi.asm ++++ b/mpn/x86/copyi.asm +@@ -97,3 +97,4 @@ PROLOGUE(mpn_copyi) + ret + + EPILOGUE() ++ASM_END() +diff --git a/mpn/x86/divrem_1.asm b/mpn/x86/divrem_1.asm +index 255d493..b1af920 100644 +--- a/mpn/x86/divrem_1.asm ++++ b/mpn/x86/divrem_1.asm +@@ -231,3 +231,4 @@ deflit(`FRAME',8) + popl %edi + ret + EPILOGUE() ++ASM_END() +diff --git a/mpn/x86/divrem_2.asm b/mpn/x86/divrem_2.asm +index 4c38ad0..c2920c2 100644 +--- a/mpn/x86/divrem_2.asm ++++ b/mpn/x86/divrem_2.asm +@@ -197,3 +197,4 @@ L(35): sub 20(%esp), %ebp + movl $1, 32(%esp) + jmp L(8) + EPILOGUE() ++ASM_END() +diff --git a/mpn/x86/k6/aors_n.asm b/mpn/x86/k6/aors_n.asm +index 168f9b4..257ba59 100644 +--- a/mpn/x86/k6/aors_n.asm ++++ b/mpn/x86/k6/aors_n.asm +@@ -335,3 +335,4 @@ L(inplace_done): + ret + + EPILOGUE() ++ASM_END() +diff --git a/mpn/x86/k6/aorsmul_1.asm b/mpn/x86/k6/aorsmul_1.asm +index eaa92eb..78be9d2 100644 +--- a/mpn/x86/k6/aorsmul_1.asm ++++ b/mpn/x86/k6/aorsmul_1.asm +@@ -389,3 +389,4 @@ Zdisp( M4_inst,%ecx, disp0,(%edi)) + ret + + EPILOGUE() ++ASM_END() +diff --git a/mpn/x86/k6/divrem_1.asm b/mpn/x86/k6/divrem_1.asm +index b4cea4f..ca41a3f 100644 +--- a/mpn/x86/k6/divrem_1.asm ++++ b/mpn/x86/k6/divrem_1.asm +@@ -201,3 +201,4 @@ deflit(`FRAME',8) + popl %edi + ret + EPILOGUE() ++ASM_END() +diff --git a/mpn/x86/k6/k62mmx/copyd.asm b/mpn/x86/k6/k62mmx/copyd.asm +index f80a5a1..fc329f5 100644 +--- a/mpn/x86/k6/k62mmx/copyd.asm ++++ b/mpn/x86/k6/k62mmx/copyd.asm +@@ -116,3 +116,4 @@ L(zero): + ret + + EPILOGUE() ++ASM_END() +diff --git a/mpn/x86/k6/k62mmx/lshift.asm b/mpn/x86/k6/k62mmx/lshift.asm +index c86575f..728fb5b 100644 +--- a/mpn/x86/k6/k62mmx/lshift.asm ++++ 
b/mpn/x86/k6/k62mmx/lshift.asm +@@ -292,3 +292,4 @@ deflit(`FRAME',4) + ret + + EPILOGUE() ++ASM_END() +diff --git a/mpn/x86/k6/k62mmx/rshift.asm b/mpn/x86/k6/k62mmx/rshift.asm +index f604a7b..bd673f3 100644 +--- a/mpn/x86/k6/k62mmx/rshift.asm ++++ b/mpn/x86/k6/k62mmx/rshift.asm +@@ -291,3 +291,4 @@ L(finish_even): + ret + + EPILOGUE() ++ASM_END() +diff --git a/mpn/x86/k6/mmx/com.asm b/mpn/x86/k6/mmx/com.asm +index b747454..646d16b 100644 +--- a/mpn/x86/k6/mmx/com.asm ++++ b/mpn/x86/k6/mmx/com.asm +@@ -101,3 +101,4 @@ L(no_extra): + ret + + EPILOGUE() ++ASM_END() +diff --git a/mpn/x86/k6/mmx/logops_n.asm b/mpn/x86/k6/mmx/logops_n.asm +index e17930b..acfd7df 100644 +--- a/mpn/x86/k6/mmx/logops_n.asm ++++ b/mpn/x86/k6/mmx/logops_n.asm +@@ -224,3 +224,4 @@ L(no_extra): + ret + + EPILOGUE() ++ASM_END() +diff --git a/mpn/x86/k6/mmx/lshift.asm b/mpn/x86/k6/mmx/lshift.asm +index 45be582..eee1eb8 100644 +--- a/mpn/x86/k6/mmx/lshift.asm ++++ b/mpn/x86/k6/mmx/lshift.asm +@@ -128,3 +128,4 @@ L(top): + ret + + EPILOGUE() ++ASM_END() +diff --git a/mpn/x86/k6/mmx/popham.asm b/mpn/x86/k6/mmx/popham.asm +index 2b19d0b..efeb1b4 100644 +--- a/mpn/x86/k6/mmx/popham.asm ++++ b/mpn/x86/k6/mmx/popham.asm +@@ -234,3 +234,4 @@ HAM(` nop C code alignment') + ret + + EPILOGUE() ++ASM_END() +diff --git a/mpn/x86/k6/mmx/rshift.asm b/mpn/x86/k6/mmx/rshift.asm +index cd0382f..ae53711 100644 +--- a/mpn/x86/k6/mmx/rshift.asm ++++ b/mpn/x86/k6/mmx/rshift.asm +@@ -128,3 +128,4 @@ Zdisp( movd, %mm0, 0,(%ecx,%eax,4)) + ret + + EPILOGUE() ++ASM_END() +diff --git a/mpn/x86/k6/mod_34lsub1.asm b/mpn/x86/k6/mod_34lsub1.asm +index 7e30503..05f8979 100644 +--- a/mpn/x86/k6/mod_34lsub1.asm ++++ b/mpn/x86/k6/mod_34lsub1.asm +@@ -188,3 +188,4 @@ L(combine): + ret + + EPILOGUE() ++ASM_END() +diff --git a/mpn/x86/k6/mul_1.asm b/mpn/x86/k6/mul_1.asm +index 3ef7ec2..2139f36 100644 +--- a/mpn/x86/k6/mul_1.asm ++++ b/mpn/x86/k6/mul_1.asm +@@ -290,3 +290,4 @@ L(finish_not_one): + ret + + EPILOGUE() ++ASM_END() +diff --git a/mpn/x86/k6/mul_basecase.asm b/mpn/x86/k6/mul_basecase.asm +index 7030001..ab202a2 100644 +--- a/mpn/x86/k6/mul_basecase.asm ++++ b/mpn/x86/k6/mul_basecase.asm +@@ -610,3 +610,4 @@ Zdisp( addl, %ecx, disp0,(%edi)) + ret + + EPILOGUE() ++ASM_END() +diff --git a/mpn/x86/k6/pre_mod_1.asm b/mpn/x86/k6/pre_mod_1.asm +index 34db20d..1e4cb17 100644 +--- a/mpn/x86/k6/pre_mod_1.asm ++++ b/mpn/x86/k6/pre_mod_1.asm +@@ -144,3 +144,4 @@ L(q1_ff): + + + EPILOGUE() ++ASM_END() +diff --git a/mpn/x86/k6/sqr_basecase.asm b/mpn/x86/k6/sqr_basecase.asm +index b7ecb5c..f3a101a 100644 +--- a/mpn/x86/k6/sqr_basecase.asm ++++ b/mpn/x86/k6/sqr_basecase.asm +@@ -678,3 +678,4 @@ L(pic_calc): + + + EPILOGUE() ++ASM_END() +diff --git a/mpn/x86/k7/aors_n.asm b/mpn/x86/k7/aors_n.asm +index 1a08072..bfdf3d4 100644 +--- a/mpn/x86/k7/aors_n.asm ++++ b/mpn/x86/k7/aors_n.asm +@@ -256,3 +256,4 @@ L(even): + ret + + EPILOGUE() ++ASM_END() +diff --git a/mpn/x86/k7/mmx/com.asm b/mpn/x86/k7/mmx/com.asm +index a258c22..cf48fac 100644 +--- a/mpn/x86/k7/mmx/com.asm ++++ b/mpn/x86/k7/mmx/com.asm +@@ -123,3 +123,4 @@ L(done): + ret + + EPILOGUE() ++ASM_END() +diff --git a/mpn/x86/k7/mmx/copyd.asm b/mpn/x86/k7/mmx/copyd.asm +index 59ece40..3bc9ff8 100644 +--- a/mpn/x86/k7/mmx/copyd.asm ++++ b/mpn/x86/k7/mmx/copyd.asm +@@ -142,3 +142,4 @@ L(done): + + + EPILOGUE() ++ASM_END() +diff --git a/mpn/x86/k7/mmx/copyi.asm b/mpn/x86/k7/mmx/copyi.asm +index 9a28f92..f0648fa 100644 +--- a/mpn/x86/k7/mmx/copyi.asm ++++ b/mpn/x86/k7/mmx/copyi.asm +@@ -155,3 +155,4 @@ L(done): + ret 
+ + EPILOGUE() ++ASM_END() +diff --git a/mpn/x86/k7/mmx/divrem_1.asm b/mpn/x86/k7/mmx/divrem_1.asm +index cf34328..370bfbb 100644 +--- a/mpn/x86/k7/mmx/divrem_1.asm ++++ b/mpn/x86/k7/mmx/divrem_1.asm +@@ -830,3 +830,4 @@ L(fraction_entry): + jmp L(fraction_done) + + EPILOGUE() ++ASM_END() +diff --git a/mpn/x86/k7/mmx/lshift.asm b/mpn/x86/k7/mmx/lshift.asm +index b3383cf..4140e82 100644 +--- a/mpn/x86/k7/mmx/lshift.asm ++++ b/mpn/x86/k7/mmx/lshift.asm +@@ -479,3 +479,4 @@ L(end_even_unaligned): + ret + + EPILOGUE() ++ASM_END() +diff --git a/mpn/x86/k7/mmx/popham.asm b/mpn/x86/k7/mmx/popham.asm +index 95965b7..f29540a 100644 +--- a/mpn/x86/k7/mmx/popham.asm ++++ b/mpn/x86/k7/mmx/popham.asm +@@ -211,3 +211,4 @@ L(loaded): + ret + + EPILOGUE() ++ASM_END() +diff --git a/mpn/x86/k7/mmx/rshift.asm b/mpn/x86/k7/mmx/rshift.asm +index 345d23a..0da1f93 100644 +--- a/mpn/x86/k7/mmx/rshift.asm ++++ b/mpn/x86/k7/mmx/rshift.asm +@@ -478,3 +478,4 @@ L(end_even_unaligned): + ret + + EPILOGUE() ++ASM_END() +diff --git a/mpn/x86/k7/mod_1_1.asm b/mpn/x86/k7/mod_1_1.asm +index 1bbe6f9..8da9519 100644 +--- a/mpn/x86/k7/mod_1_1.asm ++++ b/mpn/x86/k7/mod_1_1.asm +@@ -219,3 +219,4 @@ PROLOGUE(mpn_mod_1_1p_cps) + pop %ebp + ret + EPILOGUE() ++ASM_END() +diff --git a/mpn/x86/k7/mod_1_4.asm b/mpn/x86/k7/mod_1_4.asm +index bb7597e..fe1da5b 100644 +--- a/mpn/x86/k7/mod_1_4.asm ++++ b/mpn/x86/k7/mod_1_4.asm +@@ -258,3 +258,4 @@ C CAUTION: This is the same code as in pentium4/sse2/mod_1_4.asm + pop %ebp + ret + EPILOGUE() ++ASM_END() +diff --git a/mpn/x86/k7/mod_34lsub1.asm b/mpn/x86/k7/mod_34lsub1.asm +index ee3ad04..0c1b8c8 100644 +--- a/mpn/x86/k7/mod_34lsub1.asm ++++ b/mpn/x86/k7/mod_34lsub1.asm +@@ -186,3 +186,4 @@ L(combine): + ret + + EPILOGUE() ++ASM_END() +diff --git a/mpn/x86/k7/mul_basecase.asm b/mpn/x86/k7/mul_basecase.asm +index 4dfb500..b96fda7 100644 +--- a/mpn/x86/k7/mul_basecase.asm ++++ b/mpn/x86/k7/mul_basecase.asm +@@ -600,3 +600,4 @@ deflit(`disp1', eval(disp0-0 + 4)) + ret + + EPILOGUE() ++ASM_END() +diff --git a/mpn/x86/k7/sqr_basecase.asm b/mpn/x86/k7/sqr_basecase.asm +index 7b6a97e..df47ee4 100644 +--- a/mpn/x86/k7/sqr_basecase.asm ++++ b/mpn/x86/k7/sqr_basecase.asm +@@ -633,3 +633,4 @@ L(diag): + ret + + EPILOGUE() ++ASM_END() +diff --git a/mpn/x86/lshift.asm b/mpn/x86/lshift.asm +index 6ee6153..95f5321 100644 +--- a/mpn/x86/lshift.asm ++++ b/mpn/x86/lshift.asm +@@ -104,3 +104,4 @@ L(end): shll %cl,%ebx C compute least significant limb + ret + + EPILOGUE() ++ASM_END() +diff --git a/mpn/x86/mmx/sec_tabselect.asm b/mpn/x86/mmx/sec_tabselect.asm +index aae158a..543dec1 100644 +--- a/mpn/x86/mmx/sec_tabselect.asm ++++ b/mpn/x86/mmx/sec_tabselect.asm +@@ -161,3 +161,4 @@ L(b00): pop %ebp + emms + ret + EPILOGUE() ++ASM_END() +diff --git a/mpn/x86/mod_34lsub1.asm b/mpn/x86/mod_34lsub1.asm +index e09e702..df52d37 100644 +--- a/mpn/x86/mod_34lsub1.asm ++++ b/mpn/x86/mod_34lsub1.asm +@@ -181,3 +181,4 @@ L(combine): + ret + + EPILOGUE() ++ASM_END() +diff --git a/mpn/x86/mul_1.asm b/mpn/x86/mul_1.asm +index 421de62..dbbc0e3 100644 +--- a/mpn/x86/mul_1.asm ++++ b/mpn/x86/mul_1.asm +@@ -138,3 +138,4 @@ L(end): movl %ebx,%eax + ret + + EPILOGUE() ++ASM_END() +diff --git a/mpn/x86/mul_basecase.asm b/mpn/x86/mul_basecase.asm +index 8339732..c32fd7e 100644 +--- a/mpn/x86/mul_basecase.asm ++++ b/mpn/x86/mul_basecase.asm +@@ -221,3 +221,4 @@ L(done): + ret + + EPILOGUE() ++ASM_END() +diff --git a/mpn/x86/p6/aors_n.asm b/mpn/x86/p6/aors_n.asm +index df51c2e..ab172df 100644 +--- a/mpn/x86/p6/aors_n.asm ++++ 
b/mpn/x86/p6/aors_n.asm +@@ -90,7 +90,7 @@ L(here): + ') + + shr %edx C set cy flag +- jmp *%eax ++ X86_NOTRACK jmp *%eax + + ifdef(`PIC',` + L(pic_calc): +@@ -154,3 +154,4 @@ PROLOGUE(func_nc) + movl 20(%esp), %edx + jmp L(start) + EPILOGUE() ++ASM_END() +diff --git a/mpn/x86/p6/aorsmul_1.asm b/mpn/x86/p6/aorsmul_1.asm +index bc8c49c..2a3b122 100644 +--- a/mpn/x86/p6/aorsmul_1.asm ++++ b/mpn/x86/p6/aorsmul_1.asm +@@ -240,7 +240,7 @@ L(here): + cmovnz( %ebx, %ecx) C high,low carry other way around + cmovnz( %eax, %ebx) + +- jmp *%edx ++ X86_NOTRACK jmp *%edx + + + ifdef(`PIC',` +@@ -318,3 +318,4 @@ deflit(`disp0', eval(UNROLL_BYTES ifelse(UNROLL_BYTES,256,-128))) + ret + + EPILOGUE() ++ASM_END() +diff --git a/mpn/x86/p6/copyd.asm b/mpn/x86/p6/copyd.asm +index 1be7636..bd42da1 100644 +--- a/mpn/x86/p6/copyd.asm ++++ b/mpn/x86/p6/copyd.asm +@@ -176,3 +176,4 @@ L(zero): + ret + + EPILOGUE() ++ASM_END() +diff --git a/mpn/x86/p6/gcd_11.asm b/mpn/x86/p6/gcd_11.asm +index 80e055e..a7fc6a8 100644 +--- a/mpn/x86/p6/gcd_11.asm ++++ b/mpn/x86/p6/gcd_11.asm +@@ -81,3 +81,4 @@ L(end): mov %edx, %eax + pop %edi + ret + EPILOGUE() ++ASM_END() +diff --git a/mpn/x86/p6/lshsub_n.asm b/mpn/x86/p6/lshsub_n.asm +index 7ada213..17db5d5 100644 +--- a/mpn/x86/p6/lshsub_n.asm ++++ b/mpn/x86/p6/lshsub_n.asm +@@ -82,7 +82,7 @@ L(here): + pxor %mm1, %mm1 + pxor %mm0, %mm0 + +- jmp *%eax ++ X86_NOTRACK jmp *%eax + + ifdef(`PIC',` + L(pic_calc): +@@ -167,3 +167,4 @@ L(ent): mov 0(up,n,4), %eax + jmp L(top) + + EPILOGUE() ++ASM_END() +diff --git a/mpn/x86/p6/mmx/divrem_1.asm b/mpn/x86/p6/mmx/divrem_1.asm +index 5300616..b6057dd 100644 +--- a/mpn/x86/p6/mmx/divrem_1.asm ++++ b/mpn/x86/p6/mmx/divrem_1.asm +@@ -765,3 +765,4 @@ L(fraction_top): + jmp L(fraction_done) + + EPILOGUE() ++ASM_END() +diff --git a/mpn/x86/p6/mod_34lsub1.asm b/mpn/x86/p6/mod_34lsub1.asm +index b88ab5d..46b3806 100644 +--- a/mpn/x86/p6/mod_34lsub1.asm ++++ b/mpn/x86/p6/mod_34lsub1.asm +@@ -188,3 +188,4 @@ L(done_0): + ret + + EPILOGUE() ++ASM_END() +diff --git a/mpn/x86/p6/mul_basecase.asm b/mpn/x86/p6/mul_basecase.asm +index d87bc12..521b31e 100644 +--- a/mpn/x86/p6/mul_basecase.asm ++++ b/mpn/x86/p6/mul_basecase.asm +@@ -524,7 +524,7 @@ L(unroll_outer_entry): + xorl %eax, %ebx C carries other way for odd index + xorl %eax, %ecx + +- jmp *%edx ++ X86_NOTRACK jmp *%edx + + + C ----------------------------------------------------------------------------- +@@ -605,3 +605,4 @@ deflit(`disp1', eval(disp0 + 4)) + ret + + EPILOGUE() ++ASM_END() +diff --git a/mpn/x86/p6/sqr_basecase.asm b/mpn/x86/p6/sqr_basecase.asm +index 8fc7fdf..f71304f 100644 +--- a/mpn/x86/p6/sqr_basecase.asm ++++ b/mpn/x86/p6/sqr_basecase.asm +@@ -447,7 +447,7 @@ define(cmovX,`ifelse(eval(UNROLL_COUNT%2),1,`cmovz($@)',`cmovnz($@)')') + cmovX( %ebx, %ecx) C high carry reverse + cmovX( %eax, %ebx) C low carry reverse + movl %edx, VAR_JMP +- jmp *%edx ++ X86_NOTRACK jmp *%edx + + + C Must be on an even address here so the low bit of the jump address +@@ -647,3 +647,4 @@ L(pic_calc): + + + EPILOGUE() ++ASM_END() +diff --git a/mpn/x86/pentium/aors_n.asm b/mpn/x86/pentium/aors_n.asm +index 01ebfb9..ca124a5 100644 +--- a/mpn/x86/pentium/aors_n.asm ++++ b/mpn/x86/pentium/aors_n.asm +@@ -201,3 +201,4 @@ L(end2): + ret + + EPILOGUE() ++ASM_END() +diff --git a/mpn/x86/pentium/aorsmul_1.asm b/mpn/x86/pentium/aorsmul_1.asm +index d83cc45..5cec8b3 100644 +--- a/mpn/x86/pentium/aorsmul_1.asm ++++ b/mpn/x86/pentium/aorsmul_1.asm +@@ -142,3 +142,4 @@ L(top): + ret + + EPILOGUE() ++ASM_END() +diff 
--git a/mpn/x86/pentium/com.asm b/mpn/x86/pentium/com.asm +index b080545..00064ff 100644 +--- a/mpn/x86/pentium/com.asm ++++ b/mpn/x86/pentium/com.asm +@@ -179,3 +179,4 @@ L(done): + ret + + EPILOGUE() ++ASM_END() +diff --git a/mpn/x86/pentium/copyd.asm b/mpn/x86/pentium/copyd.asm +index 72a543b..c7f74b5 100644 +--- a/mpn/x86/pentium/copyd.asm ++++ b/mpn/x86/pentium/copyd.asm +@@ -144,3 +144,4 @@ L(done): + ret + + EPILOGUE() ++ASM_END() +diff --git a/mpn/x86/pentium/copyi.asm b/mpn/x86/pentium/copyi.asm +index d983d6b..bc7744e 100644 +--- a/mpn/x86/pentium/copyi.asm ++++ b/mpn/x86/pentium/copyi.asm +@@ -162,3 +162,4 @@ L(done): + ret + + EPILOGUE() ++ASM_END() +diff --git a/mpn/x86/pentium/logops_n.asm b/mpn/x86/pentium/logops_n.asm +index 1877317..41a9477 100644 +--- a/mpn/x86/pentium/logops_n.asm ++++ b/mpn/x86/pentium/logops_n.asm +@@ -174,3 +174,4 @@ L(done): + ret + + EPILOGUE() ++ASM_END() +diff --git a/mpn/x86/pentium/lshift.asm b/mpn/x86/pentium/lshift.asm +index 2a31f36..68cba52 100644 +--- a/mpn/x86/pentium/lshift.asm ++++ b/mpn/x86/pentium/lshift.asm +@@ -241,3 +241,4 @@ L(L1): movl %edx,(%edi) C store last limb + ret + + EPILOGUE() ++ASM_END() +diff --git a/mpn/x86/pentium/mmx/lshift.asm b/mpn/x86/pentium/mmx/lshift.asm +index 04b0ddc..9e18c86 100644 +--- a/mpn/x86/pentium/mmx/lshift.asm ++++ b/mpn/x86/pentium/mmx/lshift.asm +@@ -461,3 +461,4 @@ L(finish_zero_unaligned): + ret + + EPILOGUE() ++ASM_END() +diff --git a/mpn/x86/pentium/mmx/mul_1.asm b/mpn/x86/pentium/mmx/mul_1.asm +index 4ced577..b04a718 100644 +--- a/mpn/x86/pentium/mmx/mul_1.asm ++++ b/mpn/x86/pentium/mmx/mul_1.asm +@@ -369,3 +369,4 @@ L(small_done): + ret + + EPILOGUE() ++ASM_END() +diff --git a/mpn/x86/pentium/mmx/rshift.asm b/mpn/x86/pentium/mmx/rshift.asm +index e3b274b..5493d20 100644 +--- a/mpn/x86/pentium/mmx/rshift.asm ++++ b/mpn/x86/pentium/mmx/rshift.asm +@@ -466,3 +466,4 @@ L(finish_zero_unaligned): + ret + + EPILOGUE() ++ASM_END() +diff --git a/mpn/x86/pentium/mod_34lsub1.asm b/mpn/x86/pentium/mod_34lsub1.asm +index 2d88223..0945de8 100644 +--- a/mpn/x86/pentium/mod_34lsub1.asm ++++ b/mpn/x86/pentium/mod_34lsub1.asm +@@ -190,3 +190,4 @@ L(combine): + ret + + EPILOGUE() ++ASM_END() +diff --git a/mpn/x86/pentium/mul_1.asm b/mpn/x86/pentium/mul_1.asm +index a0858af..2c49130 100644 +--- a/mpn/x86/pentium/mul_1.asm ++++ b/mpn/x86/pentium/mul_1.asm +@@ -175,3 +175,4 @@ L(top): + ret + + EPILOGUE() ++ASM_END() +diff --git a/mpn/x86/pentium/mul_2.asm b/mpn/x86/pentium/mul_2.asm +index 4c7beb5..e94e071 100644 +--- a/mpn/x86/pentium/mul_2.asm ++++ b/mpn/x86/pentium/mul_2.asm +@@ -148,3 +148,4 @@ L(done): + ret + + EPILOGUE() ++ASM_END() +diff --git a/mpn/x86/pentium/mul_basecase.asm b/mpn/x86/pentium/mul_basecase.asm +index e1d0f05..ff269bb 100644 +--- a/mpn/x86/pentium/mul_basecase.asm ++++ b/mpn/x86/pentium/mul_basecase.asm +@@ -140,3 +140,4 @@ L(done): + ret + + EPILOGUE() ++ASM_END() +diff --git a/mpn/x86/pentium/rshift.asm b/mpn/x86/pentium/rshift.asm +index 2105c4c..d98080d 100644 +--- a/mpn/x86/pentium/rshift.asm ++++ b/mpn/x86/pentium/rshift.asm +@@ -241,3 +241,4 @@ L(L1): movl %edx,(%edi) C store last limb + ret + + EPILOGUE() ++ASM_END() +diff --git a/mpn/x86/pentium/sqr_basecase.asm b/mpn/x86/pentium/sqr_basecase.asm +index b11d767..ee64eb3 100644 +--- a/mpn/x86/pentium/sqr_basecase.asm ++++ b/mpn/x86/pentium/sqr_basecase.asm +@@ -526,3 +526,4 @@ L(diag): + ret + + EPILOGUE() ++ASM_END() +diff --git a/mpn/x86/pentium4/copyd.asm b/mpn/x86/pentium4/copyd.asm +index 82af81c..bf06a05 100644 +--- 
a/mpn/x86/pentium4/copyd.asm ++++ b/mpn/x86/pentium4/copyd.asm +@@ -69,3 +69,4 @@ L(end): + ret + + EPILOGUE() ++ASM_END() +diff --git a/mpn/x86/pentium4/copyi.asm b/mpn/x86/pentium4/copyi.asm +index b614887..acbb3f4 100644 +--- a/mpn/x86/pentium4/copyi.asm ++++ b/mpn/x86/pentium4/copyi.asm +@@ -91,3 +91,4 @@ L(replmovs): + ret + + EPILOGUE() ++ASM_END() +diff --git a/mpn/x86/pentium4/mmx/popham.asm b/mpn/x86/pentium4/mmx/popham.asm +index 9563cb5..f7a6124 100644 +--- a/mpn/x86/pentium4/mmx/popham.asm ++++ b/mpn/x86/pentium4/mmx/popham.asm +@@ -201,3 +201,4 @@ L(loaded): + ret + + EPILOGUE() ++ASM_END() +diff --git a/mpn/x86/pentium4/sse2/add_n.asm b/mpn/x86/pentium4/sse2/add_n.asm +index 8e2380e..e329635 100644 +--- a/mpn/x86/pentium4/sse2/add_n.asm ++++ b/mpn/x86/pentium4/sse2/add_n.asm +@@ -99,3 +99,4 @@ L(top): + ret + + EPILOGUE() ++ASM_END() +diff --git a/mpn/x86/pentium4/sse2/addlsh1_n.asm b/mpn/x86/pentium4/sse2/addlsh1_n.asm +index 93b63b2..e801f7b 100644 +--- a/mpn/x86/pentium4/sse2/addlsh1_n.asm ++++ b/mpn/x86/pentium4/sse2/addlsh1_n.asm +@@ -106,3 +106,4 @@ L(top): + ret + + EPILOGUE() ++ASM_END() +diff --git a/mpn/x86/pentium4/sse2/addmul_1.asm b/mpn/x86/pentium4/sse2/addmul_1.asm +index 7810207..62a7675 100644 +--- a/mpn/x86/pentium4/sse2/addmul_1.asm ++++ b/mpn/x86/pentium4/sse2/addmul_1.asm +@@ -187,3 +187,4 @@ PROLOGUE(mpn_addmul_1c) + movd 20(%esp), %mm6 + jmp L(ent) + EPILOGUE() ++ASM_END() +diff --git a/mpn/x86/pentium4/sse2/cnd_add_n.asm b/mpn/x86/pentium4/sse2/cnd_add_n.asm +index b3f3474..7183b94 100644 +--- a/mpn/x86/pentium4/sse2/cnd_add_n.asm ++++ b/mpn/x86/pentium4/sse2/cnd_add_n.asm +@@ -93,3 +93,4 @@ L(top): movd (%ebx,%ecx,4), %mm2 + ret + + EPILOGUE() ++ASM_END() +diff --git a/mpn/x86/pentium4/sse2/cnd_sub_n.asm b/mpn/x86/pentium4/sse2/cnd_sub_n.asm +index 339a23e..ba0fc47 100644 +--- a/mpn/x86/pentium4/sse2/cnd_sub_n.asm ++++ b/mpn/x86/pentium4/sse2/cnd_sub_n.asm +@@ -112,3 +112,4 @@ L(done_mm1): + ret + + EPILOGUE() ++ASM_END() +diff --git a/mpn/x86/pentium4/sse2/divrem_1.asm b/mpn/x86/pentium4/sse2/divrem_1.asm +index 0146fab..d8619e0 100644 +--- a/mpn/x86/pentium4/sse2/divrem_1.asm ++++ b/mpn/x86/pentium4/sse2/divrem_1.asm +@@ -643,3 +643,4 @@ L(fraction_top): + jmp L(fraction_done) + + EPILOGUE() ++ASM_END() +diff --git a/mpn/x86/pentium4/sse2/mod_1_1.asm b/mpn/x86/pentium4/sse2/mod_1_1.asm +index ee88bab..2e5a514 100644 +--- a/mpn/x86/pentium4/sse2/mod_1_1.asm ++++ b/mpn/x86/pentium4/sse2/mod_1_1.asm +@@ -164,3 +164,4 @@ C CAUTION: This is the same code as in k7/mod_1_1.asm + pop %ebp + ret + EPILOGUE() ++ASM_END() +diff --git a/mpn/x86/pentium4/sse2/mod_1_4.asm b/mpn/x86/pentium4/sse2/mod_1_4.asm +index eb2edb6..5ef3c4a 100644 +--- a/mpn/x86/pentium4/sse2/mod_1_4.asm ++++ b/mpn/x86/pentium4/sse2/mod_1_4.asm +@@ -267,3 +267,4 @@ C CAUTION: This is the same code as in k7/mod_1_4.asm + pop %ebp + ret + EPILOGUE() ++ASM_END() +diff --git a/mpn/x86/pentium4/sse2/mod_34lsub1.asm b/mpn/x86/pentium4/sse2/mod_34lsub1.asm +index 31e25b7..5b6b9a7 100644 +--- a/mpn/x86/pentium4/sse2/mod_34lsub1.asm ++++ b/mpn/x86/pentium4/sse2/mod_34lsub1.asm +@@ -173,3 +173,4 @@ L(combine): + ret + + EPILOGUE() ++ASM_END() +diff --git a/mpn/x86/pentium4/sse2/mul_1.asm b/mpn/x86/pentium4/sse2/mul_1.asm +index 6347b8b..9e4f3fc 100644 +--- a/mpn/x86/pentium4/sse2/mul_1.asm ++++ b/mpn/x86/pentium4/sse2/mul_1.asm +@@ -162,3 +162,4 @@ PROLOGUE(mpn_mul_1c) + movd 20(%esp), %mm6 + jmp L(ent) + EPILOGUE() ++ASM_END() +diff --git a/mpn/x86/pentium4/sse2/mul_basecase.asm 
b/mpn/x86/pentium4/sse2/mul_basecase.asm +index 6e3775a..0bad756 100644 +--- a/mpn/x86/pentium4/sse2/mul_basecase.asm ++++ b/mpn/x86/pentium4/sse2/mul_basecase.asm +@@ -660,3 +660,4 @@ L(oel3): + pop %esi C 3 + ret C 3 + EPILOGUE() ++ASM_END() +diff --git a/mpn/x86/pentium4/sse2/rsh1add_n.asm b/mpn/x86/pentium4/sse2/rsh1add_n.asm +index f421d13..543a637 100644 +--- a/mpn/x86/pentium4/sse2/rsh1add_n.asm ++++ b/mpn/x86/pentium4/sse2/rsh1add_n.asm +@@ -124,3 +124,4 @@ L(done): + ret + + EPILOGUE() ++ASM_END() +diff --git a/mpn/x86/pentium4/sse2/sqr_basecase.asm b/mpn/x86/pentium4/sse2/sqr_basecase.asm +index 2dd57d2..9695d42 100644 +--- a/mpn/x86/pentium4/sse2/sqr_basecase.asm ++++ b/mpn/x86/pentium4/sse2/sqr_basecase.asm +@@ -703,3 +703,4 @@ L(diag): + ret + + EPILOGUE() ++ASM_END() +diff --git a/mpn/x86/pentium4/sse2/sub_n.asm b/mpn/x86/pentium4/sse2/sub_n.asm +index 5ba1c01..2cd5b22 100644 +--- a/mpn/x86/pentium4/sse2/sub_n.asm ++++ b/mpn/x86/pentium4/sse2/sub_n.asm +@@ -117,3 +117,4 @@ L(done_mm1): + ret + + EPILOGUE() ++ASM_END() +diff --git a/mpn/x86/pentium4/sse2/submul_1.asm b/mpn/x86/pentium4/sse2/submul_1.asm +index 020675b..1172f0a 100644 +--- a/mpn/x86/pentium4/sse2/submul_1.asm ++++ b/mpn/x86/pentium4/sse2/submul_1.asm +@@ -180,3 +180,4 @@ L(eod): paddq %mm6, %mm4 C add 0xFFFFFFFE00000001 + movd %mm0, 8(%edx) C result + jmp L(rt) + EPILOGUE() ++ASM_END() +diff --git a/mpn/x86/rshift.asm b/mpn/x86/rshift.asm +index a60dcaa..1cedc0d 100644 +--- a/mpn/x86/rshift.asm ++++ b/mpn/x86/rshift.asm +@@ -106,3 +106,4 @@ L(end): shrl %cl,%ebx C compute most significant limb + ret + + EPILOGUE() ++ASM_END() +diff --git a/mpn/x86/sec_tabselect.asm b/mpn/x86/sec_tabselect.asm +index c7c2e05..3a8fa17 100644 +--- a/mpn/x86/sec_tabselect.asm ++++ b/mpn/x86/sec_tabselect.asm +@@ -113,3 +113,4 @@ L(outer_end): + pop %edi + ret + EPILOGUE() ++ASM_END() +diff --git a/mpn/x86/sqr_basecase.asm b/mpn/x86/sqr_basecase.asm +index 39f8a89..3414b05 100644 +--- a/mpn/x86/sqr_basecase.asm ++++ b/mpn/x86/sqr_basecase.asm +@@ -357,3 +357,4 @@ L(diag): + ret + + EPILOGUE() ++ASM_END() +diff --git a/mpn/x86/udiv.asm b/mpn/x86/udiv.asm +index a3ee088..2531ef7 100644 +--- a/mpn/x86/udiv.asm ++++ b/mpn/x86/udiv.asm +@@ -50,3 +50,4 @@ deflit(`FRAME',0) + movl %edx, (%ecx) + ret + EPILOGUE() ++ASM_END() +diff --git a/mpn/x86/umul.asm b/mpn/x86/umul.asm +index 34fe434..5c1da35 100644 +--- a/mpn/x86/umul.asm ++++ b/mpn/x86/umul.asm +@@ -49,3 +49,4 @@ deflit(`FRAME',0) + movl %edx, %eax + ret + EPILOGUE() ++ASM_END() +diff --git a/mpn/x86/x86-defs.m4 b/mpn/x86/x86-defs.m4 +index 81309b2..b3520d2 100644 +--- a/mpn/x86/x86-defs.m4 ++++ b/mpn/x86/x86-defs.m4 +@@ -123,6 +123,7 @@ m4_assert_defined(`WANT_PROFILING') + TYPE($1,`function') + COFF_TYPE($1) + $1: ++ X86_ENDBR + ifelse(WANT_PROFILING,`prof', ` call_mcount') + ifelse(WANT_PROFILING,`gprof', ` call_mcount') + ifelse(WANT_PROFILING,`instrument',` call_instrument(enter)') +@@ -992,7 +993,11 @@ L(movl_eip_`'substr($2,1)): + + dnl ASM_END + +-define(`ASM_END',`load_eip') ++define(`ASM_END', ++`load_eip ++X86_GNU_PROPERTY ++') ++ + + define(`load_eip', `') dnl updated in LEA/LEAL + +diff --git a/mpn/x86_64/addaddmul_1msb0.asm b/mpn/x86_64/addaddmul_1msb0.asm +index 87c21b4..2d03ddb 100644 +--- a/mpn/x86_64/addaddmul_1msb0.asm ++++ b/mpn/x86_64/addaddmul_1msb0.asm +@@ -168,3 +168,4 @@ L(end): cmp $1, R32(n) + pop %r12 + ret + EPILOGUE() ++ASM_END() +diff --git a/mpn/x86_64/aorrlsh1_n.asm b/mpn/x86_64/aorrlsh1_n.asm +index 6ee0872..1441a6c 100644 +--- 
a/mpn/x86_64/aorrlsh1_n.asm ++++ b/mpn/x86_64/aorrlsh1_n.asm +@@ -168,3 +168,4 @@ ifdef(`OPERATION_rsblsh1_n',` + FUNC_EXIT() + ret + EPILOGUE() ++ASM_END() +diff --git a/mpn/x86_64/aorrlshC_n.asm b/mpn/x86_64/aorrlshC_n.asm +index de00154..691abde 100644 +--- a/mpn/x86_64/aorrlshC_n.asm ++++ b/mpn/x86_64/aorrlshC_n.asm +@@ -170,3 +170,4 @@ ifelse(ADDSUB,add,` + FUNC_EXIT() + ret + EPILOGUE() ++ASM_END() +diff --git a/mpn/x86_64/aorrlsh_n.asm b/mpn/x86_64/aorrlsh_n.asm +index 5ca128f..57f0e77 100644 +--- a/mpn/x86_64/aorrlsh_n.asm ++++ b/mpn/x86_64/aorrlsh_n.asm +@@ -174,3 +174,4 @@ L(end): add R32(%rbx), R32(%rbx) + FUNC_EXIT() + ret + EPILOGUE() ++ASM_END() +diff --git a/mpn/x86_64/aors_err1_n.asm b/mpn/x86_64/aors_err1_n.asm +index 54d0b3f..8c42ea1 100644 +--- a/mpn/x86_64/aors_err1_n.asm ++++ b/mpn/x86_64/aors_err1_n.asm +@@ -223,3 +223,4 @@ L(end): + pop %rbx + ret + EPILOGUE() ++ASM_END() +diff --git a/mpn/x86_64/aors_err2_n.asm b/mpn/x86_64/aors_err2_n.asm +index ce5c2a4..0227e5d 100644 +--- a/mpn/x86_64/aors_err2_n.asm ++++ b/mpn/x86_64/aors_err2_n.asm +@@ -170,3 +170,4 @@ L(end): + pop %rbx + ret + EPILOGUE() ++ASM_END() +diff --git a/mpn/x86_64/aors_err3_n.asm b/mpn/x86_64/aors_err3_n.asm +index bb6d0c5..37047db 100644 +--- a/mpn/x86_64/aors_err3_n.asm ++++ b/mpn/x86_64/aors_err3_n.asm +@@ -154,3 +154,4 @@ L(end): + pop %rbx + ret + EPILOGUE() ++ASM_END() +diff --git a/mpn/x86_64/aors_n.asm b/mpn/x86_64/aors_n.asm +index d5a314a..b516c4d 100644 +--- a/mpn/x86_64/aors_n.asm ++++ b/mpn/x86_64/aors_n.asm +@@ -176,3 +176,4 @@ L(end): lea 32(up), up + FUNC_EXIT() + ret + EPILOGUE() ++ASM_END() +diff --git a/mpn/x86_64/aorsmul_1.asm b/mpn/x86_64/aorsmul_1.asm +index dfe4dc4..e3bb2f9 100644 +--- a/mpn/x86_64/aorsmul_1.asm ++++ b/mpn/x86_64/aorsmul_1.asm +@@ -188,3 +188,4 @@ IFDOS(``pop %rdi '') + IFDOS(``pop %rsi '') + ret + EPILOGUE() ++ASM_END() +diff --git a/mpn/x86_64/atom/addmul_2.asm b/mpn/x86_64/atom/addmul_2.asm +index c1dcdc4..c1d9451 100644 +--- a/mpn/x86_64/atom/addmul_2.asm ++++ b/mpn/x86_64/atom/addmul_2.asm +@@ -184,3 +184,4 @@ L(end): mul v1 + FUNC_EXIT() + ret + EPILOGUE() ++ASM_END() +diff --git a/mpn/x86_64/atom/aorrlsh1_n.asm b/mpn/x86_64/atom/aorrlsh1_n.asm +index f44de19..693a302 100644 +--- a/mpn/x86_64/atom/aorrlsh1_n.asm ++++ b/mpn/x86_64/atom/aorrlsh1_n.asm +@@ -236,3 +236,4 @@ IFDOS(` mov 56(%rsp), %r8 ') + sbb R32(%rbp), R32(%rbp) C save acy + jmp L(ent) + EPILOGUE() ++ASM_END() +diff --git a/mpn/x86_64/atom/aorrlsh2_n.asm b/mpn/x86_64/atom/aorrlsh2_n.asm +index 02fb29d..c6ded74 100644 +--- a/mpn/x86_64/atom/aorrlsh2_n.asm ++++ b/mpn/x86_64/atom/aorrlsh2_n.asm +@@ -189,3 +189,4 @@ ifdef(`OPERATION_rsblsh2_n',` + FUNC_EXIT() + ret + EPILOGUE() ++ASM_END() +diff --git a/mpn/x86_64/atom/lshift.asm b/mpn/x86_64/atom/lshift.asm +index 1b37d5d..894b912 100644 +--- a/mpn/x86_64/atom/lshift.asm ++++ b/mpn/x86_64/atom/lshift.asm +@@ -121,3 +121,4 @@ L(end): shl R8(%rcx), %r10 + FUNC_EXIT() + ret + EPILOGUE() ++ASM_END() +diff --git a/mpn/x86_64/atom/lshiftc.asm b/mpn/x86_64/atom/lshiftc.asm +index 7385f8f..40d8fff 100644 +--- a/mpn/x86_64/atom/lshiftc.asm ++++ b/mpn/x86_64/atom/lshiftc.asm +@@ -125,3 +125,4 @@ L(end): shl R8(%rcx), %r10 + FUNC_EXIT() + ret + EPILOGUE() ++ASM_END() +diff --git a/mpn/x86_64/atom/mul_2.asm b/mpn/x86_64/atom/mul_2.asm +index 4bc22cd..87414d9 100644 +--- a/mpn/x86_64/atom/mul_2.asm ++++ b/mpn/x86_64/atom/mul_2.asm +@@ -188,3 +188,4 @@ L(end): mul v1 + FUNC_EXIT() + ret + EPILOGUE() ++ASM_END() +diff --git a/mpn/x86_64/atom/rsh1aors_n.asm 
b/mpn/x86_64/atom/rsh1aors_n.asm +index 6f5f638..f3952c0 100644 +--- a/mpn/x86_64/atom/rsh1aors_n.asm ++++ b/mpn/x86_64/atom/rsh1aors_n.asm +@@ -285,3 +285,4 @@ L(cj1): pop %r15 + FUNC_EXIT() + ret + EPILOGUE() ++ASM_END() +diff --git a/mpn/x86_64/atom/rshift.asm b/mpn/x86_64/atom/rshift.asm +index 29c027d..f4c59e1 100644 +--- a/mpn/x86_64/atom/rshift.asm ++++ b/mpn/x86_64/atom/rshift.asm +@@ -119,3 +119,4 @@ L(end): shr R8(cnt), %r10 + FUNC_EXIT() + ret + EPILOGUE() ++ASM_END() +diff --git a/mpn/x86_64/atom/sublsh1_n.asm b/mpn/x86_64/atom/sublsh1_n.asm +index 1306acd..762e1ee 100644 +--- a/mpn/x86_64/atom/sublsh1_n.asm ++++ b/mpn/x86_64/atom/sublsh1_n.asm +@@ -240,3 +240,4 @@ IFDOS(` mov 56(%rsp), %r8 ') + sbb R32(%rbp), R32(%rbp) C save acy + jmp L(ent) + EPILOGUE() ++ASM_END() +diff --git a/mpn/x86_64/bd1/addmul_2.asm b/mpn/x86_64/bd1/addmul_2.asm +index b54e91a..b1c149b 100644 +--- a/mpn/x86_64/bd1/addmul_2.asm ++++ b/mpn/x86_64/bd1/addmul_2.asm +@@ -233,3 +233,4 @@ L(end): mul v0 + FUNC_EXIT() + ret + EPILOGUE() ++ASM_END() +diff --git a/mpn/x86_64/bd1/hamdist.asm b/mpn/x86_64/bd1/hamdist.asm +index 29e78a3..f93ce4d 100644 +--- a/mpn/x86_64/bd1/hamdist.asm ++++ b/mpn/x86_64/bd1/hamdist.asm +@@ -204,3 +204,4 @@ DEF_OBJECT(L(cnsts),16,`JUMPTABSECT') + .byte 0x0f,0x0f,0x0f,0x0f,0x0f,0x0f,0x0f,0x0f + END_OBJECT(L(cnsts)) + ') ++ASM_END() +diff --git a/mpn/x86_64/bd1/mul_2.asm b/mpn/x86_64/bd1/mul_2.asm +index 85fa7aa..e910cee 100644 +--- a/mpn/x86_64/bd1/mul_2.asm ++++ b/mpn/x86_64/bd1/mul_2.asm +@@ -193,3 +193,4 @@ L(end): mov -8(up), %rax + FUNC_EXIT() + ret + EPILOGUE() ++ASM_END() +diff --git a/mpn/x86_64/bd1/mul_basecase.asm b/mpn/x86_64/bd1/mul_basecase.asm +index e47ba58..ebae74d 100644 +--- a/mpn/x86_64/bd1/mul_basecase.asm ++++ b/mpn/x86_64/bd1/mul_basecase.asm +@@ -414,3 +414,4 @@ L(ret2):pop %rbp + FUNC_EXIT() + ret + EPILOGUE() ++ASM_END() +diff --git a/mpn/x86_64/bd1/popcount.asm b/mpn/x86_64/bd1/popcount.asm +index 28ce461..063c2cc 100644 +--- a/mpn/x86_64/bd1/popcount.asm ++++ b/mpn/x86_64/bd1/popcount.asm +@@ -189,3 +189,4 @@ DEF_OBJECT(L(cnsts),16,`JUMPTABSECT') + .byte 0x0f,0x0f,0x0f,0x0f,0x0f,0x0f,0x0f,0x0f + END_OBJECT(L(cnsts)) + ') ++ASM_END() +diff --git a/mpn/x86_64/bd2/gcd_11.asm b/mpn/x86_64/bd2/gcd_11.asm +index b167077..3d1c788 100644 +--- a/mpn/x86_64/bd2/gcd_11.asm ++++ b/mpn/x86_64/bd2/gcd_11.asm +@@ -94,3 +94,4 @@ L(end): mov v0, %rax + FUNC_EXIT() + ret + EPILOGUE() ++ASM_END() +diff --git a/mpn/x86_64/bd2/gcd_22.asm b/mpn/x86_64/bd2/gcd_22.asm +index a4f30ea..b886678 100644 +--- a/mpn/x86_64/bd2/gcd_22.asm ++++ b/mpn/x86_64/bd2/gcd_22.asm +@@ -140,3 +140,4 @@ L(end): C mov v0, %rax + FUNC_EXIT() + ret + EPILOGUE() ++ASM_END() +diff --git a/mpn/x86_64/bd4/gcd_11.asm b/mpn/x86_64/bd4/gcd_11.asm +index 4176b85..d172e32 100644 +--- a/mpn/x86_64/bd4/gcd_11.asm ++++ b/mpn/x86_64/bd4/gcd_11.asm +@@ -94,3 +94,4 @@ L(end): C rax = result + FUNC_EXIT() + ret + EPILOGUE() ++ASM_END() +diff --git a/mpn/x86_64/bdiv_dbm1c.asm b/mpn/x86_64/bdiv_dbm1c.asm +index a53bd52..c383ee3 100644 +--- a/mpn/x86_64/bdiv_dbm1c.asm ++++ b/mpn/x86_64/bdiv_dbm1c.asm +@@ -104,3 +104,4 @@ L(lo1): sub %rax, %r8 + FUNC_EXIT() + ret + EPILOGUE() ++ASM_END() +diff --git a/mpn/x86_64/bdiv_q_1.asm b/mpn/x86_64/bdiv_q_1.asm +index 85538c9..c983c7f 100644 +--- a/mpn/x86_64/bdiv_q_1.asm ++++ b/mpn/x86_64/bdiv_q_1.asm +@@ -193,3 +193,4 @@ L(one): shr R8(%rcx), %rax + FUNC_EXIT() + ret + EPILOGUE() ++ASM_END() +diff --git a/mpn/x86_64/bt1/aors_n.asm b/mpn/x86_64/bt1/aors_n.asm +index 
9b6b5c7..04d81dd 100644 +--- a/mpn/x86_64/bt1/aors_n.asm ++++ b/mpn/x86_64/bt1/aors_n.asm +@@ -157,3 +157,4 @@ PROLOGUE(func_nc) + IFDOS(` mov 56(%rsp), %r8 ') + jmp L(ent) + EPILOGUE() ++ASM_END() +diff --git a/mpn/x86_64/bt1/aorsmul_1.asm b/mpn/x86_64/bt1/aorsmul_1.asm +index 41e1d8a..d309321 100644 +--- a/mpn/x86_64/bt1/aorsmul_1.asm ++++ b/mpn/x86_64/bt1/aorsmul_1.asm +@@ -189,3 +189,4 @@ IFDOS(` pop %rdi ') + IFDOS(` pop %rsi ') + ret + EPILOGUE() ++ASM_END() +diff --git a/mpn/x86_64/bt1/copyd.asm b/mpn/x86_64/bt1/copyd.asm +index 877714e..23fb80b 100644 +--- a/mpn/x86_64/bt1/copyd.asm ++++ b/mpn/x86_64/bt1/copyd.asm +@@ -89,3 +89,4 @@ L(end): cmp $-4, R32(n) + L(ret): FUNC_EXIT() + ret + EPILOGUE() ++ASM_END() +diff --git a/mpn/x86_64/bt1/copyi.asm b/mpn/x86_64/bt1/copyi.asm +index ee0f578..25718e6 100644 +--- a/mpn/x86_64/bt1/copyi.asm ++++ b/mpn/x86_64/bt1/copyi.asm +@@ -92,3 +92,4 @@ L(end): cmp $4, R32(n) + L(ret): FUNC_EXIT() + ret + EPILOGUE() ++ASM_END() +diff --git a/mpn/x86_64/bt1/gcd_11.asm b/mpn/x86_64/bt1/gcd_11.asm +index ef53392..03bc06d 100644 +--- a/mpn/x86_64/bt1/gcd_11.asm ++++ b/mpn/x86_64/bt1/gcd_11.asm +@@ -117,3 +117,4 @@ L(count_better): + bsf u0, cnt + jmp L(shr) + EPILOGUE() ++ASM_END() +diff --git a/mpn/x86_64/bt1/mul_1.asm b/mpn/x86_64/bt1/mul_1.asm +index 4394d6e..634cb35 100644 +--- a/mpn/x86_64/bt1/mul_1.asm ++++ b/mpn/x86_64/bt1/mul_1.asm +@@ -239,3 +239,4 @@ IFDOS(` pop %rdi ') + IFDOS(` pop %rsi ') + ret + EPILOGUE() ++ASM_END() +diff --git a/mpn/x86_64/bt1/mul_basecase.asm b/mpn/x86_64/bt1/mul_basecase.asm +index e7d46bf..1726190 100644 +--- a/mpn/x86_64/bt1/mul_basecase.asm ++++ b/mpn/x86_64/bt1/mul_basecase.asm +@@ -484,3 +484,4 @@ L(ret): pop %r13 + FUNC_EXIT() + ret + EPILOGUE() ++ASM_END() +diff --git a/mpn/x86_64/bt1/sqr_basecase.asm b/mpn/x86_64/bt1/sqr_basecase.asm +index 0e417a1..8f665d1 100644 +--- a/mpn/x86_64/bt1/sqr_basecase.asm ++++ b/mpn/x86_64/bt1/sqr_basecase.asm +@@ -563,3 +563,4 @@ L(esd): add %rbx, w0 + FUNC_EXIT() + ret + EPILOGUE() ++ASM_END() +diff --git a/mpn/x86_64/cnd_aors_n.asm b/mpn/x86_64/cnd_aors_n.asm +index 13a2ab3..b720ecb 100644 +--- a/mpn/x86_64/cnd_aors_n.asm ++++ b/mpn/x86_64/cnd_aors_n.asm +@@ -181,3 +181,4 @@ L(end): neg R32(%rax) + FUNC_EXIT() + ret + EPILOGUE() ++ASM_END() +diff --git a/mpn/x86_64/com.asm b/mpn/x86_64/com.asm +index 006acaf..ec72e19 100644 +--- a/mpn/x86_64/com.asm ++++ b/mpn/x86_64/com.asm +@@ -93,3 +93,4 @@ L(e10): movq 24(up,n,8), %r9 + L(ret): FUNC_EXIT() + ret + EPILOGUE() ++ASM_END() +diff --git a/mpn/x86_64/copyd.asm b/mpn/x86_64/copyd.asm +index a5e6e59..02ab53f 100644 +--- a/mpn/x86_64/copyd.asm ++++ b/mpn/x86_64/copyd.asm +@@ -91,3 +91,4 @@ L(end): shr R32(n) + mov %r9, -16(rp) + 1: ret + EPILOGUE() ++ASM_END() +diff --git a/mpn/x86_64/copyi.asm b/mpn/x86_64/copyi.asm +index bafce7a..8c6dbdc 100644 +--- a/mpn/x86_64/copyi.asm ++++ b/mpn/x86_64/copyi.asm +@@ -90,3 +90,4 @@ L(end): shr R32(n) + mov %r9, 16(rp) + 1: ret + EPILOGUE() ++ASM_END() +diff --git a/mpn/x86_64/core2/aors_err1_n.asm b/mpn/x86_64/core2/aors_err1_n.asm +index 3f875ae..c9c6c36 100644 +--- a/mpn/x86_64/core2/aors_err1_n.asm ++++ b/mpn/x86_64/core2/aors_err1_n.asm +@@ -223,3 +223,4 @@ L(end): + pop %rbx + ret + EPILOGUE() ++ASM_END() +diff --git a/mpn/x86_64/core2/aors_n.asm b/mpn/x86_64/core2/aors_n.asm +index f9e0039..7981b7f 100644 +--- a/mpn/x86_64/core2/aors_n.asm ++++ b/mpn/x86_64/core2/aors_n.asm +@@ -148,3 +148,4 @@ PROLOGUE(func_nc) + IFDOS(` mov 56(%rsp), %r8 ') + jmp L(start) + EPILOGUE() ++ASM_END() +diff 
--git a/mpn/x86_64/core2/aorsmul_1.asm b/mpn/x86_64/core2/aorsmul_1.asm +index a7a5d6e..b2b067a 100644 +--- a/mpn/x86_64/core2/aorsmul_1.asm ++++ b/mpn/x86_64/core2/aorsmul_1.asm +@@ -186,3 +186,4 @@ L(n1): mov 8(rp), %r10 + FUNC_EXIT() + ret + EPILOGUE() ++ASM_END() +diff --git a/mpn/x86_64/core2/divrem_1.asm b/mpn/x86_64/core2/divrem_1.asm +index 1b3f139..d41c494 100644 +--- a/mpn/x86_64/core2/divrem_1.asm ++++ b/mpn/x86_64/core2/divrem_1.asm +@@ -241,3 +241,4 @@ L(ret): pop %rbx + FUNC_EXIT() + ret + EPILOGUE() ++ASM_END() +diff --git a/mpn/x86_64/core2/gcd_11.asm b/mpn/x86_64/core2/gcd_11.asm +index b00451f..b730a55 100644 +--- a/mpn/x86_64/core2/gcd_11.asm ++++ b/mpn/x86_64/core2/gcd_11.asm +@@ -91,3 +91,4 @@ L(end): C rax = result + FUNC_EXIT() + ret + EPILOGUE() ++ASM_END() +diff --git a/mpn/x86_64/core2/gcd_22.asm b/mpn/x86_64/core2/gcd_22.asm +index b5aa73b..0ccde8a 100644 +--- a/mpn/x86_64/core2/gcd_22.asm ++++ b/mpn/x86_64/core2/gcd_22.asm +@@ -135,3 +135,4 @@ L(end): C mov v0, %rax + FUNC_EXIT() + ret + EPILOGUE() ++ASM_END() +diff --git a/mpn/x86_64/core2/hamdist.asm b/mpn/x86_64/core2/hamdist.asm +index a78753d..be451d7 100644 +--- a/mpn/x86_64/core2/hamdist.asm ++++ b/mpn/x86_64/core2/hamdist.asm +@@ -208,3 +208,4 @@ DEF_OBJECT(L(cnsts),16,`JUMPTABSECT') + .byte 0x0f,0x0f,0x0f,0x0f,0x0f,0x0f,0x0f,0x0f + .byte 0x0f,0x0f,0x0f,0x0f,0x0f,0x0f,0x0f,0x0f + END_OBJECT(L(cnsts)) ++ASM_END() +diff --git a/mpn/x86_64/core2/logops_n.asm b/mpn/x86_64/core2/logops_n.asm +index 5ff174c..451d556 100644 +--- a/mpn/x86_64/core2/logops_n.asm ++++ b/mpn/x86_64/core2/logops_n.asm +@@ -283,3 +283,4 @@ L(ret): FUNC_EXIT() + ret + EPILOGUE() + ') ++ASM_END() +diff --git a/mpn/x86_64/core2/lshift.asm b/mpn/x86_64/core2/lshift.asm +index 9016a71..62053c2 100644 +--- a/mpn/x86_64/core2/lshift.asm ++++ b/mpn/x86_64/core2/lshift.asm +@@ -143,3 +143,4 @@ L(1): shl R8(cnt), %r9 + FUNC_EXIT() + ret + EPILOGUE() ++ASM_END() +diff --git a/mpn/x86_64/core2/lshiftc.asm b/mpn/x86_64/core2/lshiftc.asm +index c428f13..cdd4e11 100644 +--- a/mpn/x86_64/core2/lshiftc.asm ++++ b/mpn/x86_64/core2/lshiftc.asm +@@ -157,3 +157,4 @@ L(1): shl R8(cnt), %r9 + FUNC_EXIT() + ret + EPILOGUE() ++ASM_END() +diff --git a/mpn/x86_64/core2/mul_basecase.asm b/mpn/x86_64/core2/mul_basecase.asm +index d16be85..0dcf0f8 100644 +--- a/mpn/x86_64/core2/mul_basecase.asm ++++ b/mpn/x86_64/core2/mul_basecase.asm +@@ -347,6 +347,7 @@ L(m2e0):mul v1 + jz L(ret2) + + L(do_am0): ++ X86_ENDBR + push %r15 + push vn_param + +@@ -520,6 +521,7 @@ L(m2e1):mul v1 + jz L(ret2) + + L(do_am1): ++ X86_ENDBR + push %r15 + push vn_param + +@@ -693,6 +695,7 @@ L(m2e2):mul v1 + jz L(ret2) + + L(do_am2): ++ X86_ENDBR + push %r15 + push vn_param + +@@ -866,6 +869,7 @@ L(m2e3):mul v1 + jz L(ret2) + + L(do_am3): ++ X86_ENDBR + push %r15 + push vn_param + +@@ -973,3 +977,4 @@ L(lo3): mul v0 + FUNC_EXIT() + ret + EPILOGUE() ++ASM_END() +diff --git a/mpn/x86_64/core2/mullo_basecase.asm b/mpn/x86_64/core2/mullo_basecase.asm +index 0f03d86..11814d5 100644 +--- a/mpn/x86_64/core2/mullo_basecase.asm ++++ b/mpn/x86_64/core2/mullo_basecase.asm +@@ -425,3 +425,4 @@ L(n3): mov (vp_param), %r9 + FUNC_EXIT() + ret + EPILOGUE() ++ASM_END() +diff --git a/mpn/x86_64/core2/popcount.asm b/mpn/x86_64/core2/popcount.asm +index 39d8c5d..5e03ef3 100644 +--- a/mpn/x86_64/core2/popcount.asm ++++ b/mpn/x86_64/core2/popcount.asm +@@ -183,3 +183,4 @@ DEF_OBJECT(L(cnsts),16,`JUMPTABSECT') + .byte 0x0f,0x0f,0x0f,0x0f,0x0f,0x0f,0x0f,0x0f + .byte 0x0f,0x0f,0x0f,0x0f,0x0f,0x0f,0x0f,0x0f + 
END_OBJECT(L(cnsts)) ++ASM_END() +diff --git a/mpn/x86_64/core2/rsh1aors_n.asm b/mpn/x86_64/core2/rsh1aors_n.asm +index 27eed37..5b4fe7e 100644 +--- a/mpn/x86_64/core2/rsh1aors_n.asm ++++ b/mpn/x86_64/core2/rsh1aors_n.asm +@@ -167,3 +167,4 @@ L(end): shrd $1, %rbx, %rbp + FUNC_EXIT() + ret + EPILOGUE() ++ASM_END() +diff --git a/mpn/x86_64/core2/rshift.asm b/mpn/x86_64/core2/rshift.asm +index 7578a53..86cc804 100644 +--- a/mpn/x86_64/core2/rshift.asm ++++ b/mpn/x86_64/core2/rshift.asm +@@ -141,3 +141,4 @@ L(1): shr R8(cnt), %r9 + FUNC_EXIT() + ret + EPILOGUE() ++ASM_END() +diff --git a/mpn/x86_64/core2/sqr_basecase.asm b/mpn/x86_64/core2/sqr_basecase.asm +index a112c1b..65286b0 100644 +--- a/mpn/x86_64/core2/sqr_basecase.asm ++++ b/mpn/x86_64/core2/sqr_basecase.asm +@@ -982,3 +982,4 @@ L(n3): mov %rax, %r10 + FUNC_EXIT() + ret + EPILOGUE() ++ASM_END() +diff --git a/mpn/x86_64/core2/sublshC_n.asm b/mpn/x86_64/core2/sublshC_n.asm +index 272700d..e30562b 100644 +--- a/mpn/x86_64/core2/sublshC_n.asm ++++ b/mpn/x86_64/core2/sublshC_n.asm +@@ -156,3 +156,4 @@ L(end): shr $RSH, %r11 + FUNC_EXIT() + ret + EPILOGUE() ++ASM_END() +diff --git a/mpn/x86_64/coreibwl/addmul_1.asm b/mpn/x86_64/coreibwl/addmul_1.asm +index ee7e4ee..4ea5580 100644 +--- a/mpn/x86_64/coreibwl/addmul_1.asm ++++ b/mpn/x86_64/coreibwl/addmul_1.asm +@@ -110,33 +110,39 @@ L(tab): JMPENT( L(f0), L(tab)) + JMPENT( L(f7), L(tab)) + TEXT + +-L(f0): mulx( (up), %r10, %r8) ++L(f0): X86_ENDBR ++ mulx( (up), %r10, %r8) + lea -8(up), up + lea -8(rp), rp + lea -1(n), n + jmp L(b0) + +-L(f3): mulx( (up), %r9, %rax) ++L(f3): X86_ENDBR ++ mulx( (up), %r9, %rax) + lea 16(up), up + lea -48(rp), rp + jmp L(b3) + +-L(f4): mulx( (up), %r10, %r8) ++L(f4): X86_ENDBR ++ mulx( (up), %r10, %r8) + lea 24(up), up + lea -40(rp), rp + jmp L(b4) + +-L(f5): mulx( (up), %r9, %rax) ++L(f5): X86_ENDBR ++ mulx( (up), %r9, %rax) + lea 32(up), up + lea -32(rp), rp + jmp L(b5) + +-L(f6): mulx( (up), %r10, %r8) ++L(f6): X86_ENDBR ++ mulx( (up), %r10, %r8) + lea 40(up), up + lea -24(rp), rp + jmp L(b6) + +-L(f1): mulx( (up), %r9, %rax) ++L(f1): X86_ENDBR ++ mulx( (up), %r9, %rax) + jrcxz L(1) + jmp L(b1) + L(1): add (rp), %r9 +@@ -156,7 +162,8 @@ ifdef(`PIC', + ` nop;nop;nop;nop', + ` nop;nop;nop;nop;nop;nop;nop;nop;nop;nop;nop') + +-L(f2): mulx( (up), %r10, %r8) ++L(f2): X86_ENDBR ++ mulx( (up), %r10, %r8) + lea 8(up), up + lea 8(rp), rp + mulx( (up), %r9, %rax) +@@ -200,7 +207,8 @@ L(b3): adox( 48,(rp), %r9) + mulx( (up), %r9, %rax) + jmp L(top) + +-L(f7): mulx( (up), %r9, %rax) ++L(f7): X86_ENDBR ++ mulx( (up), %r9, %rax) + lea -16(up), up + lea -16(rp), rp + jmp L(b7) +diff --git a/mpn/x86_64/coreibwl/mul_1.asm b/mpn/x86_64/coreibwl/mul_1.asm +index b7fae2f..77121a5 100644 +--- a/mpn/x86_64/coreibwl/mul_1.asm ++++ b/mpn/x86_64/coreibwl/mul_1.asm +@@ -108,48 +108,56 @@ L(tab): JMPENT( L(f0), L(tab)) + JMPENT( L(f7), L(tab)) + TEXT + +-L(f0): mulx( (up), %r10, %r8) ++L(f0): X86_ENDBR ++ mulx( (up), %r10, %r8) + lea 56(up), up + lea -8(rp), rp + jmp L(b0) + +-L(f3): mulx( (up), %r9, %rax) ++L(f3): X86_ENDBR ++ mulx( (up), %r9, %rax) + lea 16(up), up + lea 16(rp), rp + inc n + jmp L(b3) + +-L(f4): mulx( (up), %r10, %r8) ++L(f4): X86_ENDBR ++ mulx( (up), %r10, %r8) + lea 24(up), up + lea 24(rp), rp + inc n + jmp L(b4) + +-L(f5): mulx( (up), %r9, %rax) ++L(f5): X86_ENDBR ++ mulx( (up), %r9, %rax) + lea 32(up), up + lea 32(rp), rp + inc n + jmp L(b5) + +-L(f6): mulx( (up), %r10, %r8) ++L(f6): X86_ENDBR ++ mulx( (up), %r10, %r8) + lea 40(up), up + lea 40(rp), rp + inc n + 
jmp L(b6) + +-L(f7): mulx( (up), %r9, %rax) ++L(f7): X86_ENDBR ++ mulx( (up), %r9, %rax) + lea 48(up), up + lea 48(rp), rp + inc n + jmp L(b7) + +-L(f1): mulx( (up), %r9, %rax) ++L(f1): X86_ENDBR ++ mulx( (up), %r9, %rax) + test n, n + jnz L(b1) + L(1): mov %r9, (rp) + ret + +-L(f2): mulx( (up), %r10, %r8) ++L(f2): X86_ENDBR ++ mulx( (up), %r10, %r8) + lea 8(up), up + lea 8(rp), rp + mulx( (up), %r9, %rax) +diff --git a/mpn/x86_64/coreibwl/mul_basecase.asm b/mpn/x86_64/coreibwl/mul_basecase.asm +index 42ca976..c5e60e7 100644 +--- a/mpn/x86_64/coreibwl/mul_basecase.asm ++++ b/mpn/x86_64/coreibwl/mul_basecase.asm +@@ -157,45 +157,53 @@ ifdef(`PIC', + jmp *(%r10,%rax,8) + ') + +-L(mf0): mulx( (up), w2, w3) ++L(mf0): X86_ENDBR ++ mulx( (up), w2, w3) + lea 56(up), up + lea -8(rp), rp + jmp L(mb0) + +-L(mf3): mulx( (up), w0, w1) ++L(mf3): X86_ENDBR ++ mulx( (up), w0, w1) + lea 16(up), up + lea 16(rp), rp + inc n + jmp L(mb3) + +-L(mf4): mulx( (up), w2, w3) ++L(mf4): X86_ENDBR ++ mulx( (up), w2, w3) + lea 24(up), up + lea 24(rp), rp + inc n + jmp L(mb4) + +-L(mf5): mulx( (up), w0, w1) ++L(mf5): X86_ENDBR ++ mulx( (up), w0, w1) + lea 32(up), up + lea 32(rp), rp + inc n + jmp L(mb5) + +-L(mf6): mulx( (up), w2, w3) ++L(mf6): X86_ENDBR ++ mulx( (up), w2, w3) + lea 40(up), up + lea 40(rp), rp + inc n + jmp L(mb6) + +-L(mf7): mulx( (up), w0, w1) ++L(mf7): X86_ENDBR ++ mulx( (up), w0, w1) + lea 48(up), up + lea 48(rp), rp + inc n + jmp L(mb7) + +-L(mf1): mulx( (up), w0, w1) ++L(mf1): X86_ENDBR ++ mulx( (up), w0, w1) + jmp L(mb1) + +-L(mf2): mulx( (up), w2, w3) ++L(mf2): X86_ENDBR ++ mulx( (up), w2, w3) + lea 8(up), up + lea 8(rp), rp + mulx( (up), w0, w1) +@@ -256,32 +264,39 @@ L(outer): + lea 8(vp), vp + jmp *jaddr + +-L(f0): mulx( 8,(up), w2, w3) ++L(f0): X86_ENDBR ++ mulx( 8,(up), w2, w3) + lea 8(rp,unneg,8), rp + lea -1(n), n + jmp L(b0) + +-L(f3): mulx( -16,(up), w0, w1) ++L(f3): X86_ENDBR ++ mulx( -16,(up), w0, w1) + lea -56(rp,unneg,8), rp + jmp L(b3) + +-L(f4): mulx( -24,(up), w2, w3) ++L(f4): X86_ENDBR ++ mulx( -24,(up), w2, w3) + lea -56(rp,unneg,8), rp + jmp L(b4) + +-L(f5): mulx( -32,(up), w0, w1) ++L(f5): X86_ENDBR ++ mulx( -32,(up), w0, w1) + lea -56(rp,unneg,8), rp + jmp L(b5) + +-L(f6): mulx( -40,(up), w2, w3) ++L(f6): X86_ENDBR ++ mulx( -40,(up), w2, w3) + lea -56(rp,unneg,8), rp + jmp L(b6) + +-L(f7): mulx( 16,(up), w0, w1) ++L(f7): X86_ENDBR ++ mulx( 16,(up), w0, w1) + lea 8(rp,unneg,8), rp + jmp L(b7) + +-L(f1): mulx( (up), w0, w1) ++L(f1): X86_ENDBR ++ mulx( (up), w0, w1) + lea 8(rp,unneg,8), rp + jmp L(b1) + +@@ -303,6 +318,7 @@ L(done): + ret + + L(f2): ++ X86_ENDBR + mulx( -8,(up), w2, w3) + lea 8(rp,unneg,8), rp + mulx( (up), w0, w1) +@@ -367,3 +383,4 @@ L(atab):JMPENT( L(f0), L(atab)) + JMPENT( L(f7), L(atab)) + TEXT + EPILOGUE() ++ASM_END() +diff --git a/mpn/x86_64/coreibwl/mullo_basecase.asm b/mpn/x86_64/coreibwl/mullo_basecase.asm +index 5cdb209..b3e435b 100644 +--- a/mpn/x86_64/coreibwl/mullo_basecase.asm ++++ b/mpn/x86_64/coreibwl/mullo_basecase.asm +@@ -393,3 +393,4 @@ L(mtab):JMPENT( L(mf7), L(mtab)) + JMPENT( L(mf4), L(mtab)) + JMPENT( L(mf5), L(mtab)) + JMPENT( L(mf6), L(mtab)) ++ASM_END() +diff --git a/mpn/x86_64/coreibwl/sqr_basecase.asm b/mpn/x86_64/coreibwl/sqr_basecase.asm +index e81b01b..cd523cf 100644 +--- a/mpn/x86_64/coreibwl/sqr_basecase.asm ++++ b/mpn/x86_64/coreibwl/sqr_basecase.asm +@@ -181,14 +181,16 @@ ifdef(`PIC', + jmp *(%r10,%rax,8) + ') + +-L(mf0): mulx( u0, w0, w1) C up[0]^2 ++L(mf0): X86_ENDBR ++ mulx( u0, w0, w1) C up[0]^2 + add u0, u0 + mulx( 
8,(up), w2, w3) + lea 64(up), up + add w1, w2 + jmp L(mb0) + +-L(mf3): mulx( u0, w2, w3) C up[0]^2 ++L(mf3): X86_ENDBR ++ mulx( u0, w2, w3) C up[0]^2 + add u0, u0 + mov w2, (rp) + mulx( 8,(up), w0, w1) +@@ -197,7 +199,8 @@ L(mf3): mulx( u0, w2, w3) C up[0]^2 + add w3, w0 + jmp L(mb3) + +-L(mf4): mulx( u0, w0, w1) C up[0]^2 ++L(mf4): X86_ENDBR ++ mulx( u0, w0, w1) C up[0]^2 + add u0, u0 + mulx( 8,(up), w2, w3) + mov w0, (rp) +@@ -206,7 +209,8 @@ L(mf4): mulx( u0, w0, w1) C up[0]^2 + add w1, w2 + jmp L(mb4) + +-L(mf5): mulx( u0, w2, w3) C up[0]^2 ++L(mf5): X86_ENDBR ++ mulx( u0, w2, w3) C up[0]^2 + add u0, u0 + mulx( 8,(up), w0, w1) + mov w2, (rp) +@@ -215,7 +219,8 @@ L(mf5): mulx( u0, w2, w3) C up[0]^2 + add w3, w0 + jmp L(mb5) + +-L(mf6): mulx( u0, w0, w1) C up[0]^2 ++L(mf6): X86_ENDBR ++ mulx( u0, w0, w1) C up[0]^2 + add u0, u0 + mulx( 8,(up), w2, w3) + mov w0, (rp) +@@ -224,7 +229,8 @@ L(mf6): mulx( u0, w0, w1) C up[0]^2 + add w1, w2 + jmp L(mb6) + +-L(mf7): mulx( u0, w2, w3) C up[0]^2 ++L(mf7): X86_ENDBR ++ mulx( u0, w2, w3) C up[0]^2 + add u0, u0 + mulx( 8,(up), w0, w1) + mov w2, (rp) +@@ -233,7 +239,8 @@ L(mf7): mulx( u0, w2, w3) C up[0]^2 + add w3, w0 + jmp L(mb7) + +-L(mf1): mulx( u0, w2, w3) C up[0]^2 ++L(mf1): X86_ENDBR ++ mulx( u0, w2, w3) C up[0]^2 + add u0, u0 + mulx( 8,(up), w0, w1) + mov w2, (rp) +@@ -242,7 +249,8 @@ L(mf1): mulx( u0, w2, w3) C up[0]^2 + add w3, w0 + jmp L(mb1) + +-L(mf2): mulx( u0, w0, w1) C up[0]^2 ++L(mf2): X86_ENDBR ++ mulx( u0, w0, w1) C up[0]^2 + add u0, u0 + mulx( 8,(up), w2, w3) + mov w0, (rp) +@@ -300,7 +308,8 @@ ifdef(`PIC', + + L(ed0): adox( (rp), w0) + adox( %rcx, w1) C relies on rcx = 0 +-L(f7): mov w0, (rp) ++L(f7): X86_ENDBR ++ mov w0, (rp) + adc %rcx, w1 C relies on rcx = 0 + mov w1, 8(rp) + lea -64(up,un_save,8), up +@@ -356,7 +365,8 @@ L(b0): mov w0, (rp) + + L(ed1): adox( (rp), w0) + adox( %rcx, w1) C relies on rcx = 0 +-L(f0): mov w0, (rp) ++L(f0): X86_ENDBR ++ mov w0, (rp) + adc %rcx, w1 C relies on rcx = 0 + mov w1, 8(rp) + lea -64(up,un_save,8), up +@@ -415,7 +425,8 @@ L(b1): mulx( 8,(up), w2, w3) + + L(ed2): adox( (rp), w0) + adox( %rcx, w1) C relies on rcx = 0 +-L(f1): mov w0, (rp) ++L(f1): X86_ENDBR ++ mov w0, (rp) + adc %rcx, w1 C relies on rcx = 0 + mov w1, 8(rp) + lea (up,un_save,8), up +@@ -477,7 +488,8 @@ L(b2): adox( 48,(rp), w0) + + L(ed3): adox( (rp), w0) + adox( %rcx, w1) C relies on rcx = 0 +-L(f2): mov w0, (rp) ++L(f2): X86_ENDBR ++ mov w0, (rp) + adc %rcx, w1 C relies on rcx = 0 + mov w1, 8(rp) + lea (up,un_save,8), up +@@ -535,7 +547,8 @@ L(b3): mulx( -16,(up), w0, w1) + + L(ed4): adox( (rp), w0) + adox( %rcx, w1) C relies on rcx = 0 +-L(f3): mov w0, (rp) ++L(f3): X86_ENDBR ++ mov w0, (rp) + adc %rcx, w1 C relies on rcx = 0 + mov w1, 8(rp) + lea (up,un_save,8), up +@@ -592,7 +605,8 @@ L(b4): mulx( -24,(up), w2, w3) + + L(ed5): adox( (rp), w0) + adox( %rcx, w1) C relies on rcx = 0 +-L(f4): mov w0, (rp) ++L(f4): X86_ENDBR ++ mov w0, (rp) + adc %rcx, w1 C relies on rcx = 0 + mov w1, 8(rp) + lea (up,un_save,8), up +@@ -649,7 +663,8 @@ L(b5): mulx( -32,(up), w0, w1) + + L(ed6): adox( (rp), w0) + adox( %rcx, w1) C relies on rcx = 0 +-L(f5): mov w0, (rp) ++L(f5): X86_ENDBR ++ mov w0, (rp) + adc %rcx, w1 C relies on rcx = 0 + mov w1, 8(rp) + lea (up,un_save,8), up +@@ -706,7 +721,8 @@ L(b6): adcx( w1, w2) + + L(ed7): adox( (rp), w0) + adox( %rcx, w1) C relies on rcx = 0 +-L(f6): mov w0, (rp) ++L(f6): X86_ENDBR ++ mov w0, (rp) + adc %rcx, w1 C relies on rcx = 0 + mov w1, 8(rp) + lea (up,un_save,8), up +@@ -837,3 +853,4 @@ 
L(atab):JMPENT( L(f6), L(atab)) + JMPENT( L(f5), L(atab)) + TEXT + EPILOGUE() ++ASM_END() +diff --git a/mpn/x86_64/coreihwl/addmul_2.asm b/mpn/x86_64/coreihwl/addmul_2.asm +index 9d1c405..322037e 100644 +--- a/mpn/x86_64/coreihwl/addmul_2.asm ++++ b/mpn/x86_64/coreihwl/addmul_2.asm +@@ -239,3 +239,4 @@ L(end): mulx( v0, %rax, w3) + FUNC_EXIT() + ret + EPILOGUE() ++ASM_END() +diff --git a/mpn/x86_64/coreihwl/aors_n.asm b/mpn/x86_64/coreihwl/aors_n.asm +index fc99627..f9d89f7 100644 +--- a/mpn/x86_64/coreihwl/aors_n.asm ++++ b/mpn/x86_64/coreihwl/aors_n.asm +@@ -259,3 +259,4 @@ L(tab): JMPENT( L(0), L(tab)) + JMPENT( L(5), L(tab)) + JMPENT( L(6), L(tab)) + JMPENT( L(7), L(tab)) ++ASM_END() +diff --git a/mpn/x86_64/coreihwl/aorsmul_1.asm b/mpn/x86_64/coreihwl/aorsmul_1.asm +index 3f43afa..d01c941 100644 +--- a/mpn/x86_64/coreihwl/aorsmul_1.asm ++++ b/mpn/x86_64/coreihwl/aorsmul_1.asm +@@ -199,3 +199,4 @@ L(ret): pop %r13 + FUNC_EXIT() + ret + EPILOGUE() ++ASM_END() +diff --git a/mpn/x86_64/coreihwl/gcd_22.asm b/mpn/x86_64/coreihwl/gcd_22.asm +index b5863b6..e41731e 100644 +--- a/mpn/x86_64/coreihwl/gcd_22.asm ++++ b/mpn/x86_64/coreihwl/gcd_22.asm +@@ -136,3 +136,4 @@ L(end): mov v0, %rax + L(ret): FUNC_EXIT() + ret + EPILOGUE() ++ASM_END() +diff --git a/mpn/x86_64/coreihwl/mul_2.asm b/mpn/x86_64/coreihwl/mul_2.asm +index f1f044f..f48e5d8 100644 +--- a/mpn/x86_64/coreihwl/mul_2.asm ++++ b/mpn/x86_64/coreihwl/mul_2.asm +@@ -174,3 +174,4 @@ L(end): mulx( v1, %rdx, %rax) + FUNC_EXIT() + ret + EPILOGUE() ++ASM_END() +diff --git a/mpn/x86_64/coreihwl/mul_basecase.asm b/mpn/x86_64/coreihwl/mul_basecase.asm +index b2656c8..14826e8 100644 +--- a/mpn/x86_64/coreihwl/mul_basecase.asm ++++ b/mpn/x86_64/coreihwl/mul_basecase.asm +@@ -439,3 +439,4 @@ L(ret2):pop %rbp + FUNC_EXIT() + ret + EPILOGUE() ++ASM_END() +diff --git a/mpn/x86_64/coreihwl/mullo_basecase.asm b/mpn/x86_64/coreihwl/mullo_basecase.asm +index e65559b..b29352c 100644 +--- a/mpn/x86_64/coreihwl/mullo_basecase.asm ++++ b/mpn/x86_64/coreihwl/mullo_basecase.asm +@@ -420,3 +420,4 @@ L(n3): mov (vp), %r9 + FUNC_EXIT() + ret + EPILOGUE() ++ASM_END() +diff --git a/mpn/x86_64/coreihwl/redc_1.asm b/mpn/x86_64/coreihwl/redc_1.asm +index b1d6c0a..3b09a73 100644 +--- a/mpn/x86_64/coreihwl/redc_1.asm ++++ b/mpn/x86_64/coreihwl/redc_1.asm +@@ -435,3 +435,4 @@ L(ret): pop %r15 + FUNC_EXIT() + ret + EPILOGUE() ++ASM_END() +diff --git a/mpn/x86_64/coreihwl/sqr_basecase.asm b/mpn/x86_64/coreihwl/sqr_basecase.asm +index 641cdf3..b6ea890 100644 +--- a/mpn/x86_64/coreihwl/sqr_basecase.asm ++++ b/mpn/x86_64/coreihwl/sqr_basecase.asm +@@ -504,3 +504,4 @@ L(dend):adc %rbx, %rdx + FUNC_EXIT() + ret + EPILOGUE() ++ASM_END() +diff --git a/mpn/x86_64/coreinhm/aorrlsh_n.asm b/mpn/x86_64/coreinhm/aorrlsh_n.asm +index eed64e7..3f25eea 100644 +--- a/mpn/x86_64/coreinhm/aorrlsh_n.asm ++++ b/mpn/x86_64/coreinhm/aorrlsh_n.asm +@@ -198,3 +198,4 @@ IFDOS(` mov 64(%rsp), %r9 ') C cy + sbb R32(%rbx), R32(%rbx) C initialise CF save register + jmp L(ent) + EPILOGUE() ++ASM_END() +diff --git a/mpn/x86_64/coreinhm/hamdist.asm b/mpn/x86_64/coreinhm/hamdist.asm +index a5a63e4..a84bcbc 100644 +--- a/mpn/x86_64/coreinhm/hamdist.asm ++++ b/mpn/x86_64/coreinhm/hamdist.asm +@@ -194,3 +194,4 @@ L(tab): JMPENT( L(0), L(tab)) + JMPENT( L(1), L(tab)) + JMPENT( L(2), L(tab)) + JMPENT( L(3), L(tab)) ++ASM_END() +diff --git a/mpn/x86_64/coreinhm/popcount.asm b/mpn/x86_64/coreinhm/popcount.asm +index 0a3c867..24c4ebc 100644 +--- a/mpn/x86_64/coreinhm/popcount.asm ++++ 
b/mpn/x86_64/coreinhm/popcount.asm +@@ -180,3 +180,4 @@ L(tab): JMPENT( L(0), L(tab)) + JMPENT( L(5), L(tab)) + JMPENT( L(6), L(tab)) + JMPENT( L(7), L(tab)) ++ASM_END() +diff --git a/mpn/x86_64/coreisbr/addmul_2.asm b/mpn/x86_64/coreisbr/addmul_2.asm +index 21f0bf4..45c7b15 100644 +--- a/mpn/x86_64/coreisbr/addmul_2.asm ++++ b/mpn/x86_64/coreisbr/addmul_2.asm +@@ -222,3 +222,4 @@ L(end): mul v1 + FUNC_EXIT() + ret + EPILOGUE() ++ASM_END() +diff --git a/mpn/x86_64/coreisbr/aorrlshC_n.asm b/mpn/x86_64/coreisbr/aorrlshC_n.asm +index 23ace41..6af7da8 100644 +--- a/mpn/x86_64/coreisbr/aorrlshC_n.asm ++++ b/mpn/x86_64/coreisbr/aorrlshC_n.asm +@@ -171,3 +171,4 @@ L(end): shr $RSH, %rbp + FUNC_EXIT() + ret + EPILOGUE() ++ASM_END() +diff --git a/mpn/x86_64/coreisbr/aorrlsh_n.asm b/mpn/x86_64/coreisbr/aorrlsh_n.asm +index db8ee68..56ca497 100644 +--- a/mpn/x86_64/coreisbr/aorrlsh_n.asm ++++ b/mpn/x86_64/coreisbr/aorrlsh_n.asm +@@ -213,3 +213,4 @@ IFDOS(` mov 64(%rsp), %r9 ') C cy + sbb R32(%rbx), R32(%rbx) C initialise CF save register + jmp L(ent) + EPILOGUE() ++ASM_END() +diff --git a/mpn/x86_64/coreisbr/aors_n.asm b/mpn/x86_64/coreisbr/aors_n.asm +index 61fee3e..d466248 100644 +--- a/mpn/x86_64/coreisbr/aors_n.asm ++++ b/mpn/x86_64/coreisbr/aors_n.asm +@@ -201,3 +201,4 @@ PROLOGUE(func_nc) + IFDOS(` mov 56(%rsp), %r8 ') + jmp L(ent) + EPILOGUE() ++ASM_END() +diff --git a/mpn/x86_64/coreisbr/cnd_add_n.asm b/mpn/x86_64/coreisbr/cnd_add_n.asm +index 43abcc8..3d72bf8 100644 +--- a/mpn/x86_64/coreisbr/cnd_add_n.asm ++++ b/mpn/x86_64/coreisbr/cnd_add_n.asm +@@ -172,3 +172,4 @@ L(end): neg R32(%rax) + FUNC_EXIT() + ret + EPILOGUE() ++ASM_END() +diff --git a/mpn/x86_64/coreisbr/cnd_sub_n.asm b/mpn/x86_64/coreisbr/cnd_sub_n.asm +index f55492b..3371269 100644 +--- a/mpn/x86_64/coreisbr/cnd_sub_n.asm ++++ b/mpn/x86_64/coreisbr/cnd_sub_n.asm +@@ -198,3 +198,4 @@ L(end): neg R32(%rax) + FUNC_EXIT() + ret + EPILOGUE() ++ASM_END() +diff --git a/mpn/x86_64/coreisbr/mul_1.asm b/mpn/x86_64/coreisbr/mul_1.asm +index a43a117..1f17293 100644 +--- a/mpn/x86_64/coreisbr/mul_1.asm ++++ b/mpn/x86_64/coreisbr/mul_1.asm +@@ -197,3 +197,4 @@ L(00c): add cin, %r10 + mov 8(up,n,8), %rax + jmp L(L0c) + EPILOGUE() ++ASM_END() +diff --git a/mpn/x86_64/coreisbr/mul_2.asm b/mpn/x86_64/coreisbr/mul_2.asm +index 781534d..10f1769 100644 +--- a/mpn/x86_64/coreisbr/mul_2.asm ++++ b/mpn/x86_64/coreisbr/mul_2.asm +@@ -165,3 +165,4 @@ L(end): mul v0 + FUNC_EXIT() + ret + EPILOGUE() ++ASM_END() +diff --git a/mpn/x86_64/coreisbr/mul_basecase.asm b/mpn/x86_64/coreisbr/mul_basecase.asm +index 35fd1cc..d5c7e5b 100644 +--- a/mpn/x86_64/coreisbr/mul_basecase.asm ++++ b/mpn/x86_64/coreisbr/mul_basecase.asm +@@ -405,3 +405,4 @@ L(ret2):pop %rbp + FUNC_EXIT() + ret + EPILOGUE() ++ASM_END() +diff --git a/mpn/x86_64/coreisbr/mullo_basecase.asm b/mpn/x86_64/coreisbr/mullo_basecase.asm +index a41a8ac..acf7776 100644 +--- a/mpn/x86_64/coreisbr/mullo_basecase.asm ++++ b/mpn/x86_64/coreisbr/mullo_basecase.asm +@@ -382,3 +382,4 @@ L(n3): mov (vp_param), %r9 + FUNC_EXIT() + ret + EPILOGUE() ++ASM_END() +diff --git a/mpn/x86_64/coreisbr/rsh1aors_n.asm b/mpn/x86_64/coreisbr/rsh1aors_n.asm +index fd2eaea..eefad99 100644 +--- a/mpn/x86_64/coreisbr/rsh1aors_n.asm ++++ b/mpn/x86_64/coreisbr/rsh1aors_n.asm +@@ -191,3 +191,4 @@ L(end): shrd $1, %rbx, %rbp + FUNC_EXIT() + ret + EPILOGUE() ++ASM_END() +diff --git a/mpn/x86_64/coreisbr/sqr_basecase.asm b/mpn/x86_64/coreisbr/sqr_basecase.asm +index 46a3612..1600e25 100644 +--- a/mpn/x86_64/coreisbr/sqr_basecase.asm 
++++ b/mpn/x86_64/coreisbr/sqr_basecase.asm +@@ -482,3 +482,4 @@ L(dend):add %r8, %r10 + FUNC_EXIT() + ret + EPILOGUE() ++ASM_END() +diff --git a/mpn/x86_64/div_qr_1n_pi1.asm b/mpn/x86_64/div_qr_1n_pi1.asm +index b3d45e2..9fd2633 100644 +--- a/mpn/x86_64/div_qr_1n_pi1.asm ++++ b/mpn/x86_64/div_qr_1n_pi1.asm +@@ -245,3 +245,4 @@ L(q_incr_loop): + lea 8(U1), U1 + jmp L(q_incr_loop) + EPILOGUE() ++ASM_END() +diff --git a/mpn/x86_64/div_qr_2n_pi1.asm b/mpn/x86_64/div_qr_2n_pi1.asm +index 5e59a0a..c189c33 100644 +--- a/mpn/x86_64/div_qr_2n_pi1.asm ++++ b/mpn/x86_64/div_qr_2n_pi1.asm +@@ -156,3 +156,4 @@ L(fix): C Unlikely update. u2 >= d1 + sbb d1, u2 + jmp L(bck) + EPILOGUE() ++ASM_END() +diff --git a/mpn/x86_64/div_qr_2u_pi1.asm b/mpn/x86_64/div_qr_2u_pi1.asm +index 85af96f..f2ac526 100644 +--- a/mpn/x86_64/div_qr_2u_pi1.asm ++++ b/mpn/x86_64/div_qr_2u_pi1.asm +@@ -198,3 +198,4 @@ L(fix_qh): C Unlikely update. u2 >= d1 + sbb d1, u2 + jmp L(bck_qh) + EPILOGUE() ++ASM_END() +diff --git a/mpn/x86_64/dive_1.asm b/mpn/x86_64/dive_1.asm +index 988bdab..1929091 100644 +--- a/mpn/x86_64/dive_1.asm ++++ b/mpn/x86_64/dive_1.asm +@@ -156,3 +156,4 @@ L(one): shr R8(%rcx), %rax + ret + + EPILOGUE() ++ASM_END() +diff --git a/mpn/x86_64/divrem_1.asm b/mpn/x86_64/divrem_1.asm +index d4d61ad..edfd893 100644 +--- a/mpn/x86_64/divrem_1.asm ++++ b/mpn/x86_64/divrem_1.asm +@@ -312,3 +312,4 @@ L(ret): pop %rbx + FUNC_EXIT() + ret + EPILOGUE() ++ASM_END() +diff --git a/mpn/x86_64/divrem_2.asm b/mpn/x86_64/divrem_2.asm +index 20811cc..e10f328 100644 +--- a/mpn/x86_64/divrem_2.asm ++++ b/mpn/x86_64/divrem_2.asm +@@ -190,3 +190,4 @@ L(fix): seta %dl + sbb %r11, %rbx + jmp L(bck) + EPILOGUE() ++ASM_END() +diff --git a/mpn/x86_64/fastavx/copyd.asm b/mpn/x86_64/fastavx/copyd.asm +index 56d472f..a69a624 100644 +--- a/mpn/x86_64/fastavx/copyd.asm ++++ b/mpn/x86_64/fastavx/copyd.asm +@@ -170,3 +170,4 @@ L(bc): test $4, R8(n) + FUNC_EXIT() + ret + EPILOGUE() ++ASM_END() +diff --git a/mpn/x86_64/fastavx/copyi.asm b/mpn/x86_64/fastavx/copyi.asm +index 7607747..f50aa47 100644 +--- a/mpn/x86_64/fastavx/copyi.asm ++++ b/mpn/x86_64/fastavx/copyi.asm +@@ -167,3 +167,4 @@ L(bc): test $4, R8(n) + FUNC_EXIT() + ret + EPILOGUE() ++ASM_END() +diff --git a/mpn/x86_64/fastsse/com-palignr.asm b/mpn/x86_64/fastsse/com-palignr.asm +index 69027bc..50cd40f 100644 +--- a/mpn/x86_64/fastsse/com-palignr.asm ++++ b/mpn/x86_64/fastsse/com-palignr.asm +@@ -309,3 +309,4 @@ L(end): test $1, R8(n) + 1: FUNC_EXIT() + ret + EPILOGUE() ++ASM_END() +diff --git a/mpn/x86_64/fastsse/com.asm b/mpn/x86_64/fastsse/com.asm +index c867222..aec7d25 100644 +--- a/mpn/x86_64/fastsse/com.asm ++++ b/mpn/x86_64/fastsse/com.asm +@@ -173,3 +173,4 @@ IFDOS(` add $56, %rsp ') + FUNC_EXIT() + ret + EPILOGUE() ++ASM_END() +diff --git a/mpn/x86_64/fastsse/copyd-palignr.asm b/mpn/x86_64/fastsse/copyd-palignr.asm +index fac6f8a..fa1e4a4 100644 +--- a/mpn/x86_64/fastsse/copyd-palignr.asm ++++ b/mpn/x86_64/fastsse/copyd-palignr.asm +@@ -252,3 +252,4 @@ L(end): test $1, R8(n) + 1: FUNC_EXIT() + ret + EPILOGUE() ++ASM_END() +diff --git a/mpn/x86_64/fastsse/copyd.asm b/mpn/x86_64/fastsse/copyd.asm +index b3c4706..ce820c5 100644 +--- a/mpn/x86_64/fastsse/copyd.asm ++++ b/mpn/x86_64/fastsse/copyd.asm +@@ -164,3 +164,4 @@ L(sma): test $8, R8(n) + L(don): FUNC_EXIT() + ret + EPILOGUE() ++ASM_END() +diff --git a/mpn/x86_64/fastsse/copyi-palignr.asm b/mpn/x86_64/fastsse/copyi-palignr.asm +index 9876a47..fb4655f 100644 +--- a/mpn/x86_64/fastsse/copyi-palignr.asm ++++ 
b/mpn/x86_64/fastsse/copyi-palignr.asm +@@ -298,3 +298,4 @@ L(end): test $1, R8(n) + 1: FUNC_EXIT() + ret + EPILOGUE() ++ASM_END() +diff --git a/mpn/x86_64/fastsse/copyi.asm b/mpn/x86_64/fastsse/copyi.asm +index 97f7865..826caad 100644 +--- a/mpn/x86_64/fastsse/copyi.asm ++++ b/mpn/x86_64/fastsse/copyi.asm +@@ -183,3 +183,4 @@ dnl jnc 1b + L(ret): FUNC_EXIT() + ret + EPILOGUE() ++ASM_END() +diff --git a/mpn/x86_64/fastsse/lshift-movdqu2.asm b/mpn/x86_64/fastsse/lshift-movdqu2.asm +index a05e850..217f2cd 100644 +--- a/mpn/x86_64/fastsse/lshift-movdqu2.asm ++++ b/mpn/x86_64/fastsse/lshift-movdqu2.asm +@@ -180,3 +180,4 @@ L(end8):movq (ap), %xmm0 + FUNC_EXIT() + ret + EPILOGUE() ++ASM_END() +diff --git a/mpn/x86_64/fastsse/lshift.asm b/mpn/x86_64/fastsse/lshift.asm +index 6a17b93..79a5554 100644 +--- a/mpn/x86_64/fastsse/lshift.asm ++++ b/mpn/x86_64/fastsse/lshift.asm +@@ -171,3 +171,4 @@ L(end8):movq (ap), %xmm0 + FUNC_EXIT() + ret + EPILOGUE() ++ASM_END() +diff --git a/mpn/x86_64/fastsse/lshiftc-movdqu2.asm b/mpn/x86_64/fastsse/lshiftc-movdqu2.asm +index 8250910..9f14435 100644 +--- a/mpn/x86_64/fastsse/lshiftc-movdqu2.asm ++++ b/mpn/x86_64/fastsse/lshiftc-movdqu2.asm +@@ -191,3 +191,4 @@ L(end8):movq (ap), %xmm0 + FUNC_EXIT() + ret + EPILOGUE() ++ASM_END() +diff --git a/mpn/x86_64/fastsse/lshiftc.asm b/mpn/x86_64/fastsse/lshiftc.asm +index a616075..a6630cb 100644 +--- a/mpn/x86_64/fastsse/lshiftc.asm ++++ b/mpn/x86_64/fastsse/lshiftc.asm +@@ -181,3 +181,4 @@ L(end8):movq (ap), %xmm0 + FUNC_EXIT() + ret + EPILOGUE() ++ASM_END() +diff --git a/mpn/x86_64/fastsse/rshift-movdqu2.asm b/mpn/x86_64/fastsse/rshift-movdqu2.asm +index 1e270b1..15bcc02 100644 +--- a/mpn/x86_64/fastsse/rshift-movdqu2.asm ++++ b/mpn/x86_64/fastsse/rshift-movdqu2.asm +@@ -199,3 +199,4 @@ L(bc): dec R32(n) + FUNC_EXIT() + ret + EPILOGUE() ++ASM_END() +diff --git a/mpn/x86_64/fastsse/sec_tabselect.asm b/mpn/x86_64/fastsse/sec_tabselect.asm +index e7b7feb..f3b76eb 100644 +--- a/mpn/x86_64/fastsse/sec_tabselect.asm ++++ b/mpn/x86_64/fastsse/sec_tabselect.asm +@@ -202,3 +202,4 @@ IFDOS(` add $88, %rsp ') + FUNC_EXIT() + ret + EPILOGUE() ++ASM_END() +diff --git a/mpn/x86_64/fat/fat_entry.asm b/mpn/x86_64/fat/fat_entry.asm +index 5f244ac..2322be8 100644 +--- a/mpn/x86_64/fat/fat_entry.asm ++++ b/mpn/x86_64/fat/fat_entry.asm +@@ -207,3 +207,4 @@ PROLOGUE(__gmpn_cpuid) + FUNC_EXIT() + ret + EPILOGUE() ++ASM_END() +diff --git a/mpn/x86_64/gcd_11.asm b/mpn/x86_64/gcd_11.asm +index f9b3bcc..1e5ac68 100644 +--- a/mpn/x86_64/gcd_11.asm ++++ b/mpn/x86_64/gcd_11.asm +@@ -112,3 +112,4 @@ L(shift_alot): + mov u0, %rdx + jmp L(mid) + EPILOGUE() ++ASM_END() +diff --git a/mpn/x86_64/gcd_22.asm b/mpn/x86_64/gcd_22.asm +index 78f985f..c3b0b89 100644 +--- a/mpn/x86_64/gcd_22.asm ++++ b/mpn/x86_64/gcd_22.asm +@@ -161,3 +161,4 @@ L(end): C mov v0, %rax + FUNC_EXIT() + ret + EPILOGUE() ++ASM_END() +diff --git a/mpn/x86_64/k10/gcd_22.asm b/mpn/x86_64/k10/gcd_22.asm +index f58b4cc..c7fe668 100644 +--- a/mpn/x86_64/k10/gcd_22.asm ++++ b/mpn/x86_64/k10/gcd_22.asm +@@ -140,3 +140,4 @@ L(end): C mov v0, %rax + FUNC_EXIT() + ret + EPILOGUE() ++ASM_END() +diff --git a/mpn/x86_64/k10/hamdist.asm b/mpn/x86_64/k10/hamdist.asm +index f70494a..d885e2d 100644 +--- a/mpn/x86_64/k10/hamdist.asm ++++ b/mpn/x86_64/k10/hamdist.asm +@@ -107,3 +107,4 @@ L(top): mov (ap,n,8), %r8 + FUNC_EXIT() + ret + EPILOGUE() ++ASM_END() +diff --git a/mpn/x86_64/k10/popcount.asm b/mpn/x86_64/k10/popcount.asm +index 3814aea..45bcba5 100644 +--- a/mpn/x86_64/k10/popcount.asm ++++ 
b/mpn/x86_64/k10/popcount.asm +@@ -79,7 +79,7 @@ C neg R32(%rcx) + + lea L(top)(%rip), %rdx + lea (%rdx,%rcx,2), %rdx +- jmp *%rdx ++ X86_NOTRACK jmp *%rdx + ',` + lea (up,n,8), up + +@@ -101,7 +101,7 @@ C lea (%rcx,%rcx,4), %rcx C 10x + + lea L(top)(%rip), %rdx + add %rcx, %rdx +- jmp *%rdx ++ X86_NOTRACK jmp *%rdx + ') + + ALIGN(32) +@@ -136,3 +136,4 @@ C 1 = n mod 8 + FUNC_EXIT() + ret + EPILOGUE() ++ASM_END() +diff --git a/mpn/x86_64/k8/addmul_2.asm b/mpn/x86_64/k8/addmul_2.asm +index 78bcba1..38caa4d 100644 +--- a/mpn/x86_64/k8/addmul_2.asm ++++ b/mpn/x86_64/k8/addmul_2.asm +@@ -193,3 +193,4 @@ L(end): xor R32(w1), R32(w1) + FUNC_EXIT() + ret + EPILOGUE() ++ASM_END() +diff --git a/mpn/x86_64/k8/aorrlsh_n.asm b/mpn/x86_64/k8/aorrlsh_n.asm +index ff3a184..3ab7050 100644 +--- a/mpn/x86_64/k8/aorrlsh_n.asm ++++ b/mpn/x86_64/k8/aorrlsh_n.asm +@@ -215,3 +215,4 @@ L(cj1): mov %r9, 8(rp,n,8) + FUNC_EXIT() + ret + EPILOGUE() ++ASM_END() +diff --git a/mpn/x86_64/k8/bdiv_q_1.asm b/mpn/x86_64/k8/bdiv_q_1.asm +index 1172b0d..606d54f 100644 +--- a/mpn/x86_64/k8/bdiv_q_1.asm ++++ b/mpn/x86_64/k8/bdiv_q_1.asm +@@ -177,3 +177,4 @@ L(one): shr R8(%rcx), %rax + FUNC_EXIT() + ret + EPILOGUE() ++ASM_END() +diff --git a/mpn/x86_64/k8/div_qr_1n_pi1.asm b/mpn/x86_64/k8/div_qr_1n_pi1.asm +index 86de08c..e91b809 100644 +--- a/mpn/x86_64/k8/div_qr_1n_pi1.asm ++++ b/mpn/x86_64/k8/div_qr_1n_pi1.asm +@@ -247,3 +247,4 @@ L(q_incr_loop): + lea 8(U1), U1 + jmp L(q_incr_loop) + EPILOGUE() ++ASM_END() +diff --git a/mpn/x86_64/k8/mul_basecase.asm b/mpn/x86_64/k8/mul_basecase.asm +index ca2efb9..9126c2b 100644 +--- a/mpn/x86_64/k8/mul_basecase.asm ++++ b/mpn/x86_64/k8/mul_basecase.asm +@@ -335,8 +335,10 @@ C addmul_2 for remaining vp's + C adjusted value of n that is reloaded on each iteration + + L(addmul_outer_0): ++ X86_ENDBR + add $3, un + lea 0(%rip), outer_addr ++ X86_ENDBR + + mov un, n + mov -24(up,un,8), %rax +@@ -348,6 +350,7 @@ L(addmul_outer_0): + jmp L(addmul_entry_0) + + L(addmul_outer_1): ++ X86_ENDBR + mov un, n + mov (up,un,8), %rax + mul v0 +@@ -358,8 +361,10 @@ L(addmul_outer_1): + jmp L(addmul_entry_1) + + L(addmul_outer_2): ++ X86_ENDBR + add $1, un + lea 0(%rip), outer_addr ++ X86_ENDBR + + mov un, n + mov -8(up,un,8), %rax +@@ -372,8 +377,10 @@ L(addmul_outer_2): + jmp L(addmul_entry_2) + + L(addmul_outer_3): ++ X86_ENDBR + add $2, un + lea 0(%rip), outer_addr ++ X86_ENDBR + + mov un, n + mov -16(up,un,8), %rax +@@ -467,3 +474,4 @@ L(ret): pop %r15 + ret + + EPILOGUE() ++ASM_END() +diff --git a/mpn/x86_64/k8/mullo_basecase.asm b/mpn/x86_64/k8/mullo_basecase.asm +index fa00f42..4a931a5 100644 +--- a/mpn/x86_64/k8/mullo_basecase.asm ++++ b/mpn/x86_64/k8/mullo_basecase.asm +@@ -99,12 +99,14 @@ dnl JMPENT( L(2m4), L(tab)) C 10 + dnl JMPENT( L(3m4), L(tab)) C 11 + TEXT + +-L(1): imul %r8, %rax ++L(1): X86_ENDBR ++ imul %r8, %rax + mov %rax, (rp) + FUNC_EXIT() + ret + +-L(2): mov 8(vp_param), %r11 ++L(2): X86_ENDBR ++ mov 8(vp_param), %r11 + imul %rax, %r11 C u0 x v1 + mul %r8 C u0 x v0 + mov %rax, (rp) +@@ -115,7 +117,8 @@ L(2): mov 8(vp_param), %r11 + FUNC_EXIT() + ret + +-L(3): mov 8(vp_param), %r9 C v1 ++L(3): X86_ENDBR ++ mov 8(vp_param), %r9 C v1 + mov 16(vp_param), %r11 + mul %r8 C u0 x v0 -> + mov %rax, (rp) C r0 +@@ -335,6 +338,7 @@ L(mul_2_entry_1): + + + L(addmul_outer_1): ++ X86_ENDBR + lea -2(n), j + mov -16(up,n,8), %rax + mul v0 +@@ -346,6 +350,7 @@ L(addmul_outer_1): + jmp L(addmul_entry_1) + + L(addmul_outer_3): ++ X86_ENDBR + lea 0(n), j + mov -16(up,n,8), %rax + xor R32(w3), R32(w3) 
+@@ -434,3 +439,4 @@ L(ret): pop %r15 + FUNC_EXIT() + ret + EPILOGUE() ++ASM_END() +diff --git a/mpn/x86_64/k8/mulmid_basecase.asm b/mpn/x86_64/k8/mulmid_basecase.asm +index 86f1414..7d5f158 100644 +--- a/mpn/x86_64/k8/mulmid_basecase.asm ++++ b/mpn/x86_64/k8/mulmid_basecase.asm +@@ -329,6 +329,7 @@ C addmul_2 for remaining vp's + + ALIGN(16) + L(addmul_prologue_0): ++ X86_ENDBR + mov -8(up,n,8), %rax + mul v1 + mov %rax, w1 +@@ -338,6 +339,7 @@ L(addmul_prologue_0): + + ALIGN(16) + L(addmul_prologue_1): ++ X86_ENDBR + mov 16(up,n,8), %rax + mul v1 + mov %rax, w0 +@@ -348,6 +350,7 @@ L(addmul_prologue_1): + + ALIGN(16) + L(addmul_prologue_2): ++ X86_ENDBR + mov 8(up,n,8), %rax + mul v1 + mov %rax, w3 +@@ -357,6 +360,7 @@ L(addmul_prologue_2): + + ALIGN(16) + L(addmul_prologue_3): ++ X86_ENDBR + mov (up,n,8), %rax + mul v1 + mov %rax, w2 +@@ -471,6 +475,7 @@ L(diag_prologue_0): + mov vp, vp_inner + mov vn, n + lea 0(%rip), outer_addr ++ X86_ENDBR + mov -8(up,n,8), %rax + jmp L(diag_entry_0) + +@@ -480,6 +485,7 @@ L(diag_prologue_1): + add $3, vn + mov vn, n + lea 0(%rip), outer_addr ++ X86_ENDBR + mov -8(vp_inner), %rax + jmp L(diag_entry_1) + +@@ -489,6 +495,7 @@ L(diag_prologue_2): + add $2, vn + mov vn, n + lea 0(%rip), outer_addr ++ X86_ENDBR + mov 16(vp_inner), %rax + jmp L(diag_entry_2) + +@@ -507,6 +514,7 @@ L(diag_entry_0): + adc %rdx, w1 + adc $0, w2 + L(diag_entry_3): ++ X86_ENDBR + mov -16(up,n,8), %rax + mulq 8(vp_inner) + add %rax, w0 +@@ -557,3 +565,4 @@ L(ret): pop %r15 + FUNC_EXIT() + ret + EPILOGUE() ++ASM_END() +diff --git a/mpn/x86_64/k8/redc_1.asm b/mpn/x86_64/k8/redc_1.asm +index 9327b21..3e241af 100644 +--- a/mpn/x86_64/k8/redc_1.asm ++++ b/mpn/x86_64/k8/redc_1.asm +@@ -125,7 +125,8 @@ L(tab): JMPENT( L(0), L(tab)) + TEXT + + ALIGN(16) +-L(1): mov (mp_param), %rax ++L(1): X86_ENDBR ++ mov (mp_param), %rax + mul q0 + add 8(up), %rax + adc 16(up), %rdx +@@ -136,7 +137,8 @@ L(1): mov (mp_param), %rax + + + ALIGN(16) +-L(2): mov (mp_param), %rax ++L(2): X86_ENDBR ++ mov (mp_param), %rax + mul q0 + xor R32(%r14), R32(%r14) + mov %rax, %r10 +@@ -171,7 +173,8 @@ L(2): mov (mp_param), %rax + jmp L(ret) + + +-L(3): mov (mp_param), %rax ++L(3): X86_ENDBR ++ mov (mp_param), %rax + mul q0 + mov %rax, %rbx + mov %rdx, %r10 +@@ -248,7 +251,7 @@ L(3): mov (mp_param), %rax + + + ALIGN(16) +-L(2m4): ++L(2m4): X86_ENDBR + L(lo2): mov (mp,nneg,8), %rax + mul q0 + xor R32(%r14), R32(%r14) +@@ -324,7 +327,7 @@ L(le2): add %r10, (up) + + + ALIGN(16) +-L(1m4): ++L(1m4): X86_ENDBR + L(lo1): mov (mp,nneg,8), %rax + xor %r9, %r9 + xor R32(%rbx), R32(%rbx) +@@ -398,7 +401,7 @@ L(le1): add %r10, (up) + + ALIGN(16) + L(0): +-L(0m4): ++L(0m4): X86_ENDBR + L(lo0): mov (mp,nneg,8), %rax + mov nneg, i + mul q0 +@@ -463,7 +466,7 @@ L(le0): add %r10, (up) + + + ALIGN(16) +-L(3m4): ++L(3m4): X86_ENDBR + L(lo3): mov (mp,nneg,8), %rax + mul q0 + mov %rax, %rbx +@@ -589,3 +592,4 @@ L(ret): pop %r15 + FUNC_EXIT() + ret + EPILOGUE() ++ASM_END() +diff --git a/mpn/x86_64/k8/sqr_basecase.asm b/mpn/x86_64/k8/sqr_basecase.asm +index 60cf945..37858b4 100644 +--- a/mpn/x86_64/k8/sqr_basecase.asm ++++ b/mpn/x86_64/k8/sqr_basecase.asm +@@ -131,7 +131,8 @@ L(tab): JMPENT( L(4), L(tab)) + JMPENT( L(3m4), L(tab)) + TEXT + +-L(1): mov (up), %rax ++L(1): X86_ENDBR ++ mov (up), %rax + mul %rax + add $40, %rsp + mov %rax, (rp) +@@ -139,7 +140,8 @@ L(1): mov (up), %rax + FUNC_EXIT() + ret + +-L(2): mov (up), %rax ++L(2): X86_ENDBR ++ mov (up), %rax + mov %rax, %r8 + mul %rax + mov 8(up), %r11 +@@ -165,7 +167,8 @@ L(2): mov 
(up), %rax + FUNC_EXIT() + ret + +-L(3): mov (up), %rax ++L(3): X86_ENDBR ++ mov (up), %rax + mov %rax, %r10 + mul %rax + mov 8(up), %r11 +@@ -210,7 +213,8 @@ L(3): mov (up), %rax + FUNC_EXIT() + ret + +-L(4): mov (up), %rax ++L(4): X86_ENDBR ++ mov (up), %rax + mov %rax, %r11 + mul %rax + mov 8(up), %rbx +@@ -282,6 +286,7 @@ L(4): mov (up), %rax + + + L(0m4): ++ X86_ENDBR + lea -16(rp,n,8), tp C point tp in middle of result operand + mov (up), v0 + mov 8(up), %rax +@@ -340,6 +345,7 @@ L(L3): xor R32(w1), R32(w1) + + + L(1m4): ++ X86_ENDBR + lea 8(rp,n,8), tp C point tp in middle of result operand + mov (up), v0 C u0 + mov 8(up), %rax C u1 +@@ -418,6 +424,7 @@ L(m2x): mov (up,j,8), %rax + + + L(2m4): ++ X86_ENDBR + lea -16(rp,n,8), tp C point tp in middle of result operand + mov (up), v0 + mov 8(up), %rax +@@ -474,7 +481,7 @@ L(L1): xor R32(w0), R32(w0) + jmp L(dowhile_mid) + + +-L(3m4): ++L(3m4): X86_ENDBR + lea 8(rp,n,8), tp C point tp in middle of result operand + mov (up), v0 C u0 + mov 8(up), %rax C u1 +@@ -805,3 +812,4 @@ L(d1): mov %r11, 24(rp,j,8) + FUNC_EXIT() + ret + EPILOGUE() ++ASM_END() +diff --git a/mpn/x86_64/logops_n.asm b/mpn/x86_64/logops_n.asm +index e25854d..b3969ba 100644 +--- a/mpn/x86_64/logops_n.asm ++++ b/mpn/x86_64/logops_n.asm +@@ -258,3 +258,4 @@ L(ret): FUNC_EXIT() + ret + EPILOGUE() + ') ++ASM_END() +diff --git a/mpn/x86_64/lshift.asm b/mpn/x86_64/lshift.asm +index fff3152..4187bdc 100644 +--- a/mpn/x86_64/lshift.asm ++++ b/mpn/x86_64/lshift.asm +@@ -170,3 +170,4 @@ L(ast): mov (up), %r10 + FUNC_EXIT() + ret + EPILOGUE() ++ASM_END() +diff --git a/mpn/x86_64/lshiftc.asm b/mpn/x86_64/lshiftc.asm +index c4ba04a..f6fe4c9 100644 +--- a/mpn/x86_64/lshiftc.asm ++++ b/mpn/x86_64/lshiftc.asm +@@ -180,3 +180,4 @@ L(ast): mov (up), %r10 + FUNC_EXIT() + ret + EPILOGUE() ++ASM_END() +diff --git a/mpn/x86_64/lshsub_n.asm b/mpn/x86_64/lshsub_n.asm +index 4d428c0..62877d7 100644 +--- a/mpn/x86_64/lshsub_n.asm ++++ b/mpn/x86_64/lshsub_n.asm +@@ -170,3 +170,4 @@ L(end): + FUNC_EXIT() + ret + EPILOGUE() ++ASM_END() +diff --git a/mpn/x86_64/missing.asm b/mpn/x86_64/missing.asm +index 9b65c89..22dac17 100644 +--- a/mpn/x86_64/missing.asm ++++ b/mpn/x86_64/missing.asm +@@ -128,3 +128,4 @@ PROLOGUE(__gmp_adcx) + ret + EPILOGUE() + PROTECT(__gmp_adcx) ++ASM_END() +diff --git a/mpn/x86_64/mod_1_2.asm b/mpn/x86_64/mod_1_2.asm +index 40fcaeb..fbaae3b 100644 +--- a/mpn/x86_64/mod_1_2.asm ++++ b/mpn/x86_64/mod_1_2.asm +@@ -239,3 +239,4 @@ ifdef(`SHLD_SLOW',` + FUNC_EXIT() + ret + EPILOGUE() ++ASM_END() +diff --git a/mpn/x86_64/mod_1_4.asm b/mpn/x86_64/mod_1_4.asm +index 6cf304c..8969e42 100644 +--- a/mpn/x86_64/mod_1_4.asm ++++ b/mpn/x86_64/mod_1_4.asm +@@ -270,3 +270,4 @@ ifdef(`SHLD_SLOW',` + FUNC_EXIT() + ret + EPILOGUE() ++ASM_END() +diff --git a/mpn/x86_64/mod_34lsub1.asm b/mpn/x86_64/mod_34lsub1.asm +index 75421a6..70282b6 100644 +--- a/mpn/x86_64/mod_34lsub1.asm ++++ b/mpn/x86_64/mod_34lsub1.asm +@@ -145,46 +145,55 @@ L(tab): JMPENT( L(0), L(tab)) + JMPENT( L(8), L(tab)) + TEXT + +-L(6): add (ap), %rax ++L(6): X86_ENDBR ++ add (ap), %rax + adc 8(ap), %rcx + adc 16(ap), %rdx + adc $0, %r9 + add $24, ap +-L(3): add (ap), %rax ++L(3): X86_ENDBR ++ add (ap), %rax + adc 8(ap), %rcx + adc 16(ap), %rdx + jmp L(cj1) + +-L(7): add (ap), %rax ++L(7): X86_ENDBR ++ add (ap), %rax + adc 8(ap), %rcx + adc 16(ap), %rdx + adc $0, %r9 + add $24, ap +-L(4): add (ap), %rax ++L(4): X86_ENDBR ++ add (ap), %rax + adc 8(ap), %rcx + adc 16(ap), %rdx + adc $0, %r9 + add $24, ap +-L(1): add (ap), %rax 
++L(1): X86_ENDBR ++ add (ap), %rax + adc $0, %rcx + jmp L(cj2) + +-L(8): add (ap), %rax ++L(8): X86_ENDBR ++ add (ap), %rax + adc 8(ap), %rcx + adc 16(ap), %rdx + adc $0, %r9 + add $24, ap +-L(5): add (ap), %rax ++L(5): X86_ENDBR ++ add (ap), %rax + adc 8(ap), %rcx + adc 16(ap), %rdx + adc $0, %r9 + add $24, ap +-L(2): add (ap), %rax ++L(2): X86_ENDBR ++ add (ap), %rax + adc 8(ap), %rcx + + L(cj2): adc $0, %rdx + L(cj1): adc $0, %r9 +-L(0): add %r9, %rax ++L(0): X86_ENDBR ++ add %r9, %rax + adc $0, %rcx + adc $0, %rdx + adc $0, %rax +@@ -213,3 +222,4 @@ L(0): add %r9, %rax + FUNC_EXIT() + ret + EPILOGUE() ++ASM_END() +diff --git a/mpn/x86_64/mode1o.asm b/mpn/x86_64/mode1o.asm +index 2cd2b08..3377435 100644 +--- a/mpn/x86_64/mode1o.asm ++++ b/mpn/x86_64/mode1o.asm +@@ -169,3 +169,4 @@ L(one): + + EPILOGUE(mpn_modexact_1c_odd) + EPILOGUE(mpn_modexact_1_odd) ++ASM_END() +diff --git a/mpn/x86_64/mul_1.asm b/mpn/x86_64/mul_1.asm +index e1ba89b..44764dd 100644 +--- a/mpn/x86_64/mul_1.asm ++++ b/mpn/x86_64/mul_1.asm +@@ -190,3 +190,4 @@ IFDOS(``pop %rdi '') + IFDOS(``pop %rsi '') + ret + EPILOGUE() ++ASM_END() +diff --git a/mpn/x86_64/mul_2.asm b/mpn/x86_64/mul_2.asm +index d64313b..b6c6bf1 100644 +--- a/mpn/x86_64/mul_2.asm ++++ b/mpn/x86_64/mul_2.asm +@@ -202,3 +202,4 @@ L(m22): mul v1 + FUNC_EXIT() + ret + EPILOGUE() ++ASM_END() +diff --git a/mpn/x86_64/nano/dive_1.asm b/mpn/x86_64/nano/dive_1.asm +index e9a0763..aead4d5 100644 +--- a/mpn/x86_64/nano/dive_1.asm ++++ b/mpn/x86_64/nano/dive_1.asm +@@ -164,3 +164,4 @@ L(one): shr R8(%rcx), %rax + FUNC_EXIT() + ret + EPILOGUE() ++ASM_END() +diff --git a/mpn/x86_64/pentium4/aors_n.asm b/mpn/x86_64/pentium4/aors_n.asm +index 8e6ee1b..3751e38 100644 +--- a/mpn/x86_64/pentium4/aors_n.asm ++++ b/mpn/x86_64/pentium4/aors_n.asm +@@ -194,3 +194,4 @@ L(ret): mov R32(%rbx), R32(%rax) + FUNC_EXIT() + ret + EPILOGUE() ++ASM_END() +diff --git a/mpn/x86_64/pentium4/mod_34lsub1.asm b/mpn/x86_64/pentium4/mod_34lsub1.asm +index f34b3f0..bf83f62 100644 +--- a/mpn/x86_64/pentium4/mod_34lsub1.asm ++++ b/mpn/x86_64/pentium4/mod_34lsub1.asm +@@ -165,3 +165,4 @@ L(combine): + FUNC_EXIT() + ret + EPILOGUE() ++ASM_END() +diff --git a/mpn/x86_64/pentium4/rsh1aors_n.asm b/mpn/x86_64/pentium4/rsh1aors_n.asm +index 5528ce4..219a809 100644 +--- a/mpn/x86_64/pentium4/rsh1aors_n.asm ++++ b/mpn/x86_64/pentium4/rsh1aors_n.asm +@@ -332,3 +332,4 @@ L(cj1): or %r14, %rbx + L(c3): mov $1, R8(%rax) + jmp L(rc3) + EPILOGUE() ++ASM_END() +diff --git a/mpn/x86_64/pentium4/rshift.asm b/mpn/x86_64/pentium4/rshift.asm +index b7c1ee2..848045f 100644 +--- a/mpn/x86_64/pentium4/rshift.asm ++++ b/mpn/x86_64/pentium4/rshift.asm +@@ -167,3 +167,4 @@ L(ast): movq (up), %mm2 + FUNC_EXIT() + ret + EPILOGUE() ++ASM_END() +diff --git a/mpn/x86_64/popham.asm b/mpn/x86_64/popham.asm +index 3a29b2e..b7ceb17 100644 +--- a/mpn/x86_64/popham.asm ++++ b/mpn/x86_64/popham.asm +@@ -161,3 +161,4 @@ L(end): + FUNC_EXIT() + ret + EPILOGUE() ++ASM_END() +diff --git a/mpn/x86_64/rsh1aors_n.asm b/mpn/x86_64/rsh1aors_n.asm +index a3e9cc5..797e250 100644 +--- a/mpn/x86_64/rsh1aors_n.asm ++++ b/mpn/x86_64/rsh1aors_n.asm +@@ -187,3 +187,4 @@ L(end): mov %rbx, (rp) + FUNC_EXIT() + ret + EPILOGUE() ++ASM_END() +diff --git a/mpn/x86_64/rshift.asm b/mpn/x86_64/rshift.asm +index 3f344f1..0fc5877 100644 +--- a/mpn/x86_64/rshift.asm ++++ b/mpn/x86_64/rshift.asm +@@ -174,3 +174,4 @@ L(ast): mov (up), %r10 + FUNC_EXIT() + ret + EPILOGUE() ++ASM_END() +diff --git a/mpn/x86_64/sec_tabselect.asm b/mpn/x86_64/sec_tabselect.asm 
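The additions in this patch follow one pattern: every label reachable through a computed jump (the JMPENT tables and `jmp *(...)` dispatches) gains an X86_ENDBR, and each file ends with ASM_END(), which presumably emits the GNU property note advertising CET support. As a hedged C-level illustration only (not part of the patch), this mirrors what a compiler does on its own for code built with -fcf-protection=branch: indirect calls or jumps may only land on an endbr64 marker, so every indirect-branch target must start with one.

/* Illustrative sketch, assuming gcc/clang with -fcf-protection=branch on
   x86-64: the compiler places endbr64 at the entry of op_add and op_mul
   because they are reached through a function-pointer table, analogous to
   the hand-written X86_ENDBR at the L(...) jump-table targets above. */
#include <stdio.h>

static void op_add (int a, int b) { printf ("%d\n", a + b); }
static void op_mul (int a, int b) { printf ("%d\n", a * b); }

int
main (void)
{
  /* table of indirect-branch targets, analogous to L(tab)/JMPENT */
  void (*tab[2]) (int, int) = { op_add, op_mul };
  tab[0] (2, 3);   /* prints 5 */
  tab[1] (2, 3);   /* prints 6 */
  return 0;
}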
+index e8aed26..5dce3c1 100644 +--- a/mpn/x86_64/sec_tabselect.asm ++++ b/mpn/x86_64/sec_tabselect.asm +@@ -174,3 +174,4 @@ L(b00): pop %r15 + FUNC_EXIT() + ret + EPILOGUE() ++ASM_END()
+diff --git a/mpn/x86_64/sqr_diag_addlsh1.asm b/mpn/x86_64/sqr_diag_addlsh1.asm +index f486125..a1d8767 100644 +--- a/mpn/x86_64/sqr_diag_addlsh1.asm ++++ b/mpn/x86_64/sqr_diag_addlsh1.asm +@@ -114,3 +114,4 @@ L(end): add %r10, %r8 + FUNC_EXIT() + ret + EPILOGUE() ++ASM_END()
+diff --git a/mpn/x86_64/sublsh1_n.asm b/mpn/x86_64/sublsh1_n.asm +index c6d829f..c18f32a 100644 +--- a/mpn/x86_64/sublsh1_n.asm ++++ b/mpn/x86_64/sublsh1_n.asm +@@ -158,3 +158,4 @@ L(end): add R32(%rbp), R32(%rax) + FUNC_EXIT() + ret + EPILOGUE() ++ASM_END()
+diff --git a/mpn/x86_64/x86_64-defs.m4 b/mpn/x86_64/x86_64-defs.m4 +index 64e3729..2a1c1b0 100644 +--- a/mpn/x86_64/x86_64-defs.m4 ++++ b/mpn/x86_64/x86_64-defs.m4 +@@ -94,9 +94,9 @@ m4_assert_numargs(1) + ` GLOBL $1 + TYPE($1,`function') + $1: ++ X86_ENDBR + ') + +- + dnl Usage: ASSERT([cond][,instructions]) + dnl + dnl If WANT_ASSERT is 1, output the given instructions and expect the given +@@ -149,6 +149,10 @@ ifdef(`PIC', + `lea $1(%rip), $2') + ') + ++dnl ASM_END ++ ++define(`ASM_END', `X86_GNU_PROPERTY') ++ + + define(`DEF_OBJECT', + m4_assert_numargs_range(2,3)
+diff --git a/mpn/x86_64/zen/aorrlsh_n.asm b/mpn/x86_64/zen/aorrlsh_n.asm +index e049b2f..6e6783f 100644 +--- a/mpn/x86_64/zen/aorrlsh_n.asm ++++ b/mpn/x86_64/zen/aorrlsh_n.asm +@@ -102,26 +102,30 @@ ifdef(`PIC',` + jmp *(%r11,%rax,8) + ') + +-L(0): lea 32(up), up ++L(0): X86_ENDBR ++ lea 32(up), up + lea 32(vp), vp + lea 32(rp), rp + xor R32(%r11), R32(%r11) + jmp L(e0) + +-L(7): mov %r10, %r11 ++L(7): X86_ENDBR ++ mov %r10, %r11 + lea 24(up), up + lea 24(vp), vp + lea 24(rp), rp + xor R32(%r10), R32(%r10) + jmp L(e7) + +-L(6): lea 16(up), up ++L(6): X86_ENDBR ++ lea 16(up), up + lea 16(vp), vp + lea 16(rp), rp + xor R32(%r11), R32(%r11) + jmp L(e6) + +-L(5): mov %r10, %r11 ++L(5): X86_ENDBR ++ mov %r10, %r11 + lea 8(up), up + lea 8(vp), vp + lea 8(rp), rp +@@ -191,23 +195,27 @@ L(e1): shlx( cnt, %r11, %rax) + lea (%r10,%rax), %rax + jmp L(top) + +-L(4): xor R32(%r11), R32(%r11) ++L(4): X86_ENDBR ++ xor R32(%r11), R32(%r11) + jmp L(e4) + +-L(3): mov %r10, %r11 ++L(3): X86_ENDBR ++ mov %r10, %r11 + lea -8(up), up + lea -8(vp), vp + lea -8(rp), rp + xor R32(%r10), R32(%r10) + jmp L(e3) + +-L(2): lea -16(up), up ++L(2): X86_ENDBR ++ lea -16(up), up + lea -16(vp), vp + lea -16(rp), rp + xor R32(%r11), R32(%r11) + jmp L(e2) + +-L(1): mov %r10, %r11 ++L(1): X86_ENDBR ++ mov %r10, %r11 + lea -24(up), up + lea 40(vp), vp + lea 40(rp), rp +@@ -224,3 +232,4 @@ L(tab): JMPENT( L(0), L(tab)) + JMPENT( L(5), L(tab)) + JMPENT( L(6), L(tab)) + JMPENT( L(7), L(tab)) ++ASM_END()
+diff --git a/mpn/x86_64/zen/mul_basecase.asm b/mpn/x86_64/zen/mul_basecase.asm +index affa3b6..c70d548 100644 +--- a/mpn/x86_64/zen/mul_basecase.asm ++++ b/mpn/x86_64/zen/mul_basecase.asm +@@ -453,3 +453,4 @@ L(wd3): adc %r11, 8(rp) + jne L(3) + jmp L(end) + EPILOGUE() ++ASM_END()
+diff --git a/mpn/x86_64/zen/mullo_basecase.asm b/mpn/x86_64/zen/mullo_basecase.asm +index 2ae729a..c081698 100644 +--- a/mpn/x86_64/zen/mullo_basecase.asm ++++ b/mpn/x86_64/zen/mullo_basecase.asm +@@ -297,3 +297,4 @@ L(lo0): .byte 0xc4,0xe2,0xe3,0xf6,0x44,0xce,0x18 C mulx 24(up,n,8), %rbx, %rax + inc %r14 + jmp L(outer) + EPILOGUE() ++ASM_END()
+diff --git a/mpn/x86_64/zen/sbpi1_bdiv_r.asm b/mpn/x86_64/zen/sbpi1_bdiv_r.asm +index f6e8f9c..277b3c3 100644 +---
a/mpn/x86_64/zen/sbpi1_bdiv_r.asm ++++ b/mpn/x86_64/zen/sbpi1_bdiv_r.asm +@@ -505,3 +505,4 @@ L(ret): mov %rbp, %rax + pop %r15 + ret + EPILOGUE() ++ASM_END() +diff --git a/mpn/x86_64/zen/sqr_basecase.asm b/mpn/x86_64/zen/sqr_basecase.asm +index a7c6127..d185deb 100644 +--- a/mpn/x86_64/zen/sqr_basecase.asm ++++ b/mpn/x86_64/zen/sqr_basecase.asm +@@ -480,3 +480,4 @@ C pop %r14 + FUNC_EXIT() + ret + EPILOGUE() ++ASM_END() +-- +2.32.0 + diff --git a/SOURCES/gmp-mparam.h b/SOURCES/gmp-mparam.h new file mode 100644 index 0000000..1d4e087 --- /dev/null +++ b/SOURCES/gmp-mparam.h @@ -0,0 +1,88 @@ +/* Generic x86 gmp-mparam.h -- Compiler/machine parameter header file. + +Copyright 1991, 1993, 1994, 1995, 1996, 1997, 1999, 2000, 2001, 2002, 2003, +2004, 2005, 2006, 2007, 2008, 2009 Free Software Foundation, Inc. + +This file is part of the GNU MP Library. + +The GNU MP Library is free software; you can redistribute it and/or modify +it under the terms of the GNU Lesser General Public License as published by +the Free Software Foundation; either version 3 of the License, or (at your +option) any later version. + +The GNU MP Library is distributed in the hope that it will be useful, but +WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY +or FITNESS FOR A PARTICULAR PURPOSE. See the GNU Lesser General Public +License for more details. + +You should have received a copy of the GNU Lesser General Public License +along with the GNU MP Library. If not, see http://www.gnu.org/licenses/. */ + +/* + * This gmp-mparam.h is a wrapper include file for the original gmp-mparam.h, + * which has been renamed to gmp-mparam-.h. There are conflicts for the + * original gmp-mparam.h on multilib systems, which result from arch-specific + * configuration options. Please do not use the arch-specific file directly. + * + * Copyright (C) 2006 Red Hat, Inc. + * Thomas Woerner + */ + +#ifdef gmp_mparam_wrapper_h +#error "gmp_mparam_wrapper_h should not be defined!" +#endif +#define gmp_mparam_wrapper_h + +#if defined(__arm__) +#include "gmp-mparam-arm.h" +#elif defined(__i386__) +#include "gmp-mparam-i386.h" +#elif defined(__ia64__) +#include "gmp-mparam-ia64.h" +#elif defined(__powerpc64__) +# if __BYTE_ORDER__ == __ORDER_BIG_ENDIAN__ +#include "gmp-mparam-ppc64.h" +# else +#include "gmp-mparam-ppc64le.h" +# endif +#elif defined(__powerpc__) +# if __BYTE_ORDER__ == __ORDER_BIG_ENDIAN__ +#include "gmp-mparam-ppc.h" +# else +#include "gmp-mparam-ppcle.h" +# endif +#elif defined(__s390x__) +#include "gmp-mparam-s390x.h" +#elif defined(__s390__) +#include "gmp-mparam-s390.h" +#elif defined(__x86_64__) +#include "gmp-mparam-x86_64.h" +#elif defined(__alpha__) +#include "gmp-mparam-alpha.h" +#elif defined(__sh__) +#include "gmp-mparam-sh.h" +#elif defined(__sparc__) && defined (__arch64__) +#include "gmp-mparam-sparc64.h" +#elif defined(__sparc__) +#include "gmp-mparam-sparc.h" +#elif defined(__aarch64__) +#include "gmp-mparam-aarch64.h" +#elif defined(__mips64) && defined(__MIPSEL__) +#include "gmp-mparam-mips64el.h" +#elif defined(__mips64) +#include "gmp-mparam-mips64.h" +#elif defined(__mips) && defined(__MIPSEL__) +#include "gmp-mparam-mipsel.h" +#elif defined(__mips) +#include "gmp-mparam-mips.h" +#elif defined(__riscv) +#if __riscv_xlen == 64 +#include "gmp-mparam-riscv64.h" +#else +#error "No support for riscv32" +#endif +#else +#error "The gmp-devel package is not usable with the architecture." 
+#endif + +#undef gmp_mparam_wrapper_h diff --git a/SOURCES/gmp.h b/SOURCES/gmp.h new file mode 100644 index 0000000..0a91606 --- /dev/null +++ b/SOURCES/gmp.h @@ -0,0 +1,88 @@ +/* Definitions for GNU multiple precision functions. -*- mode: c -*- + +Copyright 1991, 1993, 1994, 1995, 1996, 1997, 1999, 2000, 2001, 2002, 2003, +2004, 2005, 2006, 2007, 2008, 2009 Free Software Foundation, Inc. + +This file is part of the GNU MP Library. + +The GNU MP Library is free software; you can redistribute it and/or modify +it under the terms of the GNU Lesser General Public License as published by +the Free Software Foundation; either version 3 of the License, or (at your +option) any later version. + +The GNU MP Library is distributed in the hope that it will be useful, but +WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY +or FITNESS FOR A PARTICULAR PURPOSE. See the GNU Lesser General Public +License for more details. + +You should have received a copy of the GNU Lesser General Public License +along with the GNU MP Library. If not, see http://www.gnu.org/licenses/. */ + +/* + * This gmp.h is a wrapper include file for the original gmp.h, which has been + * renamed to gmp-.h. There are conflicts for the original gmp.h on + * multilib systems, which result from arch-specific configuration options. + * Please do not use the arch-specific file directly. + * + * Copyright (C) 2006 Red Hat, Inc. + * Thomas Woerner + */ + +#ifdef gmp_wrapper_h +#error "gmp_wrapper_h should not be defined!" +#endif +#define gmp_wrapper_h + +#if defined(__arm__) +#include "gmp-arm.h" +#elif defined(__i386__) +#include "gmp-i386.h" +#elif defined(__ia64__) +#include "gmp-ia64.h" +#elif defined(__powerpc64__) +# if __BYTE_ORDER__ == __ORDER_BIG_ENDIAN__ +#include "gmp-ppc64.h" +# else +#include "gmp-ppc64le.h" +# endif +#elif defined(__powerpc__) +# if __BYTE_ORDER__ == __ORDER_BIG_ENDIAN__ +#include "gmp-ppc.h" +# else +#include "gmp-ppcle.h" +# endif +#elif defined(__s390x__) +#include "gmp-s390x.h" +#elif defined(__s390__) +#include "gmp-s390.h" +#elif defined(__x86_64__) +#include "gmp-x86_64.h" +#elif defined(__alpha__) +#include "gmp-alpha.h" +#elif defined(__sh__) +#include "gmp-sh.h" +#elif defined(__sparc__) && defined (__arch64__) +#include "gmp-sparc64.h" +#elif defined(__sparc__) +#include "gmp-sparc.h" +#elif defined(__aarch64__) +#include "gmp-aarch64.h" +#elif defined(__mips64) && defined(__MIPSEL__) +#include "gmp-mips64el.h" +#elif defined(__mips64) +#include "gmp-mips64.h" +#elif defined(__mips) && defined(__MIPSEL__) +#include "gmp-mipsel.h" +#elif defined(__mips) +#include "gmp-mips.h" +#elif defined(__riscv) +#if __riscv_xlen == 64 +#include "gmp-riscv64.h" +#else +#error "No support for riscv32" +#endif +#else +#error "The gmp-devel package is not usable with the architecture." 
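The gmp.h wrapper uses the same multilib dispatch as the gmp-mparam.h wrapper: the arch-specific headers are installed under per-architecture names and the wrapper selects one from compiler-predefined macros. As a hedged usage sketch (not part of the packaged headers), a client program only ever includes the wrapper, so a single gmp-devel layout can serve, for example, both `gcc -m32` and `gcc -m64` builds on x86:

/* Minimal client of the wrapper header; the #if chain above resolves it to
   gmp-i386.h or gmp-x86_64.h (etc.) at compile time. */
#include <stdio.h>
#include <gmp.h>

int
main (void)
{
  mpz_t x;
  mpz_init_set_ui (x, 1);
  mpz_mul_2exp (x, x, 100);      /* x = 2^100 */
  gmp_printf ("%Zd\n", x);
  mpz_clear (x);
  return 0;
}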
+#endif + +#undef gmp_wrapper_h diff --git a/SOURCES/ibm_z13_simd_part1.patch b/SOURCES/ibm_z13_simd_part1.patch new file mode 100644 index 0000000..73f6c83 --- /dev/null +++ b/SOURCES/ibm_z13_simd_part1.patch @@ -0,0 +1,595 @@ +Co-authored-by: Stefan Liebler +--- + mpn/s390_64/z13/addmul_1.c | 358 +++++++++++++++++++++++++++++++++++ + mpn/s390_64/z13/common-vec.h | 175 +++++++++++++++++ + mpn/s390_64/z13/mul_1.c | 31 +++ + 3 files changed, 564 insertions(+) + create mode 100644 mpn/s390_64/z13/addmul_1.c + create mode 100644 mpn/s390_64/z13/common-vec.h + create mode 100644 mpn/s390_64/z13/mul_1.c + +diff --git a/mpn/s390_64/z13/addmul_1.c b/mpn/s390_64/z13/addmul_1.c +new file mode 100644 +index 000000000..022e5edcc +--- /dev/null ++++ b/mpn/s390_64/z13/addmul_1.c +@@ -0,0 +1,358 @@ ++/* Addmul_1 / mul_1 for IBM z13 and later ++ Contributed by Marius Hillenbrand ++ ++Copyright 2021 Free Software Foundation, Inc. ++ ++This file is part of the GNU MP Library. ++ ++The GNU MP Library is free software; you can redistribute it and/or modify ++it under the terms of either: ++ ++ * the GNU Lesser General Public License as published by the Free ++ Software Foundation; either version 3 of the License, or (at your ++ option) any later version. ++ ++or ++ ++ * the GNU General Public License as published by the Free Software ++ Foundation; either version 2 of the License, or (at your option) any ++ later version. ++ ++or both in parallel, as here. ++ ++The GNU MP Library is distributed in the hope that it will be useful, but ++WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY ++or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License ++for more details. ++ ++You should have received copies of the GNU General Public License and the ++GNU Lesser General Public License along with the GNU MP Library. If not, ++see https://www.gnu.org/licenses/. */ ++ ++#include "gmp-impl.h" ++#include "s390_64/z13/common-vec.h" ++ ++#undef FUNCNAME ++ ++#ifdef DO_INLINE ++# ifdef OPERATION_addmul_1 ++# define ADD ++# define FUNCNAME inline_addmul_1 ++# elif defined(OPERATION_mul_1) ++# define FUNCNAME inline_mul_1 ++# endif ++ ++#else ++# ifdef OPERATION_addmul_1 ++# define ADD ++# define FUNCNAME mpn_addmul_1 ++# elif defined(OPERATION_mul_1) ++# define FUNCNAME mpn_mul_1 ++# endif ++#endif ++ ++#ifdef DO_INLINE ++static inline mp_limb_t ++FUNCNAME (mp_ptr rp, mp_srcptr s1p, mp_size_t n, mp_limb_t s2limb) ++ __attribute__ ((always_inline)); ++ ++static inline ++#endif ++mp_limb_t ++FUNCNAME (mp_ptr rp, mp_srcptr s1p, mp_size_t n, mp_limb_t s2limb) ++{ ++ ASSERT (n >= 1); ++ ASSERT (MPN_SAME_OR_INCR_P(rp, s1p, n)); ++ ++ /* Combine 64x64 multiplication into GPR pairs (MLGR) with 128-bit adds in ++ VRs (using each VR as a single 128-bit accumulator). ++ The inner loop is unrolled to four limbs, with two blocks of four ++ multiplications each. Since the MLGR operation operates on even/odd GPR ++ pairs, pin the products appropriately. 
*/ ++ ++ /* products as GPR pairs */ ++ register mp_limb_t p0_high asm("r0"); ++ register mp_limb_t p0_low asm("r1"); ++ ++ register mp_limb_t p1_high asm("r8"); ++ register mp_limb_t p1_low asm("r9"); ++ ++ register mp_limb_t p2_high asm("r6"); ++ register mp_limb_t p2_low asm("r7"); ++ ++ register mp_limb_t p3_high asm("r10"); ++ register mp_limb_t p3_low asm("r11"); ++ ++ /* carry flag for 128-bit add in VR for first carry chain */ ++ vec_t carry_vec0 = { .dw = vec_splat_u64 (0) }; ++ mp_limb_t carry_limb = 0; ++ ++#ifdef ADD ++ /* 2nd carry flag for 2nd carry chain with addmul */ ++ vec_t carry_vec1 = { .dw = vec_splat_u64 (0) }; ++ vec_t sum0; ++ vec_t rp0_addend, rp1_addend; ++ rp0_addend.dw = vec_splat_u64 (0); ++ rp1_addend.dw = vec_splat_u64 (0); ++#endif ++ vec_t sum1; ++ ++ vec_t carry_prod = { .dw = vec_splat_u64 (0) }; ++ ++ /* The scalar multiplications compete with pointer and index increments for ++ * issue ports. Thus, increment the loop index in the middle of the loop so ++ * that the operations for the next iteration's multiplications can be ++ * loaded in time (looks horrible, yet helps performance) and make sure we ++ * use addressing with base reg + index reg + immediate displacement ++ * (so that only the single index needs incrementing, instead of multiple ++ * pointers). */ ++#undef LOOP_ADVANCE ++#undef IDX_OFFSET ++ ++#define LOOP_ADVANCE 4 * sizeof (mp_limb_t) ++#define IDX_OFFSET (LOOP_ADVANCE) ++ register ssize_t idx = 0 - IDX_OFFSET; ++ ++ /* ++ * branch-on-count implicitly hint to the branch prediction as taken, while ++ * compare-and-branch hints as not taken. currently, using branch-on-count ++ * has a performance advantage, but it is not clear that it is generally the ++ * better choice (e.g., branch-on-count requires decrementing the separate ++ * counter). so, allow switching the loop condition to enable either ++ * category of branch instructions: ++ * - idx is less than an upper bound, for compare-and-branch ++ * - iteration counter greater than zero, for branch-on-count ++ */ ++#define BRCTG ++#ifdef BRCTG ++ ssize_t iterations = (size_t)n / 4; ++#else ++ ssize_t const idx_bound = n * sizeof (mp_limb_t) - IDX_OFFSET; ++#endif ++ ++ /* products will be transferred into VRs before adding up. ++ * see main loop below for comments on accumulation scheme. 
*/ ++ vec_t product0, product1, product2; ++ ++ product0.dw = vec_splat_u64 (0); ++ ++ switch ((size_t)n % 4) ++ { ++ case 0: ++ break; ++ ++ case 1: ++ idx = 1 * sizeof (mp_limb_t) - IDX_OFFSET; ++ ++ p3_low = s1p[0]; ++ s390_umul_ppmm (p3_high, p3_low, s2limb); ++ ++#ifdef ADD ++ rp0_addend.dw[1] = rp[0]; ++ product0.dw[1] = p3_low; ++ ++ sum0.sw = vec_add_u128 (product0.sw, rp0_addend.sw); ++ carry_vec1.dw = vec_permi (sum0.dw, sum0.dw, 0); ++ ++ rp[0] = sum0.dw[1]; ++#else ++ rp[0] = p3_low; ++#endif ++ ++ carry_limb = p3_high; ++ break; ++ ++ case 2: ++ p0_low = s1p[0]; ++ p3_low = s1p[1]; ++ idx = 2 * sizeof (mp_limb_t) - IDX_OFFSET; ++ ++ s390_double_umul_ppmm (p0_high, p0_low, p3_high, p3_low, s2limb); ++ ++ carry_prod.dw[0] = p3_low; ++ ++ product0.dw = vec_load_2di_as_pair (p0_high, p0_low); ++ ++ carry_limb = p3_high; ++ ++#ifdef ADD ++ rp0_addend = vec_load_elements_reversed (rp, 0); ++ sum0.sw = vec_add_u128 (carry_prod.sw, rp0_addend.sw); ++ carry_vec0.sw = vec_addc_u128 (carry_prod.sw, rp0_addend.sw); ++ ++ sum1.sw = vec_add_u128 (sum0.sw, product0.sw); ++ carry_vec1.sw = vec_addc_u128 (sum0.sw, product0.sw); ++#else ++ sum1.sw = vec_add_u128 (carry_prod.sw, product0.sw); ++ carry_vec0.sw = vec_addc_u128 (carry_prod.sw, product0.sw); ++#endif ++ ++ vec_store_elements_reversed (rp, 0, sum1); ++ ++ break; ++ ++ case 3: ++ idx = 3 * sizeof (mp_limb_t) - IDX_OFFSET; ++ ++ p0_low = s1p[0]; ++ s390_umul_ppmm (p0_high, p0_low, s2limb); ++ ++#ifdef ADD ++ rp0_addend.dw[1] = rp[0]; ++ product0.dw[1] = p0_low; ++ ++ sum0.sw = vec_add_u128 (product0.sw, rp0_addend.sw); ++ carry_vec1.dw = vec_permi (sum0.dw, sum0.dw, 0); ++ ++ rp[0] = sum0.dw[1]; ++#else ++ rp[0] = p0_low; ++#endif ++ carry_limb = p0_high; ++ ++ p1_low = s1p[1]; ++ p3_low = s1p[2]; ++ ++ s390_double_umul_ppmm (p1_high, p1_low, p3_high, p3_low, s2limb); ++ ++ carry_prod.dw = vec_load_2di_as_pair (p3_low, carry_limb); ++ product1.dw = vec_load_2di_as_pair (p1_high, p1_low); ++ carry_limb = p3_high; ++ ++#ifdef ADD ++ rp0_addend = vec_load_elements_reversed (rp, 8); ++ sum0.sw = vec_add_u128 (carry_prod.sw, rp0_addend.sw); ++ carry_vec0.sw = vec_addc_u128 (carry_prod.sw, rp0_addend.sw); ++ ++ sum1.sw = vec_adde_u128 (sum0.sw, product1.sw, carry_vec1.sw); ++ carry_vec1.sw = vec_addec_u128 (sum0.sw, product1.sw, carry_vec1.sw); ++#else ++ sum1.sw = vec_adde_u128 (carry_prod.sw, product1.sw, carry_vec0.sw); ++ carry_vec0.sw ++ = vec_addec_u128 (carry_prod.sw, product1.sw, carry_vec0.sw); ++#endif ++ vec_store_elements_reversed (rp, 8, sum1); ++ break; ++ } ++ ++#ifdef BRCTG ++ for (; iterations > 0; iterations--) ++ { ++#else ++ while (idx < idx_bound) ++ { ++#endif ++ vec_t overlap_addend0; ++ vec_t overlap_addend1; ++ ++ /* The 64x64->128 MLGR multiplies two factors in GPRs and stores the ++ * result in a GPR pair. One of the factors is taken from the GPR pair ++ * and overwritten. ++ * To reuse factors, it turned out cheaper to load limbs multiple times ++ * than copying GPR contents. Enforce that and the use of addressing by ++ * base + index gpr + immediate displacement via inline asm. 
++ */ ++ ASM_LOADGPR (p0_low, s1p, idx, 0 + IDX_OFFSET); ++ ASM_LOADGPR (p1_low, s1p, idx, 8 + IDX_OFFSET); ++ ASM_LOADGPR (p2_low, s1p, idx, 16 + IDX_OFFSET); ++ ASM_LOADGPR (p3_low, s1p, idx, 24 + IDX_OFFSET); ++ ++ /* ++ * accumulate products as follows (for addmul): ++ * | rp[i+3] | rp[i+2] | rp[i+1] | rp[i] | ++ * p0_high | p0_low | ++ * p1_high | p1_low | carry-limb in ++ * p2_high | p2_low | ++ * c-limb out <- p3_high | p3_low | ++ * | < 128-bit VR > < 128-bit VR > ++ * ++ * < rp1_addend > < rp0_addend > ++ * carry-chain 0 <- + <- + <- carry_vec0[127] ++ * < product1 > < product0 > ++ * carry-chain 1 <- + <- + <- carry_vec1[127] ++ * < overlap_addend1 > < overlap_addend0 > ++ * ++ * note that a 128-bit add with carry in + out is built from two insns ++ * - vec_adde_u128 (vacq) provides sum ++ * - vec_addec_u128 (vacccq) provides the new carry bit ++ */ ++ ++ s390_double_umul_ppmm (p0_high, p0_low, p1_high, p1_low, s2limb); ++ ++ /* ++ * "barrier" to enforce scheduling loads for all limbs and first round ++ * of MLGR before anything else. ++ */ ++ asm volatile(""); ++ ++ product0.dw = vec_load_2di_as_pair (p0_high, p0_low); ++ ++#ifdef ADD ++ rp0_addend = vec_load_elements_reversed_idx (rp, idx, 0 + IDX_OFFSET); ++ rp1_addend = vec_load_elements_reversed_idx (rp, idx, 16 + IDX_OFFSET); ++#endif ++ /* increment loop index to unblock dependant loads of limbs for the next ++ * iteration (see above at #define LOOP_ADVANCE) */ ++ idx += LOOP_ADVANCE; ++ ++ s390_double_umul_ppmm (p2_high, p2_low, p3_high, p3_low, s2limb); ++ ++ overlap_addend0.dw = vec_load_2di_as_pair (p1_low, carry_limb); ++ asm volatile(""); ++ ++#ifdef ADD ++ sum0.sw = vec_adde_u128 (product0.sw, rp0_addend.sw, carry_vec0.sw); ++ sum1.sw = vec_adde_u128 (sum0.sw, overlap_addend0.sw, carry_vec1.sw); ++ ++ carry_vec0.sw ++ = vec_addec_u128 (product0.sw, rp0_addend.sw, carry_vec0.sw); ++ carry_vec1.sw ++ = vec_addec_u128 (sum0.sw, overlap_addend0.sw, carry_vec1.sw); ++#else ++ sum1.sw = vec_adde_u128 (product0.sw, overlap_addend0.sw, carry_vec0.sw); ++ carry_vec0.sw ++ = vec_addec_u128 (product0.sw, overlap_addend0.sw, carry_vec0.sw); ++#endif ++ ++ asm volatile(""); ++ product2.dw = vec_load_2di_as_pair (p2_high, p2_low); ++ overlap_addend1.dw = vec_load_2di_as_pair (p3_low, p1_high); ++ ++ vec_t sum4; ++ ++#ifdef ADD ++ vec_t sum3; ++ sum3.sw = vec_adde_u128 (product2.sw, rp1_addend.sw, carry_vec0.sw); ++ sum4.sw = vec_adde_u128 (sum3.sw, overlap_addend1.sw, carry_vec1.sw); ++ ++ carry_vec0.sw ++ = vec_addec_u128 (product2.sw, rp1_addend.sw, carry_vec0.sw); ++ carry_vec1.sw ++ = vec_addec_u128 (sum3.sw, overlap_addend1.sw, carry_vec1.sw); ++#else ++ sum4.sw = vec_adde_u128 (product2.sw, overlap_addend1.sw, carry_vec0.sw); ++ carry_vec0.sw ++ = vec_addec_u128 (product2.sw, overlap_addend1.sw, carry_vec0.sw); ++#endif ++ vec_store_elements_reversed_idx (rp, idx, IDX_OFFSET - LOOP_ADVANCE, ++ sum1); ++ vec_store_elements_reversed_idx (rp, idx, 16 + IDX_OFFSET - LOOP_ADVANCE, ++ sum4); ++ ++ carry_limb = p3_high; ++ } ++ ++#ifdef ADD ++ carry_vec0.dw += carry_vec1.dw; ++ carry_limb += carry_vec0.dw[1]; ++#else ++ carry_limb += carry_vec0.dw[1]; ++#endif ++ ++ return carry_limb; ++} ++ ++#undef OPERATION_addmul_1 ++#undef OPERATION_mul_1 ++#undef FUNCNAME ++#undef ADD +diff --git a/mpn/s390_64/z13/common-vec.h b/mpn/s390_64/z13/common-vec.h +new file mode 100644 +index 000000000..a59e6eefe +--- /dev/null ++++ b/mpn/s390_64/z13/common-vec.h +@@ -0,0 +1,175 @@ ++/* Common vector helpers and macros for IBM z13 and later ++ 
++Copyright 2021 Free Software Foundation, Inc. ++ ++This file is part of the GNU MP Library. ++ ++The GNU MP Library is free software; you can redistribute it and/or modify ++it under the terms of either: ++ ++ * the GNU Lesser General Public License as published by the Free ++ Software Foundation; either version 3 of the License, or (at your ++ option) any later version. ++ ++or ++ ++ * the GNU General Public License as published by the Free Software ++ Foundation; either version 2 of the License, or (at your option) any ++ later version. ++ ++or both in parallel, as here. ++ ++The GNU MP Library is distributed in the hope that it will be useful, but ++WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY ++or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License ++for more details. ++ ++You should have received copies of the GNU General Public License and the ++GNU Lesser General Public License along with the GNU MP Library. If not, ++see https://www.gnu.org/licenses/. */ ++ ++#ifndef __S390_64_Z13_COMMON_VEC_H ++#define __S390_64_Z13_COMMON_VEC_H ++ ++#include ++#include ++ ++/* ++ * Vector intrinsics use vector element types that kind-of make sense for the ++ * specific operation (e.g., vec_permi permutes doublewords). To use VRs ++ * interchangeably with different intrinsics, typedef the two variants and wrap ++ * them in a union. ++ */ ++#define VLEN_BYTES 16 ++typedef unsigned long long v2di __attribute__ ((vector_size (VLEN_BYTES))); ++typedef unsigned char v16qi __attribute__ ((vector_size (VLEN_BYTES))); ++ ++/* ++ * The Z vector intrinsics use vectors with different element types (e.g., ++ * v16qi for the 128-bit adds and v2di for vec_permi). ++ */ ++union vec ++{ ++ v2di dw; ++ v16qi sw; ++}; ++ ++typedef union vec vec_t; ++ ++/* ++ * single-instruction combine of two GPRs into a VR ++ */ ++static inline v2di ++vec_load_2di_as_pair (unsigned long a, unsigned long b) ++{ ++ v2di res; ++ __asm__("vlvgp\t%0,%1,%2" : "=v"(res) : "r"(a), "r"(b)); ++ return res; ++} ++ ++/* ++ * 64x64 mult where caller needs to care about proper register allocation: ++ * multiply xl with m1, treating both as unsigned, and place the result in ++ * xh:xl. ++ * mlgr operates on register pairs, so xh must be an even gpr followed by xl ++ */ ++#define s390_umul_ppmm(xh, xl, m1) \ ++ do \ ++ { \ ++ asm("mlgr\t%0,%3" : "=r"(xh), "=r"(xl) : "%1"(xl), "r"(m1)); \ ++ } \ ++ while (0); ++ ++/* ++ * two 64x64 multiplications, scheduled so that they will dispatch and issue to ++ * different sides: each mlgr is dispatched alone in an instruction group and ++ * subsequent groups will issue on different execution sides. ++ * there is a variant where both products use the same multiplicand and one ++ * that uses two different multiplicands. constraints from s390_umul_ppmm apply ++ * here. 
++ */ ++#define s390_double_umul_ppmm(X0H, X0L, X1H, X1L, MX) \ ++ do \ ++ { \ ++ asm("mlgr\t%[x0h],%[mx]\n\t" \ ++ "mlgr\t%[x1h],%[mx]" \ ++ : [x0h] "=&r"(X0H), [x0l] "=&r"(X0L), [x1h] "=r"(X1H), \ ++ [x1l] "=r"(X1L) \ ++ : "[x0l]"(X0L), "[x1l]"(X1L), [mx] "r"(MX)); \ ++ } \ ++ while (0); ++ ++#define s390_double_umul_ppmm_distinct(X0H, X0L, X1H, X1L, MX0, MX1) \ ++ do \ ++ { \ ++ asm("mlgr\t%[x0h],%[mx0]\n\t" \ ++ "mlgr\t%[x1h],%[mx1]" \ ++ : [x0h] "=&r"(X0H), [x0l] "=&r"(X0L), [x1h] "=r"(X1H), \ ++ [x1l] "=r"(X1L) \ ++ : "[x0l]"(X0L), "[x1l]"(X1L), [mx0] "r"(MX0), [mx1] "r"(MX1)); \ ++ } \ ++ while (0); ++ ++#define ASM_LOADGPR_BASE(DST, BASE, OFFSET) \ ++ asm volatile("lg\t%[r],%[off](%[b])" \ ++ : [r] "=r"(DST) \ ++ : [b] "a"(BASE), [off] "L"(OFFSET) \ ++ : "memory"); ++ ++#define ASM_LOADGPR(DST, BASE, INDEX, OFFSET) \ ++ asm volatile("lg\t%[r],%[off](%[b],%[x])" \ ++ : [r] "=r"(DST) \ ++ : [b] "a"(BASE), [x] "a"(INDEX), [off] "L"(OFFSET) \ ++ : "memory"); ++ ++/* ++ * Load a vector register from memory and swap the two 64-bit doubleword ++ * elements. ++ */ ++static inline vec_t ++vec_load_elements_reversed_idx (mp_limb_t const *base, ssize_t const index, ++ ssize_t const offset) ++{ ++ vec_t res; ++ char *ptr = (char *)base; ++ ++ res.sw = *(v16qi *)(ptr + index + offset); ++ res.dw = vec_permi (res.dw, res.dw, 2); ++ ++ return res; ++} ++ ++static inline vec_t ++vec_load_elements_reversed (mp_limb_t const *base, ssize_t const offset) ++{ ++ return vec_load_elements_reversed_idx (base, 0, offset); ++} ++ ++/* ++ * Store a vector register to memory and swap the two 64-bit doubleword ++ * elements. ++ */ ++static inline void ++vec_store_elements_reversed_idx (mp_limb_t *base, ssize_t const index, ++ ssize_t const offset, vec_t vec) ++{ ++ char *ptr = (char *)base; ++ ++ vec.dw = vec_permi (vec.dw, vec.dw, 2); ++ *(v16qi *)(ptr + index + offset) = vec.sw; ++} ++ ++static inline void ++vec_store_elements_reversed (mp_limb_t *base, ssize_t const offset, vec_t vec) ++{ ++ vec_store_elements_reversed_idx (base, 0, offset, vec); ++} ++ ++#define ASM_VZERO(VEC) \ ++ do \ ++ { \ ++ asm("vzero\t%[vec]" : [vec] "=v"(VEC)); \ ++ } \ ++ while (0) ++ ++#endif +diff --git a/mpn/s390_64/z13/mul_1.c b/mpn/s390_64/z13/mul_1.c +new file mode 100644 +index 000000000..7584dc8c7 +--- /dev/null ++++ b/mpn/s390_64/z13/mul_1.c +@@ -0,0 +1,31 @@ ++/* mul_1 for IBM z13 or later ++ ++Copyright 2021 Free Software Foundation, Inc. ++ ++This file is part of the GNU MP Library. ++ ++The GNU MP Library is free software; you can redistribute it and/or modify ++it under the terms of either: ++ ++ * the GNU Lesser General Public License as published by the Free ++ Software Foundation; either version 3 of the License, or (at your ++ option) any later version. ++ ++or ++ ++ * the GNU General Public License as published by the Free Software ++ Foundation; either version 2 of the License, or (at your option) any ++ later version. ++ ++or both in parallel, as here. ++ ++The GNU MP Library is distributed in the hope that it will be useful, but ++WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY ++or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License ++for more details. ++ ++You should have received copies of the GNU General Public License and the ++GNU Lesser General Public License along with the GNU MP Library. If not, ++see https://www.gnu.org/licenses/. 
*/ ++ ++#include "s390_64/z13/addmul_1.c" +-- +2.40.1 + diff --git a/SOURCES/ibm_z13_simd_part2.patch b/SOURCES/ibm_z13_simd_part2.patch new file mode 100644 index 0000000..3d216d9 --- /dev/null +++ b/SOURCES/ibm_z13_simd_part2.patch @@ -0,0 +1,535 @@ +Co-authored-by: Stefan Liebler +--- + mpn/s390_64/z13/aormul_2.c | 476 +++++++++++++++++++++++++++++++++++ + mpn/s390_64/z13/gmp-mparam.h | 37 +++ + 2 files changed, 513 insertions(+) + create mode 100644 mpn/s390_64/z13/aormul_2.c + create mode 100644 mpn/s390_64/z13/gmp-mparam.h + +diff --git a/mpn/s390_64/z13/aormul_2.c b/mpn/s390_64/z13/aormul_2.c +new file mode 100644 +index 000000000..9a69fc38e +--- /dev/null ++++ b/mpn/s390_64/z13/aormul_2.c +@@ -0,0 +1,476 @@ ++/* Addmul_2 / mul_2 for IBM z13 or later ++ ++Copyright 2021 Free Software Foundation, Inc. ++ ++This file is part of the GNU MP Library. ++ ++The GNU MP Library is free software; you can redistribute it and/or modify ++it under the terms of either: ++ ++ * the GNU Lesser General Public License as published by the Free ++ Software Foundation; either version 3 of the License, or (at your ++ option) any later version. ++ ++or ++ ++ * the GNU General Public License as published by the Free Software ++ Foundation; either version 2 of the License, or (at your option) any ++ later version. ++ ++or both in parallel, as here. ++ ++The GNU MP Library is distributed in the hope that it will be useful, but ++WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY ++or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License ++for more details. ++ ++You should have received copies of the GNU General Public License and the ++GNU Lesser General Public License along with the GNU MP Library. If not, ++see https://www.gnu.org/licenses/. */ ++ ++#include "gmp-impl.h" ++ ++#include "s390_64/z13/common-vec.h" ++ ++#undef FUNCNAME ++ ++#ifdef DO_INLINE ++# ifdef OPERATION_addmul_2 ++# define ADD ++# define FUNCNAME inline_addmul_2 ++# elif defined(OPERATION_mul_2) ++# define FUNCNAME inline_mul_2 ++# else ++# error Missing define for operation to perform ++# endif ++#else ++# ifdef OPERATION_addmul_2 ++# define ADD ++# define FUNCNAME mpn_addmul_2 ++# elif defined(OPERATION_mul_2) ++# define FUNCNAME mpn_mul_2 ++# else ++# error Missing define for operation to perform ++# endif ++#endif ++ ++#ifdef DO_INLINE ++static inline mp_limb_t ++FUNCNAME (mp_limb_t *rp, const mp_limb_t *up, mp_size_t n, const mp_limb_t *vp) ++ __attribute__ ((always_inline)); ++ ++static inline ++#endif ++mp_limb_t ++FUNCNAME (mp_limb_t *rp, const mp_limb_t *up, mp_size_t n, ++ const mp_limb_t *vp) ++{ ++ ++ /* Combine 64x64 multiplication into GPR pairs (MLGR) with 128-bit adds in ++ VRs (using each VR as a single 128-bit accumulator). ++ The inner loop is unrolled to four limbs, with two blocks of four ++ multiplications each. Since the MLGR operation operates on even/odd GPR ++ pairs, pin the products appropriately. 
*/ ++ ++ register mp_limb_t p0_high asm("r0"); ++ register mp_limb_t p0_low asm("r1"); ++ ++ register mp_limb_t p1_high asm("r8"); ++ register mp_limb_t p1_low asm("r9"); ++ ++ register mp_limb_t p2_high asm("r6"); ++ register mp_limb_t p2_low asm("r7"); ++ ++ register mp_limb_t p3_high asm("r10"); ++ register mp_limb_t p3_low asm("r11"); ++ ++ vec_t carry_prod = { .dw = vec_splat_u64 (0) }; ++ vec_t zero = { .dw = vec_splat_u64 (0) }; ++ ++ /* two carry-bits for the 128-bit VR adds - stored in VRs */ ++#ifdef ADD ++ vec_t carry_vec0 = { .dw = vec_splat_u64 (0) }; ++#endif ++ vec_t carry_vec1 = { .dw = vec_splat_u64 (0) }; ++ ++ vec_t tmp; ++ ++ vec_t sum0, sum1; ++ ++ /* products transferred into VRs for accumulating there */ ++ vec_t pv0, pv3; ++ vec_t pv1_low, pv1_high, pv2_low, pv2_high; ++ vec_t low, middle, high; ++#ifdef ADD ++ vec_t rp0, rp1; ++#endif ++ ++ register mp_limb_t v0 asm("r12"); ++ register mp_limb_t v1 asm("r5"); ++ v0 = vp[0]; ++ v1 = vp[1]; ++ ++ /* The scalar multiplications compete with pointer and index increments for ++ * issue ports. Thus, increment the loop index in the middle of the loop so ++ * that the operations for the next iteration's multiplications can be ++ * loaded in time (looks horrible, yet helps performance) and make sure we ++ * use addressing with base reg + index reg + immediate displacement ++ * (so that only the single index needs incrementing, instead of multiple ++ * pointers). */ ++#undef LOOP_ADVANCE ++#define LOOP_ADVANCE (4 * sizeof (mp_limb_t)) ++#define IDX_OFFSET (LOOP_ADVANCE) ++ ++ register ssize_t idx = 0 - IDX_OFFSET; ++#ifdef BRCTG ++ ssize_t iterations = (size_t)n / 4; ++#else ++ ssize_t const idx_bound = n * sizeof (mp_limb_t) - IDX_OFFSET; ++#endif ++ ++ /* ++ * To minimize latency in the carry chain, accumulate in VRs with 128-bit ++ * adds with carry in and out. As a downside, these require two insns for ++ * each add - one to calculate the sum, one to deliver the carry out. ++ * To reduce the overall number of insns to execute, combine adding up ++ * product limbs such that there cannot be a carry out and one (for mul) or ++ * two (for addmul) adds with carry chains. ++ * ++ * Since (2^64-1) * (2^64-1) = (2^128-1) - 2 * (2^64-1), we can add two ++ * limbs into each 128-bit product without causing carry out. ++ * ++ * For each block of 2 limbs * 2 limbs ++ * ++ * | u[i] * v[0] (p2) | ++ * | u[i] * v[1] (p0) | ++ * | u[i+1] * v[0](p1) | ++ * | u[i+1] * v[1](p3) | ++ * < 128 bits > < 128 bits > ++ * ++ * we can begin accumulating with "simple" carry-oblivious 128-bit adds: ++ * - p0 + low limb of p1 ++ * + high limb of p2 ++ * and combine resulting low limb with p2's low limb ++ * - p3 + high limb of p1 ++ * + high limb of sum above ++ * ... which will will result in two 128-bit limbs to be fed into the carry ++ * chain(s). ++ * Overall, that scheme saves instructions and improves performance, despite ++ * slightly increasing latency between multiplications and carry chain (yet ++ * not in the carry chain). 
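++   *
++   * [annotation, not part of the upstream patch: the claim that two extra
++   *  limbs cannot cause a carry out can be checked directly, since
++   *  (2^64-1) * (2^64-1) = 2^128 - 2*2^64 + 1 = (2^128-1) - 2*(2^64-1),
++   *  so even the largest possible product plus two limbs of at most 2^64-1
++   *  each sums to at most 2^128-1 and still fits the 128-bit accumulator.]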
++ */ ++ ++#define LOAD_LOW_LIMB(VEC, LIMB) \ ++ do \ ++ { \ ++ asm("vzero\t%[vec]\n\t" \ ++ "vlvgg\t%[vec],%[limb],1" \ ++ : [vec] "=v"(VEC) \ ++ : [limb] "r"(LIMB)); \ ++ } \ ++ while (0) ++ ++ /* for the 128-bit adds in the carry chain, to calculate a + b + carry-in we ++ * need paired vec_adde_u128 (delivers sum) and vec_addec_u128 (delivers new ++ * carry) */ ++#define ADD_UP2_CARRY_INOUT(SUMIDX, CARRYIDX, ADDEND1, ADDEND2) \ ++ do \ ++ { \ ++ sum##SUMIDX.sw \ ++ = vec_adde_u128 (ADDEND1.sw, ADDEND2.sw, carry_vec##CARRYIDX.sw); \ ++ carry_vec##CARRYIDX.sw \ ++ = vec_addec_u128 (ADDEND1.sw, ADDEND2.sw, carry_vec##CARRYIDX.sw); \ ++ } \ ++ while (0) ++ ++#define ADD_UP_CARRY_INOUT(SUMIDX, ADDEND1, ADDEND2) \ ++ ADD_UP2_CARRY_INOUT (SUMIDX, SUMIDX, ADDEND1, ADDEND2) ++ ++ /* variant without carry-in for prologue */ ++#define ADD_UP2_CARRY_OUT(SUMIDX, CARRYIDX, ADDEND1, ADDEND2) \ ++ do \ ++ { \ ++ sum##SUMIDX.sw = vec_add_u128 (ADDEND1.sw, ADDEND2.sw); \ ++ carry_vec##CARRYIDX.sw = vec_addc_u128 (ADDEND1.sw, ADDEND2.sw); \ ++ } \ ++ while (0) ++ ++#define ADD_UP_CARRY_OUT(SUMIDX, ADDEND1, ADDEND2) \ ++ ADD_UP2_CARRY_OUT (SUMIDX, SUMIDX, ADDEND1, ADDEND2) ++ ++ /* prologue for 4x-unrolled main loop */ ++ switch ((size_t)n % 4) ++ { ++ case 1: ++ ASM_LOADGPR_BASE (p0_low, up, 0); ++ ASM_LOADGPR_BASE (p1_low, up, 0); ++ s390_double_umul_ppmm_distinct (p0_high, p0_low, p1_high, p1_low, v0, v1); ++ carry_prod.dw = vec_load_2di_as_pair (p1_high, p1_low); ++ ++/* gcc tries to be too clever and vlr from a reg that is already zero. vzero is ++ * cheaper. */ ++# define NEW_CARRY(VEC, LIMB) \ ++ do \ ++ { \ ++ asm("vzero\t%[vec]\n\t" \ ++ "vlvgg\t%[vec],%[limb],1" \ ++ : [vec] "=v"(VEC) \ ++ : [limb] "r"(LIMB)); \ ++ } \ ++ while (0) ++ ++ NEW_CARRY (tmp, p0_high); ++ ++ carry_prod.sw = vec_add_u128 (carry_prod.sw, tmp.sw); ++#ifdef ADD ++ carry_vec1.dw[1] = __builtin_add_overflow (rp[0], p0_low, rp); ++#else ++ rp[0] = p0_low; ++#endif ++ idx += sizeof (mp_limb_t); ++ break; ++ ++ case 2: ++ ASM_LOADGPR_BASE (p0_low, up, 0); ++ ASM_LOADGPR_BASE (p1_low, up, 8); ++ ASM_LOADGPR_BASE (p2_low, up, 0); ++ ASM_LOADGPR_BASE (p3_low, up, 8); ++ ++ asm("" ++ : "=r"(p0_low), "=r"(p2_low) ++ : "r"(p3_low), "0"(p0_low), "r"(p1_low), "1"(p2_low)); ++ s390_double_umul_ppmm_distinct (p0_high, p0_low, p1_high, p1_low, v1, v0); ++ s390_double_umul_ppmm_distinct (p2_high, p2_low, p3_high, p3_low, v0, v1); ++ ++ pv0.dw = vec_load_2di_as_pair (p0_high, p0_low); ++ LOAD_LOW_LIMB (pv1_low, p1_low); ++ LOAD_LOW_LIMB (pv1_high, p1_high); ++ pv0.sw = vec_add_u128 (pv0.sw, pv1_low.sw); ++ LOAD_LOW_LIMB (pv2_high, p2_high); ++ pv3.dw = vec_load_2di_as_pair (p3_high, p3_low); ++ LOAD_LOW_LIMB (pv2_low, p2_low); ++ pv3.sw = vec_add_u128 (pv3.sw, pv1_high.sw); ++ middle.sw = vec_add_u128 (pv0.sw, pv2_high.sw); ++ low.dw = vec_permi (middle.dw, pv2_low.dw, 3); ++ middle.dw = vec_permi (zero.dw, middle.dw, 0); ++ high.sw = vec_add_u128 (middle.sw, pv3.sw); ++#ifdef ADD ++ rp0 = vec_load_elements_reversed (rp, 0); ++ ADD_UP_CARRY_OUT (0, rp0, carry_prod); ++#else ++ sum0 = carry_prod; ++#endif ++ ADD_UP_CARRY_OUT (1, sum0, low); ++ vec_store_elements_reversed (rp, 0, sum1); ++ carry_prod = high; ++ ++ idx += 2 * sizeof (mp_limb_t); ++ break; ++ ++ case 3: ++ ASM_LOADGPR_BASE (p0_low, up, 0); ++ ASM_LOADGPR_BASE (p1_low, up, 0); ++ s390_double_umul_ppmm_distinct (p0_high, p0_low, p1_high, p1_low, v0, v1); ++ carry_prod.dw = vec_load_2di_as_pair (p1_high, p1_low); ++ NEW_CARRY (tmp, p0_high); ++ carry_prod.sw = vec_add_u128 
(carry_prod.sw, tmp.sw); ++ ++#ifdef ADD ++ carry_vec1.dw[1] = __builtin_add_overflow (rp[0], p0_low, rp); ++#else ++ rp[0] = p0_low; ++#endif ++ ++ ASM_LOADGPR_BASE (p0_low, up, 8); ++ ASM_LOADGPR_BASE (p1_low, up, 16); ++ ASM_LOADGPR_BASE (p2_low, up, 8); ++ ASM_LOADGPR_BASE (p3_low, up, 16); ++ ++ asm("" ++ : "=r"(p0_low), "=r"(p2_low) ++ : "r"(p3_low), "0"(p0_low), "r"(p1_low), "1"(p2_low)); ++ s390_double_umul_ppmm_distinct (p0_high, p0_low, p1_high, p1_low, v1, v0); ++ s390_double_umul_ppmm_distinct (p2_high, p2_low, p3_high, p3_low, v0, v1); ++ ++ pv0.dw = vec_load_2di_as_pair (p0_high, p0_low); ++ ++ LOAD_LOW_LIMB (pv1_low, p1_low); ++ LOAD_LOW_LIMB (pv1_high, p1_high); ++ ++ pv0.sw = vec_add_u128 (pv0.sw, pv1_low.sw); ++ LOAD_LOW_LIMB (pv2_high, p2_high); ++ pv3.dw = vec_load_2di_as_pair (p3_high, p3_low); ++ ++ LOAD_LOW_LIMB (pv2_low, p2_low); ++ ++ pv3.sw = vec_add_u128 (pv3.sw, pv1_high.sw); ++ middle.sw = vec_add_u128 (pv0.sw, pv2_high.sw); ++ ++ low.dw = vec_permi (middle.dw, pv2_low.dw, 3); ++ middle.dw = vec_permi (zero.dw, middle.dw, 0); ++ high.sw = vec_add_u128 (middle.sw, pv3.sw); ++ ++#ifdef ADD ++ vec_t rp0 = vec_load_elements_reversed (rp, 8); ++ ADD_UP_CARRY_OUT (0, rp0, carry_prod); ++#else ++ sum0 = carry_prod; ++#endif ++ ADD_UP_CARRY_INOUT (1, sum0, low); ++ ++ vec_store_elements_reversed (rp, 8, sum1); ++ ++ carry_prod = high; ++ ++ idx += 3 * sizeof (mp_limb_t); ++ break; ++ } ++ ++ /* ++ * branch-on-count implicitly hint to the branch prediction as taken, while ++ * compare-and-branch hints as not taken. currently, using branch-on-count ++ * has a performance advantage, but it is not clear that it is generally ++ * the better choice (e.g., branch-on-count requires decrementing the ++ * separate counter). so, allow switching the loop condition to enable ++ * either category of branch instructions: ++ * - idx is less than an upper bound, for compare-and-branch ++ * - iteration counter greater than zero, for branch-on-count ++ */ ++#ifdef BRCTG ++ for (; iterations > 0; iterations--) ++ { ++#else ++ while (idx < idx_bound) ++ { ++#endif ++ /* The 64x64->128 MLGR multiplies two factors in GPRs and stores the ++ * result in a GPR pair. One of the factors is taken from the GPR pair ++ * and overwritten. ++ * To reuse factors, it turned out cheaper to load limbs multiple times ++ * than copying GPR contents. Enforce that and the use of addressing by ++ * base + index gpr + immediate displacement via inline asm. ++ */ ++ ASM_LOADGPR (p0_low, up, idx, 0 + IDX_OFFSET); ++ ASM_LOADGPR (p1_low, up, idx, 8 + IDX_OFFSET); ++ ASM_LOADGPR (p2_low, up, idx, 0 + IDX_OFFSET); ++ ASM_LOADGPR (p3_low, up, idx, 8 + IDX_OFFSET); ++ ++ s390_double_umul_ppmm_distinct (p0_high, p0_low, p1_high, p1_low, v1, v0); ++ ++ pv0.dw = vec_load_2di_as_pair (p0_high, p0_low); ++ ++ LOAD_LOW_LIMB (pv1_low, p1_low); ++ LOAD_LOW_LIMB (pv1_high, p1_high); ++ ++ s390_double_umul_ppmm_distinct (p2_high, p2_low, p3_high, p3_low, v0, v1); ++ ++ pv0.sw = vec_add_u128 (pv0.sw, pv1_low.sw); ++ LOAD_LOW_LIMB (pv2_high, p2_high); ++ pv3.dw = vec_load_2di_as_pair (p3_high, p3_low); ++ ++ LOAD_LOW_LIMB (pv2_low, p2_low); ++ ++ ASM_LOADGPR (p0_low, up, idx, 16 + IDX_OFFSET); ++ ASM_LOADGPR (p1_low, up, idx, 24 + IDX_OFFSET); ++ ASM_LOADGPR (p2_low, up, idx, 16 + IDX_OFFSET); ++ ASM_LOADGPR (p3_low, up, idx, 24 + IDX_OFFSET); ++ ++ idx += LOOP_ADVANCE; ++ ++ /* ++ * "barrier" to enforce scheduling the index increment before the second ++ * block of multiplications. not required for clang. 
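++       * [annotation, not part of the upstream patch: this "barrier" is an
++       *  empty asm statement whose outputs are tied to its inputs; it emits
++       *  no machine code, but makes the compiler treat the listed values as
++       *  produced at this point, which constrains instruction scheduling.]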
++ */ ++#ifndef __clang__ ++ asm("" ++ : "=r"(idx), "=r"(p0_high), "=r"(p2_high) ++ : "0"(idx), "1"(p0_high), "2"(p2_high)); ++#endif ++ ++ s390_double_umul_ppmm_distinct (p0_high, p0_low, p1_high, p1_low, v1, v0); ++ s390_double_umul_ppmm_distinct (p2_high, p2_low, p3_high, p3_low, v0, v1); ++ ++ /* ++ * "barrier" to enforce scheduling all MLGRs first, before any adding ++ * up. note that clang produces better code without. ++ */ ++#ifndef __clang__ ++ asm("" ++ : "=v"(pv0.sw), "=v"(pv3.sw) ++ : "1"(pv3.sw), "0"(pv0.sw), "r"(p0_high), "r"(p2_high)); ++#endif ++ ++ pv3.sw = vec_add_u128 (pv3.sw, pv1_high.sw); ++ middle.sw = vec_add_u128 (pv0.sw, pv2_high.sw); ++ ++ low.dw = vec_permi (middle.dw, pv2_low.dw, ++ 3); /* least-significant doubleword from both vectors */ ++ middle.dw = vec_permi (zero.dw, middle.dw, 0); ++ high.sw = vec_add_u128 (middle.sw, pv3.sw); ++ ++#ifdef ADD ++ rp0 = vec_load_elements_reversed_idx (rp, idx, ++ 0 + IDX_OFFSET - LOOP_ADVANCE); ++ ADD_UP_CARRY_INOUT (0, rp0, carry_prod); ++#else ++ sum0 = carry_prod; ++#endif ++ ADD_UP_CARRY_INOUT (1, sum0, low); ++ ++ vec_store_elements_reversed_idx (rp, idx, 0 + IDX_OFFSET - LOOP_ADVANCE, ++ sum1); ++ ++ carry_prod = high; ++ ++ vec_t pv0_2, pv3_2; ++ vec_t pv1_low_2, pv1_high_2, pv2_low_2, pv2_high_2; ++ vec_t low_2, middle_2, high_2; ++ vec_t sum2, sum3; ++ ++ pv0_2.dw = vec_load_2di_as_pair (p0_high, p0_low); ++ LOAD_LOW_LIMB (pv1_low_2, p1_low); ++ LOAD_LOW_LIMB (pv1_high_2, p1_high); ++ ++ pv0_2.sw = vec_add_u128 (pv0_2.sw, pv1_low_2.sw); ++ LOAD_LOW_LIMB (pv2_high_2, p2_high); ++ pv3_2.dw = vec_load_2di_as_pair (p3_high, p3_low); ++ pv3_2.sw = vec_add_u128 (pv3_2.sw, pv1_high_2.sw); ++ middle_2.sw = vec_add_u128 (pv0_2.sw, pv2_high_2.sw); ++ ++ LOAD_LOW_LIMB (pv2_low_2, p2_low); ++ low_2.dw ++ = vec_permi (middle_2.dw, pv2_low_2.dw, ++ 3); /* least-significant doubleword from both vectors */ ++ middle_2.dw = vec_permi (zero.dw, middle_2.dw, 0); ++ high_2.sw = vec_add_u128 (middle_2.sw, pv3_2.sw); ++ ++ /* ++ * another "barrier" to influence scheduling. (also helps in clang) ++ */ ++ asm("" : : "v"(pv0_2.sw), "r"(p2_high), "r"(p3_high), "v"(pv3_2.sw)); ++ ++#ifdef ADD ++ rp1 = vec_load_elements_reversed_idx (rp, idx, ++ 16 + IDX_OFFSET - LOOP_ADVANCE); ++ ADD_UP2_CARRY_INOUT (2, 0, rp1, carry_prod); ++#else ++ sum2 = carry_prod; ++#endif ++ ADD_UP2_CARRY_INOUT (3, 1, sum2, low_2); ++ ++ vec_store_elements_reversed_idx (rp, idx, 16 + IDX_OFFSET - LOOP_ADVANCE, ++ sum3); ++ ++ carry_prod = high_2; ++ } ++ ++#ifdef ADD ++ sum0.sw = vec_adde_u128 (carry_prod.sw, carry_vec0.sw, carry_vec1.sw); ++#else ++ sum0.sw = vec_add_u128 (carry_prod.sw, carry_vec1.sw); ++#endif ++ ++ *(mp_ptr) (((char *)rp) + idx + 0 + IDX_OFFSET) = (mp_limb_t)sum0.dw[1]; ++ ++ return (mp_limb_t)sum0.dw[0]; ++} +diff --git a/mpn/s390_64/z13/gmp-mparam.h b/mpn/s390_64/z13/gmp-mparam.h +new file mode 100644 +index 000000000..a17503fd0 +--- /dev/null ++++ b/mpn/s390_64/z13/gmp-mparam.h +@@ -0,0 +1,37 @@ ++/* S/390-64 for IBM z13 gmp-mparam.h -- Compiler/machine parameter header file. ++ ++Copyright 1991, 1993, 1994, 2000-2011 Free Software Foundation, Inc. ++ ++This file is part of the GNU MP Library. ++ ++The GNU MP Library is free software; you can redistribute it and/or modify ++it under the terms of either: ++ ++ * the GNU Lesser General Public License as published by the Free ++ Software Foundation; either version 3 of the License, or (at your ++ option) any later version. 
++ ++or ++ ++ * the GNU General Public License as published by the Free Software ++ Foundation; either version 2 of the License, or (at your option) any ++ later version. ++ ++or both in parallel, as here. ++ ++The GNU MP Library is distributed in the hope that it will be useful, but ++WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY ++or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License ++for more details. ++ ++You should have received copies of the GNU General Public License and the ++GNU Lesser General Public License along with the GNU MP Library. If not, ++see https://www.gnu.org/licenses/. */ ++ ++#define GMP_LIMB_BITS 64 ++#define GMP_LIMB_BYTES 8 ++ ++#define HAVE_NATIVE_mpn_addmul_2 1 ++#define HAVE_NATIVE_mpn_mul_2 1 ++ ++#include "mpn/s390_64/gmp-mparam.h" +-- +2.40.1 diff --git a/SOURCES/ibm_z13_simd_part3.patch b/SOURCES/ibm_z13_simd_part3.patch new file mode 100644 index 0000000..8301e57 --- /dev/null +++ b/SOURCES/ibm_z13_simd_part3.patch @@ -0,0 +1,138 @@ +Co-authored-by: Stefan Liebler +--- + mpn/s390_64/z13/mul_basecase.c | 124 +++++++++++++++++++++++++++++++++ + 1 file changed, 124 insertions(+) + create mode 100644 mpn/s390_64/z13/mul_basecase.c + +diff --git a/mpn/s390_64/z13/mul_basecase.c b/mpn/s390_64/z13/mul_basecase.c +new file mode 100644 +index 000000000..f1b7160b3 +--- /dev/null ++++ b/mpn/s390_64/z13/mul_basecase.c +@@ -0,0 +1,124 @@ ++/* mpn_mul_basecase for IBM z13 and later -- Internal routine to multiply two ++ natural numbers of length m and n. ++ ++ THIS IS AN INTERNAL FUNCTION WITH A MUTABLE INTERFACE. IT IS ONLY ++ SAFE TO REACH THIS FUNCTION THROUGH DOCUMENTED INTERFACES. ++ ++Copyright 2021 Free Software Foundation, Inc. ++ ++This file is part of the GNU MP Library. ++ ++The GNU MP Library is free software; you can redistribute it and/or modify ++it under the terms of either: ++ ++ * the GNU Lesser General Public License as published by the Free ++ Software Foundation; either version 3 of the License, or (at your ++ option) any later version. ++ ++or ++ ++ * the GNU General Public License as published by the Free Software ++ Foundation; either version 2 of the License, or (at your option) any ++ later version. ++ ++or both in parallel, as here. ++ ++The GNU MP Library is distributed in the hope that it will be useful, but ++WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY ++or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License ++for more details. ++ ++You should have received copies of the GNU General Public License and the ++GNU Lesser General Public License along with the GNU MP Library. If not, ++see https://www.gnu.org/licenses/. */ ++ ++#include ++ ++#include "gmp-impl.h" ++ ++/* Note: we explicitly inline all mul and addmul routines here to reduce the ++ * number of branches in prologues of unrolled functions. That comes at the ++ cost of duplicating common loop bodies in object code. */ ++#define DO_INLINE ++ ++/* ++ * tweak loop conditions in addmul subroutines to enable use of ++ * branch-relative-on-count (BRCTG) instructions, which currently results in ++ * better performance. 
++ */ ++#define BRCTG ++ ++#include "s390_64/z13/common-vec.h" ++ ++#define OPERATION_mul_1 ++#include "s390_64/z13/addmul_1.c" ++#undef OPERATION_mul_1 ++ ++#define OPERATION_addmul_1 ++#include "s390_64/z13/addmul_1.c" ++#undef OPERATION_addmul_1 ++ ++#define OPERATION_mul_2 ++#include "s390_64/z13/aormul_2.c" ++#undef OPERATION_mul_2 ++ ++#define OPERATION_addmul_2 ++#include "s390_64/z13/aormul_2.c" ++#undef OPERATION_addmul_2 ++ ++void ++mpn_mul_basecase (mp_ptr rp, mp_srcptr up, mp_size_t un, mp_srcptr vp, ++ mp_size_t vn) ++{ ++ ASSERT (un >= vn); ++ ASSERT (vn >= 1); ++ ASSERT (!MPN_OVERLAP_P (rp, un + vn, up, un)); ++ ASSERT (!MPN_OVERLAP_P (rp, un + vn, vp, vn)); ++ ++ /* The implementations of (add)mul_1/2 are 4x-unrolled. Pull out the branch ++ * for un%4 and inline specific variants. */ ++ ++#define BRANCH_FOR_MOD(N) \ ++ do \ ++ { \ ++ if (vn >= 2) \ ++ { \ ++ rp[un + 1] = inline_mul_2 (rp, up, un, vp); \ ++ rp += 2, vp += 2, vn -= 2; \ ++ } \ ++ else \ ++ { \ ++ rp[un] = inline_mul_1 (rp, up, un, vp[0]); \ ++ return; \ ++ } \ ++ \ ++ while (vn >= 2) \ ++ { \ ++ rp[un + 2 - 1] = inline_addmul_2 (rp, up, un, vp); \ ++ rp += 2, vp += 2, vn -= 2; \ ++ } \ ++ \ ++ while (vn >= 1) \ ++ { \ ++ rp[un] = inline_addmul_1 (rp, up, un, vp[0]); \ ++ rp += 1, vp += 1, vn -= 1; \ ++ } \ ++ } \ ++ while (0); ++ ++ switch (((size_t)un) % 4) ++ { ++ case 0: ++ BRANCH_FOR_MOD (0); ++ break; ++ case 1: ++ BRANCH_FOR_MOD (1); ++ break; ++ case 2: ++ BRANCH_FOR_MOD (2); ++ break; ++ case 3: ++ BRANCH_FOR_MOD (3); ++ break; ++ } ++} +-- +2.40.1 diff --git a/SOURCES/ibm_z13_simd_part4.patch b/SOURCES/ibm_z13_simd_part4.patch new file mode 100644 index 0000000..c87c17c --- /dev/null +++ b/SOURCES/ibm_z13_simd_part4.patch @@ -0,0 +1,151 @@ +From: Marius Hillenbrand + +--- + mpn/s390_64/z13/gmp-mparam.h | 129 ++++++++++++++++++++++++++++++++++- + 1 file changed, 127 insertions(+), 2 deletions(-) + +diff --git a/mpn/s390_64/z13/gmp-mparam.h b/mpn/s390_64/z13/gmp-mparam.h +index a17503fd0..50e7f39d1 100644 +--- a/mpn/s390_64/z13/gmp-mparam.h ++++ b/mpn/s390_64/z13/gmp-mparam.h +@@ -1,6 +1,6 @@ + /* S/390-64 for IBM z13 gmp-mparam.h -- Compiler/machine parameter header file. + +-Copyright 1991, 1993, 1994, 2000-2011 Free Software Foundation, Inc. ++Copyright 2021 Free Software Foundation, Inc. + + This file is part of the GNU MP Library. + +@@ -34,4 +34,129 @@ see https://www.gnu.org/licenses/. 
*/ + #define HAVE_NATIVE_mpn_addmul_2 1 + #define HAVE_NATIVE_mpn_mul_2 1 + +-#include "mpn/s390_64/gmp-mparam.h" ++/* Generated by tuneup.c, 2021-07-30, gcc 10.2 */ ++ ++#define DIVREM_1_NORM_THRESHOLD MP_SIZE_T_MAX /* never */ ++#define DIVREM_1_UNNORM_THRESHOLD MP_SIZE_T_MAX /* never */ ++#define MOD_1_1P_METHOD 2 ++#define MOD_1_NORM_THRESHOLD MP_SIZE_T_MAX /* never */ ++#define MOD_1_UNNORM_THRESHOLD MP_SIZE_T_MAX /* never */ ++#define MOD_1N_TO_MOD_1_1_THRESHOLD 17 ++#define MOD_1U_TO_MOD_1_1_THRESHOLD 15 ++#define MOD_1_1_TO_MOD_1_2_THRESHOLD 0 /* never mpn_mod_1_1p */ ++#define MOD_1_2_TO_MOD_1_4_THRESHOLD 0 /* never mpn_mod_1s_2p */ ++#define PREINV_MOD_1_TO_MOD_1_THRESHOLD 5 ++#define USE_PREINV_DIVREM_1 1 ++#define DIV_QR_1N_PI1_METHOD 3 ++#define DIV_QR_1_NORM_THRESHOLD MP_SIZE_T_MAX /* never */ ++#define DIV_QR_1_UNNORM_THRESHOLD MP_SIZE_T_MAX /* never */ ++#define DIV_QR_2_PI2_THRESHOLD 996 ++#define DIVEXACT_1_THRESHOLD 4 ++#define BMOD_1_TO_MOD_1_THRESHOLD 0 /* always */ ++ ++#define DIV_1_VS_MUL_1_PERCENT 404 ++ ++#define MUL_TOOM22_THRESHOLD 23 ++#define MUL_TOOM33_THRESHOLD 94 ++#define MUL_TOOM44_THRESHOLD 166 ++#define MUL_TOOM6H_THRESHOLD 286 ++#define MUL_TOOM8H_THRESHOLD 626 ++ ++#define MUL_TOOM32_TO_TOOM43_THRESHOLD 113 ++#define MUL_TOOM32_TO_TOOM53_THRESHOLD 138 ++#define MUL_TOOM42_TO_TOOM53_THRESHOLD 143 ++#define MUL_TOOM42_TO_TOOM63_THRESHOLD 145 ++#define MUL_TOOM43_TO_TOOM54_THRESHOLD 130 ++ ++#define SQR_BASECASE_THRESHOLD 0 /* always (native) */ ++#define SQR_TOOM2_THRESHOLD 12 ++#define SQR_TOOM3_THRESHOLD 84 ++#define SQR_TOOM4_THRESHOLD 234 ++#define SQR_TOOM6_THRESHOLD 318 ++#define SQR_TOOM8_THRESHOLD 478 ++ ++#define MULMID_TOOM42_THRESHOLD 42 ++ ++#define MULMOD_BNM1_THRESHOLD 13 ++#define SQRMOD_BNM1_THRESHOLD 7 ++ ++#define MUL_FFT_MODF_THRESHOLD 332 /* k = 5 */ ++#define MUL_FFT_TABLE3 \ ++ { { 332, 5}, { 19, 6}, { 10, 5}, { 21, 6}, \ ++ { 21, 7}, { 21, 8}, { 11, 7}, { 24, 8}, \ ++ { 13, 7}, { 27, 8}, { 15, 7}, { 31, 8}, \ ++ { 17, 7}, { 35, 8}, { 19, 7}, { 39, 8}, \ ++ { 21, 9}, { 11, 8}, { 27, 9}, { 15, 8}, \ ++ { 35, 9}, { 19, 8}, { 41, 9}, { 23, 8}, \ ++ { 47, 9}, { 27,10}, { 15, 9}, { 39,10}, \ ++ { 23, 9}, { 47,11}, { 15,10}, { 31, 9}, \ ++ { 67,10}, { 47,11}, { 2048,12}, { 4096,13}, \ ++ { 8192,14}, { 16384,15}, { 32768,16}, { 65536,17}, \ ++ { 131072,18}, { 262144,19}, { 524288,20}, {1048576,21}, \ ++ {2097152,22}, {4194304,23}, {8388608,24} } ++#define MUL_FFT_TABLE3_SIZE 47 ++#define MUL_FFT_THRESHOLD 2752 ++ ++#define SQR_FFT_MODF_THRESHOLD 240 /* k = 5 */ ++#define SQR_FFT_TABLE3 \ ++ { { 240, 5}, { 8, 4}, { 17, 5}, { 13, 6}, \ ++ { 7, 5}, { 15, 6}, { 8, 5}, { 17, 6}, \ ++ { 9, 5}, { 19, 6}, { 15, 7}, { 8, 6}, \ ++ { 17, 7}, { 9, 6}, { 19, 7}, { 10, 6}, \ ++ { 21, 7}, { 17, 8}, { 9, 7}, { 20, 8}, \ ++ { 11, 7}, { 23, 8}, { 13, 9}, { 7, 8}, \ ++ { 21, 9}, { 11, 8}, { 23, 9}, { 15, 8}, \ ++ { 31, 9}, { 19, 8}, { 39, 9}, { 23,10}, \ ++ { 15, 9}, { 39,10}, { 23,11}, { 15,10}, \ ++ { 31, 9}, { 63,10}, { 47,11}, { 2048,12}, \ ++ { 4096,13}, { 8192,14}, { 16384,15}, { 32768,16}, \ ++ { 65536,17}, { 131072,18}, { 262144,19}, { 524288,20}, \ ++ {1048576,21}, {2097152,22}, {4194304,23}, {8388608,24} } ++#define SQR_FFT_TABLE3_SIZE 52 ++#define SQR_FFT_THRESHOLD 1856 ++ ++#define MULLO_BASECASE_THRESHOLD 0 /* always */ ++#define MULLO_DC_THRESHOLD 25 ++#define MULLO_MUL_N_THRESHOLD 5397 ++#define SQRLO_BASECASE_THRESHOLD 0 /* always */ ++#define SQRLO_DC_THRESHOLD 396 ++#define SQRLO_SQR_THRESHOLD 3704 ++ ++#define DC_DIV_QR_THRESHOLD 15 
++#define DC_DIVAPPR_Q_THRESHOLD 50 ++#define DC_BDIV_QR_THRESHOLD 66 ++#define DC_BDIV_Q_THRESHOLD 202 ++ ++#define INV_MULMOD_BNM1_THRESHOLD 46 ++#define INV_NEWTON_THRESHOLD 29 ++#define INV_APPR_THRESHOLD 13 ++ ++#define BINV_NEWTON_THRESHOLD 312 ++#define REDC_1_TO_REDC_2_THRESHOLD 79 ++#define REDC_2_TO_REDC_N_THRESHOLD 0 /* always */ ++ ++#define MU_DIV_QR_THRESHOLD 979 ++#define MU_DIVAPPR_Q_THRESHOLD 979 ++#define MUPI_DIV_QR_THRESHOLD 13 ++#define MU_BDIV_QR_THRESHOLD 942 ++#define MU_BDIV_Q_THRESHOLD 1367 ++ ++#define POWM_SEC_TABLE 3,19,215,1730 ++ ++#define GET_STR_DC_THRESHOLD 10 ++#define GET_STR_PRECOMPUTE_THRESHOLD 15 ++#define SET_STR_DC_THRESHOLD 882 ++#define SET_STR_PRECOMPUTE_THRESHOLD 2520 ++ ++#define FAC_DSC_THRESHOLD 228 ++#define FAC_ODD_THRESHOLD 24 ++ ++#define MATRIX22_STRASSEN_THRESHOLD 19 ++#define HGCD2_DIV1_METHOD 1 ++#define HGCD_THRESHOLD 61 ++#define HGCD_APPR_THRESHOLD 51 ++#define HGCD_REDUCE_THRESHOLD 1962 ++#define GCD_DC_THRESHOLD 217 ++#define GCDEXT_DC_THRESHOLD 263 ++#define JACOBI_BASE_METHOD 4 ++ +-- +2.40.1 diff --git a/SPECS/gmp.spec b/SPECS/gmp.spec new file mode 100644 index 0000000..069f8d7 --- /dev/null +++ b/SPECS/gmp.spec @@ -0,0 +1,739 @@ +# +# Important for %%{ix86}: +# This rpm has to be build on a CPU with sse2 support like Pentium 4 ! +# + +Summary: A GNU arbitrary precision library +Name: gmp +Version: 6.2.0 +Release: 13%{?dist} +Epoch: 1 +URL: http://gmplib.org/ +Source0: ftp://ftp.gmplib.org/pub/gmp-%{version}/gmp-%{version}.tar.bz2 +# or ftp://ftp.gnu.org/pub/gnu/gmp/gmp-%{version}.tar.xz +Source2: gmp.h +Source3: gmp-mparam.h +Patch2: gmp-6.0.0-debuginfo.patch +Patch3: gmp-intel-cet.patch +Patch4: cve-2021-43618.patch +Patch5: ibm_z13_simd_part1.patch +Patch6: ibm_z13_simd_part2.patch +Patch7: ibm_z13_simd_part3.patch +Patch8: ibm_z13_simd_part4.patch +License: LGPLv3+ or GPLv2+ +BuildRequires: autoconf automake libtool +BuildRequires: gcc +BuildRequires: gcc-c++ +BuildRequires: git +#autoreconf on arm needs: +BuildRequires: perl-Carp +# Generate the .hmac checksum unless --without fips is used +%bcond_without fips +%if %{with fips} +BuildRequires: fipscheck +%endif +BuildRequires: make + +%description +The gmp package contains GNU MP, a library for arbitrary precision +arithmetic, signed integers operations, rational numbers and floating +point numbers. GNU MP is designed for speed, for both small and very +large operands. GNU MP is fast because it uses fullwords as the basic +arithmetic type, it uses fast algorithms, it carefully optimizes +assembly code for many CPUs' most common inner loops, and it generally +emphasizes speed over simplicity/elegance in its operations. + +Install the gmp package if you need a fast arbitrary precision +library. + +%package c++ +Summary: C++ bindings for the GNU MP arbitrary precision library +Requires: %{name}%{?_isa} = %{epoch}:%{version}-%{release} + +%description c++ +Bindings for using the GNU MP arbitrary precision library in C++ applications. + +%package devel +Summary: Development tools for the GNU MP arbitrary precision library +Requires: %{name}%{?_isa} = %{epoch}:%{version}-%{release} +Requires: %{name}-c++%{?_isa} = %{epoch}:%{version}-%{release} + +%description devel +The libraries, header files and documentation for using the GNU MP +arbitrary precision library in applications. + +If you want to develop applications which will use the GNU MP library, +you'll need to install the gmp-devel package. You'll also need to +install the gmp package. 
+ +%package static +Summary: Development tools for the GNU MP arbitrary precision library +Requires: %{name}-devel = %{epoch}:%{version}-%{release} + +%description static +The static libraries for using the GNU MP arbitrary precision library +in applications. + +%prep +%autosetup -S git + +# switch the defaults to new cpus on s390x +%ifarch s390x +( cd mpn/s390_64; ln -s z13 s390x ) +%endif + +%build +autoreconf -ifv +if as --help | grep -q execstack; then + # the object files do not require an executable stack + export CCAS="gcc -c -Wa,--noexecstack" +fi + +%ifarch %{ix86} + export CFLAGS=$(echo %{optflags} | sed -e "s/-mtune=[^ ]*//g" | sed -e "s/-march=[^ ]*/-march=i686/g") + export CXXFLAGS=$(echo %{optflags} | sed -e "s/-mtune=[^ ]*//g" | sed -e "s/-march=[^ ]*/-march=i686/g") +%endif + +%configure --enable-cxx --enable-fat + +sed -e 's|^hardcode_libdir_flag_spec=.*|hardcode_libdir_flag_spec=""|g' \ + -e 's|^runpath_var=LD_RUN_PATH|runpath_var=DIE_RPATH_DIE|g' \ + -e 's|-lstdc++ -lm|-lstdc++|' \ + -i libtool +export LD_LIBRARY_PATH=`pwd`/.libs +%make_build + +%if %{with fips} +%define __spec_install_post \ + %{?__debug_package:%{__debug_install_post}} \ + %{__arch_install_post} \ + %{__os_install_post} \ + fipshmac -d $RPM_BUILD_ROOT%{_libdir} $RPM_BUILD_ROOT%{_libdir}/libgmp.so.10.* \ + file=`basename $RPM_BUILD_ROOT%{_libdir}/libgmp.so.10.*.hmac` && \ + mkdir -p $RPM_BUILD_ROOT%{_libdir}/fipscheck && \ + mv $RPM_BUILD_ROOT%{_libdir}/$file $RPM_BUILD_ROOT%{_libdir}/fipscheck/$file && \ + ln -s $file $RPM_BUILD_ROOT%{_libdir}/fipscheck/libgmp.so.10.hmac && \ + cp $RPM_BUILD_ROOT%{_libdir}/fipscheck/$file $RPM_BUILD_ROOT%{_libdir}/.$file && \ + ln -s .$file $RPM_BUILD_ROOT%{_libdir}/.libgmp.so.10.hmac +%{nil} +%endif + +%install +export LD_LIBRARY_PATH=`pwd`/.libs +%make_install +install -m 644 gmp-mparam.h ${RPM_BUILD_ROOT}%{_includedir} +rm -f $RPM_BUILD_ROOT%{_libdir}/lib{gmp,mp,gmpxx}.la +rm -f $RPM_BUILD_ROOT%{_infodir}/dir +/sbin/ldconfig -n $RPM_BUILD_ROOT%{_libdir} +ln -sf libgmpxx.so.4 $RPM_BUILD_ROOT%{_libdir}/libgmpxx.so + +# Rename gmp.h to gmp-.h and gmp-mparam.h to gmp-mparam-.h to +# avoid file conflicts on multilib systems and install wrapper include files +# gmp.h and gmp-mparam-.h +basearch=%{_arch} +# always use i386 for iX86 +%ifarch %{ix86} +basearch=i386 +%endif +# always use arm for arm* +%ifarch %{arm} +basearch=arm +%endif +# superH architecture support +%ifarch sh3 sh4 +basearch=sh +%endif +# Rename files and install wrappers + +mv %{buildroot}/%{_includedir}/gmp.h %{buildroot}/%{_includedir}/gmp-${basearch}.h +install -m644 %{SOURCE2} %{buildroot}/%{_includedir}/gmp.h +mv %{buildroot}/%{_includedir}/gmp-mparam.h %{buildroot}/%{_includedir}/gmp-mparam-${basearch}.h +install -m644 %{SOURCE3} %{buildroot}/%{_includedir}/gmp-mparam.h + + +%check +%ifnarch ppc +export LD_LIBRARY_PATH=`pwd`/.libs +%make_build check +%endif + +%ldconfig_scriptlets + +%ldconfig_scriptlets c++ + +%files +%{!?_licensedir:%global license %%doc} +%license COPYING COPYING.LESSERv3 COPYINGv2 COPYINGv3 +%doc NEWS README +%{_libdir}/libgmp.so.* +%if %{with fips} +%{_libdir}/.libgmp.so.*.hmac +%{_libdir}/fipscheck/libgmp.so.*.hmac +%endif + +%files c++ +%{_libdir}/libgmpxx.so.* + +%files devel +%{_libdir}/libgmp.so +%{_libdir}/libgmpxx.so +%{_libdir}/pkgconfig/gmp.pc +%{_libdir}/pkgconfig/gmpxx.pc +%{_includedir}/*.h +%{_infodir}/gmp.info* + +%files static +%{_libdir}/libgmp.a +%{_libdir}/libgmpxx.a + +%changelog +* Tue Aug 03 2023 Jakub Martisko - 1:6.2.0-13 +- Fix: previous commit removed 
one function from the library and thus broke the ABI +- function gmpn_preinv_divrem_1 should now not be removed +Related: rhbz#2044216 + +* Tue Jul 18 2023 Jakub Martisko - 1:6.2.0-12 +- Add SIMD optimization patches for s390x (provided by the IBM) +Resolves: rhbz#2044216 + +* Tue Jun 06 2023 Jakub Martisko - 1:6.2.0-11 +Fix: Integer overflow and resultant buffer overflow via crafted input +Resolves: CVE-2021-43618 + +* Fri Aug 27 2021 Jakub Martisko - 1:6.2.0-10 +- Add the support for intel CET +Resolves: rhbz#1977890 + +* Wed Aug 18 2021 Jakub Martisko - 1:6.2.0-9 +- Move the .hmac files to the fipscheck subfolder +- Make symlinks from their original location (Fedora contains the .hmac files there) pointing to their new location +Resolves: rhbz#1980758 + +* Mon Aug 09 2021 Mohan Boddu - 1:6.2.0-8 +- Rebuilt for IMA sigs, glibc 2.34, aarch64 flags + Related: rhbz#1991688 + +* Thu Apr 15 2021 Mohan Boddu - 1:6.2.0-7 +- Rebuilt for RHEL 9 BETA on Apr 15th 2021. Related: rhbz#1947937 + +* Tue Jan 26 2021 Fedora Release Engineering - 1:6.2.0-6 +- Rebuilt for https://fedoraproject.org/wiki/Fedora_34_Mass_Rebuild + +* Tue Sep 15 2020 Kalev Lember - 1:6.2.0-5 +- Move gmpxx.pc to -devel subpackage as well + +* Fri Aug 07 2020 Peter Robinson - 1:6.2.0-4 +- The pkgcfg file should be in devel + +* Tue Jul 28 2020 Jakub Martisko - 1:6.2.0-3 +- Use make macros + +* Mon Jul 27 2020 Fedora Release Engineering - 1:6.2.0-2 +- Rebuilt for https://fedoraproject.org/wiki/Fedora_33_Mass_Rebuild + +* Mon Feb 17 2020 Jakub Martisko - 1:6.2.0-1 +- Rebase to 6.2.0 + +* Tue Jan 28 2020 Fedora Release Engineering - 1:6.1.2-13 +- Rebuilt for https://fedoraproject.org/wiki/Fedora_32_Mass_Rebuild + +* Tue Dec 03 2019 Jakub Martisko - 1:6.1.2-12 +- Reenable the fat binaries build option +Resolves: #1779060 + +* Thu Jul 25 2019 Fedora Release Engineering - 1:6.1.2-11 +- Rebuilt for https://fedoraproject.org/wiki/Fedora_31_Mass_Rebuild + +* Fri Feb 15 2019 Anderson Toshiyuki Sasaki - 1:6.1.2-10 +- Create HMAC checksum for FIPS integrity self tests + +* Thu Jan 31 2019 Fedora Release Engineering - 1:6.1.2-9 +- Rebuilt for https://fedoraproject.org/wiki/Fedora_30_Mass_Rebuild + +* Fri Jul 13 2018 Fedora Release Engineering - 1:6.1.2-8 +- Rebuilt for https://fedoraproject.org/wiki/Fedora_29_Mass_Rebuild + +* Wed Feb 07 2018 Fedora Release Engineering - 1:6.1.2-7 +- Rebuilt for https://fedoraproject.org/wiki/Fedora_28_Mass_Rebuild + +* Wed Aug 02 2017 Fedora Release Engineering - 1:6.1.2-6 +- Rebuilt for https://fedoraproject.org/wiki/Fedora_27_Binutils_Mass_Rebuild + +* Wed Jul 26 2017 Fedora Release Engineering - 1:6.1.2-5 +- Rebuilt for https://fedoraproject.org/wiki/Fedora_27_Mass_Rebuild + +* Mon Mar 13 2017 David Kaspar [Dee'Kej] - 1:6.1.2-4 +- Fix the build process for ix89 family + +* Fri Feb 17 2017 David Kaspar [Dee'Kej] - 1:6.1.2-3 +- Build process updated to correctly build .debug_info for i386 + and to correctly use hardening flags + +* Fri Feb 10 2017 Fedora Release Engineering - 1:6.1.2-2 +- Rebuilt for https://fedoraproject.org/wiki/Fedora_26_Mass_Rebuild + +* Tue Dec 20 2016 Frantisek Kluknavsky - 1:6.1.2-1 +- rebase + +* Wed Jun 22 2016 Frantisek Kluknavsky - 1:6.1.1-1 +- rebase + +* Fri Apr 08 2016 Yaakov Selkowitz - 1:6.1.0-3 +- Split c++ subpackage (#1325439) + +* Wed Feb 03 2016 Fedora Release Engineering - 1:6.1.0-2 +- Rebuilt for https://fedoraproject.org/wiki/Fedora_24_Mass_Rebuild + +* Wed Nov 25 2015 Frantisek Kluknavsky - 1:6.1.0-1 +- rebase to 6.1.0 +- gmp-6.0.0-ppc64.patch already upstream, 
dropped + +* Mon Sep 14 2015 Frantisek Kluknavsky - 1:6.0.0-13 +- do not package sse2 variant, use --enable-fat instead (a bit dangerous, some low level routines will be skipped in `make check`) + +* Fri Sep 04 2015 Michal Toman - 1:6.0.0-12 +- Add support for MIPS architecture to gmp.h and gmp-mparam.h + +* Wed Jun 17 2015 Fedora Release Engineering - 1:6.0.0-11 +- Rebuilt for https://fedoraproject.org/wiki/Fedora_23_Mass_Rebuild + +* Sat May 02 2015 Kalev Lember - 1:6.0.0-10 +- Rebuilt for GCC 5 C++11 ABI change + +* Thu Apr 02 2015 Frantisek Kluknavsky - 1:6.0.0-9 +- bug965318 - improve debuginfo of assembler sources + +* Thu Sep 04 2014 Dan Horák - 1:6.0.0-8 +- drop s390x patch, support is already in upstream + +* Sat Aug 16 2014 Fedora Release Engineering - 1:6.0.0-7 +- Rebuilt for https://fedoraproject.org/wiki/Fedora_21_22_Mass_Rebuild + +* Sat Jul 12 2014 Tom Callaway - 1:6.0.0-6 +- fix license handling + +* Thu Jul 10 2014 Brent Baude - 1:6.0.0-5 +- Fix gmp headers for ppc64le (#1083429) + +* Sat Jun 07 2014 Fedora Release Engineering - 1:6.0.0-4 +- Rebuilt for https://fedoraproject.org/wiki/Fedora_21_Mass_Rebuild + +* Thu Apr 24 2014 Karsten Hopp 6.0.0-3 +- set default for BMOD_1_TO_MOD_1_THRESHOLD on ppc64, patch by + Torbjorn Granlund: + https://gmplib.org/repo/gmp/rev/4a6d258b467f + +* Mon Apr 14 2014 Frantisek Kluknavsky - 1:6.0.0-2 +- rebase + +* Wed Nov 06 2013 Frantisek Kluknavsky - 1:5.1.3-2 +- support for aarch64 + +* Wed Nov 06 2013 Frantisek Kluknavsky - 1:5.1.3-1 +- rebase to 5.1.3 + +* Sat Aug 03 2013 Fedora Release Engineering - 1:5.1.2-2 +- Rebuilt for https://fedoraproject.org/wiki/Fedora_20_Mass_Rebuild + +* Thu May 30 2013 Frantisek Kluknavsky - 1:5.1.2-1 +- rebase to 5.1.2 + +* Thu Mar 28 2013 Frantisek Kluknavsky - 1:5.1.1-3 +- added build dependency needed to autoreconf on arm + +* Thu Feb 14 2013 Frantisek Kluknavsky - 1:5.1.1-2 +- rebase to 5.1.1 +- deleted unapplicable part of gmp-4.0.1-s390.patch + +* Fri Jan 25 2013 Frantisek Kluknavsky - 1:5.1.0-1 +- rebase to 5.1.0, de-ansi patch no longer applicable +- upstream dropped libmp.so (bsdmp-like interface) +- silenced bogus date in changelog + +* Tue Jan 22 2013 Peter Robinson 1:5.0.5-6 +- Rebuild against new binutils to fix FTBFS on ARM + +* Fri Nov 23 2012 Frantisek Kluknavsky - 1:5.0.5-5 +- minor spec cleanup + +* Fri Jul 20 2012 Peter Schiffer 1:5.0.5-3 +- fixed FTBFS + +* Thu Jul 19 2012 Fedora Release Engineering - 1:5.0.5-2 +- Rebuilt for https://fedoraproject.org/wiki/Fedora_18_Mass_Rebuild + +* Mon Jun 25 2012 Peter Schiffer 1:5.0.5-1 +- resolves: #820897 + update to 5.0.5 + +* Thu Apr 19 2012 Peter Schiffer 1:5.0.4-1 +- resolves: #785116 + update to 5.0.4 + +* Tue Feb 28 2012 Fedora Release Engineering - 1:5.0.2-6 +- Rebuilt for c++ ABI breakage + +* Thu Jan 19 2012 Peter Schiffer 1:5.0.2-5 +- fixed FTBFS with gcc 4.7 on 32bit arch + +* Fri Jan 13 2012 Fedora Release Engineering - 1:5.0.2-4 +- Rebuilt for https://fedoraproject.org/wiki/Fedora_17_Mass_Rebuild + +* Fri Oct 14 2011 Peter Schiffer 1:5.0.2-3 +- removed old compatibility library + +* Mon Sep 26 2011 Peter Schiffer 1:5.0.2-2 +- temporary build wild old compatibility library version + +* Tue Sep 20 2011 Peter Schiffer 1:5.0.2-1 +- resolves: #702919 + update to 5.0.2 +- resolves: #738091 + removed unused direct shlib dependency on libm + updated license in gmp.h and gmp-mparam.h files + +* Mon Jun 13 2011 Ivana Hutarova Varekova 1:4.3.2-4 +- Resolves: #706374 + fix sse2/libgmp.so.3.5.2 debuginfo data + +* Tue Feb 08 2011 Fedora Release 
Engineering - 1:4.3.2-3 +- Rebuilt for https://fedoraproject.org/wiki/Fedora_15_Mass_Rebuild + +* Wed Nov 24 2010 Ivana Hutarova Varekova 1:4.3.2-2 +- fix Requires tag + +* Wed Nov 24 2010 Ivana Hutarova Varekova 1:4.3.2-1 +- downgrade from 5.0.1 to 4.3.2 + +* Mon May 24 2010 Ivana Hutarova Varekova 5.0.1-1 +- update to 5.0.1 + +* Tue Mar 2 2010 Ivana Hutarova Varekova 4.3.1-7 +- fix the license tag + +* Fri Nov 27 2009 Ivana Hutarova Varekova 4.3.1-6 +- remove unnecessary dependences + remove duplicated documentation + +* Mon Aug 10 2009 Ivana Varekova 4.3.1-5 +- fix installation with --excludedocs option (#515947) + +* Fri Jul 24 2009 Fedora Release Engineering - 4.3.1-4 +- Rebuilt for https://fedoraproject.org/wiki/Fedora_12_Mass_Rebuild + +* Wed Jun 17 2009 Ivana Varekova 4.3.1-3 +- rebuild + +* Mon Jun 15 2009 Ivana Varekova 4.3.1-2 +- Resolves: #505592 + add RPM_OPT_FLAGS + +* Thu May 28 2009 Ivana Varekova 4.3.1-1 +- update to 4.3.1 +- remove configure macro (built problem) + +* Thu Apr 09 2009 Dennis Gilmore - 4.2.4-6 +- no check that --host and --target are the same when building i586 or sparcv9 they are not + +* Tue Feb 24 2009 Fedora Release Engineering - 4.2.4-5 +- Rebuilt for https://fedoraproject.org/wiki/Fedora_11_Mass_Rebuild + +* Tue Dec 23 2008 Ivana Varekova 4.2.4-4 +- fix spec file + +* Mon Dec 8 2008 Ivana Varekova 4.2.4-3 +- remove useless option (#475073) + +* Wed Dec 3 2008 Stepan Kasal 4.2.4-2 +- Run full autoreconf, add automake to BuildRequires. + +* Mon Nov 10 2008 Ivana Varekova 4.2.4-1 +- update to 4.2.4 + +* Fri Nov 7 2008 Ivana Varekova 4.2.2-9 +- remove useless patch (#470200) + +* Thu Apr 24 2008 Tom "spot" Callaway 4.2.2-8 +- add sparc/sparc64 support + +* Wed Mar 19 2008 Ivana Varekova 4.2.2-7 +- add superH support (#437688) + +* Wed Feb 13 2008 Ivana varekova 4.2.2-6 +- fix gcc-4.3 problem - add (#432336) + +* Fri Feb 8 2008 Ivana Varekova 4.2.2-5 +- split the devel subpackage to devel and static parts + +* Thu Feb 7 2008 Ivana Varekova 4.2.2-4 +- change license tag + +* Mon Sep 24 2007 Ivana Varekova 4.2.2-3 +- fix libgmpxx.so link + +* Thu Sep 20 2007 Ivana Varekova 4.2.2-2 +- fix check tag + +* Wed Sep 19 2007 Ivana Varekova 4.2.2-1 +- update to 4.2.2 + +* Mon Aug 20 2007 Ivana Varekova 4.2.1-3 +- spec file cleanup (#253439) + +* Tue Aug 7 2007 Ivana Varekova 4.2.1-2 +- add arm support (#245456) + thanks to Lennert Buytenhek + +* Mon Aug 6 2007 Ivana Varekova 4.2.1-1 +- update to 4.2.1 +- do some spec cleanups +- fix 238794 - gmp-devel depends on {version} but not on + {version}-{release} +- remove mpfr (moved to separate package) + +* Thu Jul 05 2007 Florian La Roche 4.1.4-13 +- don't fail scripts to e.g. allow excludedocs installs + +* Tue Apr 24 2007 Karsten Hopp 4.1.4-12.3 +- fix library permissions + +* Wed Mar 14 2007 Karsten Hopp 4.1.4-12.2 +- fix typo + +* Wed Mar 14 2007 Thomas Woerner 4.1.4-12.1 +- added alpha support for gmp.h and gmp-mparam.h wrappers + +* Fri Feb 23 2007 Karsten Hopp 4.1.4-12 +- remove trailing dot from summary +- fix buildroot +- fix post/postun/... requirements +- use make install DESTDIR=... 
+- replace tabs with spaces +- convert changelog to utf-8 + +* Wed Jan 17 2007 Jakub Jelinek 4.1.4-11 +- make sure libmpfr.a doesn't contain SSE2 instructions on i?86 (#222371) +- rebase to mpfr 2.2.1 from 2.2.0 + cumulative fixes + +* Thu Nov 2 2006 Thomas Woerner 4.1.4-10 +- fixed arch order in gmp.h and gmp-mparam.h wrapper for all architectures + +* Thu Nov 2 2006 Joe Orton 4.1.4-10 +- include ppc64 header on ppc64 not ppc header + +* Fri Oct 27 2006 Thomas Woerner - 4.1.4-9 +- fixed multilib devel conflicts for gmp (#212286) + +* Thu Oct 26 2006 Jakub Jelinek - 4.1.4-8 +- upgrade mpfr to 2.2.0 (#211971) +- apply mpfr 2.2.0 cumulative patch + +* Fri Jul 14 2006 Thomas Woerner - 4.1.4-7 +- release bump + +* Fri Feb 10 2006 Jesse Keating - 4.1.4-6.2.1 +- bump again for double-long bug on ppc(64) + +* Tue Feb 07 2006 Jesse Keating - 4.1.4-6.2 +- rebuilt for new gcc4.1 snapshot and glibc changes + +* Fri Dec 09 2005 Jesse Keating +- rebuilt + +* Mon Apr 18 2005 Thomas Woerner 4.1.4-6 +- fixed __setfpucw call in mpfr-test.h + +* Wed Mar 02 2005 Karsten Hopp 4.1.4-5 +- build with gcc-4 + +* Wed Feb 09 2005 Karsten Hopp 4.1.4-4 +- rebuilt + +* Sun Sep 26 2004 Florian La Roche +- 4.1.4 +- disable ppc64 patch, now fixed upstream + +* Tue Jun 15 2004 Elliot Lee +- rebuilt + +* Mon May 24 2004 Thomas Woerner 4.1.3-1 +- new version 4.1.3 + +* Wed Mar 31 2004 Thomas Woerner 4.1.2-14 +- dropped RPATH (#118506) + +* Sat Mar 06 2004 Florian La Roche +- also build SSE2 DSOs, patch from Ulrich Drepper + +* Tue Mar 02 2004 Elliot Lee +- rebuilt + +* Fri Feb 13 2004 Elliot Lee +- rebuilt + +* Thu Jan 29 2004 Thomas Woerner 4.1.2-11 +- BuildRequires for automake16 + +* Mon Dec 01 2003 Florian La Roche +- fix symlink to libgmpxx.so.3 #111135 +- add patch to factorize.c from gmp homepage + +* Thu Oct 23 2003 Joe Orton 4.1.2-9 +- build with -Wa,--noexecstack + +* Thu Oct 23 2003 Joe Orton 4.1.2-8 +- build assembly code with -Wa,--execstack +- use parallel make +- run tests, and fix C++ therein + +* Thu Oct 02 2003 Florian La Roche +- enable mpfr #104395 +- enable cxx #80195 +- add COPYING.LIB +- add fixes from gmp web-site +- remove some cruft patches for older libtool releases + +* Wed Jun 04 2003 Elliot Lee +- rebuilt + +* Tue Jun 03 2003 Florian La Roche +- make configure.in work with newer autoconf + +* Sun Jun 01 2003 Florian La Roche +- do not set extra_functions for s390x #92001 + +* Thu Feb 13 2003 Elliot Lee 4.1.2-3 +- Add ppc64 patch, accompanied by running auto* + +* Wed Jan 22 2003 Tim Powers +- rebuilt + +* Wed Jan 01 2003 Florian La Roche +- update to 4.1.2 + +* Tue Dec 03 2002 Florian La Roche +- update to 4.1.1 +- remove un-necessary patches +- adjust s390/x86_64 patch + +* Sun Oct 06 2002 Florian La Roche +- add s390x patch +- disable current x86-64 support in longlong.h + +* Mon Jul 8 2002 Trond Eivind Glomsrød 4.1-4 +- Add 4 patches, among them one for #67918 +- Update URL +- s/Copyright/License/ + +* Mon Jul 8 2002 Trond Eivind Glomsrød 4.1-3 +- Redefine the configure macro, the included configure + script isn't happy about the rpm default one (#68190). Also, make + sure the included libtool isn't replaced, + +* Fri Jun 21 2002 Tim Powers +- automated rebuild + +* Sat May 25 2002 Florian La Roche +- update to version 4.1 +- patch s390 gmp-mparam.h to match other archs. 
+ +* Thu May 23 2002 Tim Powers +- automated rebuild + +* Mon Mar 11 2002 Trond Eivind Glomsrød 4.0.1-3 +- Use standard %%configure macro and edit %%{_tmppath} + +* Tue Feb 26 2002 Trond Eivind Glomsrød 4.0.1-2 +- Rebuild + +* Tue Jan 22 2002 Florian La Roche +- update to 4.0.1 +- bzip2 src + +* Wed Jan 09 2002 Tim Powers +- automated rebuild + +* Sun Jun 24 2001 Elliot Lee +- Bump release + rebuild. + +* Mon Feb 05 2001 Philipp Knirsch +- Fixed bugzilla bug #25515 where GMP wouldn't work on IA64 as IA64 is not +correctly identified as a 64 bit platform. + +* Mon Dec 18 2000 Preston Brown +- include bsd mp library + +* Tue Oct 17 2000 Florian La Roche +- update to 3.1.1 + +* Sun Sep 3 2000 Florian La Roche +- update to 3.1 + +* Sat Aug 19 2000 Preston Brown +- devel subpackage depends on main package so that .so symlink is OK. + +* Thu Jul 13 2000 Prospector +- automatic rebuild + +* Sat Jun 3 2000 Nalin Dahyabhai +- switch to the configure and makeinstall macros +- FHS-compliance fixing +- move docs to non-devel package + +* Fri Apr 28 2000 Bill Nottingham +- libtoolize for ia64 + +* Fri Apr 28 2000 Florian La Roche +- update to 3.0.1 + +* Thu Apr 27 2000 Jakub Jelinek +- sparc64 fixes for 3.0 + +* Wed Apr 26 2000 Florian La Roche +- update to 3.0 + +* Mon Feb 14 2000 Matt Wilson +- #include in files that use string functions + +* Wed Feb 02 2000 Cristian Gafton +- fix description and summary + +* Mon Dec 06 1999 Michael K. Johnson +- s/GPL/LGPL/ +- build as non-root (#7604) + +* Mon Sep 06 1999 Jakub Jelinek +- merge in some debian gmp fixes +- Ulrich Drepper's __gmp_scale2 fix +- my mpf_set_q fix +- sparc64 fixes + +* Wed Apr 28 1999 Cristian Gafton +- add sparc patch for PIC handling + +* Sun Mar 21 1999 Cristian Gafton +- auto rebuild in the new build environment (release 8) + +* Thu Feb 11 1999 Michael Johnson +- include the private header file gmp-mparam.h because several + apps seem to assume that they are building against the gmp + source tree and require it. Sigh. + +* Tue Jan 12 1999 Michael K. Johnson +- libtoolize to work on arm + +* Thu Sep 10 1998 Cristian Gafton +- yet another touch of the spec file + +* Wed Sep 2 1998 Michael Fulbright +- looked over before inclusion in RH 5.2 + +* Sun May 24 1998 Dick Porter +- Patch Makefile.in, not Makefile +- Don't specify i586, let configure decide the arch + +* Sat Jan 24 1998 Marc Ewing +- started with package from Toshio Kuratomi +- cleaned up file list +- fixed up install-info support +