commit dd8f4b760d0f4f883031e7a3c6926240b43046d1
Author: MSVSphere Packaging Team
Date:   Fri Sep 22 17:43:45 2023 +0300

    import gmp-6.2.0-13.el9

diff --git a/.gitignore b/.gitignore
new file mode 100644
index 0000000..024739e
--- /dev/null
+++ b/.gitignore
@@ -0,0 +1 @@
+SOURCES/gmp-6.2.0.tar.bz2
diff --git a/.gmp.metadata b/.gmp.metadata
new file mode 100644
index 0000000..062ceb6
--- /dev/null
+++ b/.gmp.metadata
@@ -0,0 +1 @@
+5e9341d3807bc7505376f9ed9f5c1c6c57050aa6 SOURCES/gmp-6.2.0.tar.bz2
diff --git a/SOURCES/cve-2021-43618.patch b/SOURCES/cve-2021-43618.patch
new file mode 100644
index 0000000..f741972
--- /dev/null
+++ b/SOURCES/cve-2021-43618.patch
@@ -0,0 +1,25 @@
+
+# HG changeset patch
+# User Marco Bodrato
+# Date 1634836009 -7200
+# Node ID 561a9c25298e17bb01896801ff353546c6923dbd
+# Parent  e1fd9db13b475209a864577237ea4b9105b3e96e
+mpz/inp_raw.c: Avoid bit size overflows
+
+diff -r e1fd9db13b47 -r 561a9c25298e mpz/inp_raw.c
+--- a/mpz/inp_raw.c	Tue Dec 22 23:49:51 2020 +0100
++++ b/mpz/inp_raw.c	Thu Oct 21 19:06:49 2021 +0200
+@@ -88,8 +88,11 @@
+ 
+   abs_csize = ABS (csize);
+ 
++  if (UNLIKELY (abs_csize > ~(mp_bitcnt_t) 0 / 8))
++    return 0; /* Bit size overflows */
++
+   /* round up to a multiple of limbs */
+-  abs_xsize = BITS_TO_LIMBS (abs_csize*8);
++  abs_xsize = BITS_TO_LIMBS ((mp_bitcnt_t) abs_csize * 8);
+ 
+   if (abs_xsize != 0)
+     {
+
diff --git a/SOURCES/gmp-6.0.0-debuginfo.patch b/SOURCES/gmp-6.0.0-debuginfo.patch
new file mode 100644
index 0000000..bb72839
--- /dev/null
+++ b/SOURCES/gmp-6.0.0-debuginfo.patch
@@ -0,0 +1,21 @@
+diff -up wrk/mpn/m4-ccas.wrk wrk/mpn/m4-ccas
+--- wrk/mpn/m4-ccas.wrk	2015-04-02 16:44:03.645305407 +0200
++++ wrk/mpn/m4-ccas	2015-04-02 16:21:57.893870969 +0200
+@@ -104,4 +104,4 @@ echo "$CC"
+ $CC || exit
+ 
+ # Comment this out to preserve .s intermediates
+-rm -f $TMP
++#rm -f $TMP
+diff -up wrk/mpn/Makeasm.am.wrk wrk/mpn/Makeasm.am
+--- wrk/mpn/Makeasm.am.wrk	2015-04-02 16:42:41.692278742 +0200
++++ wrk/mpn/Makeasm.am	2015-04-02 16:21:57.891870945 +0200
+@@ -66,7 +66,7 @@ SUFFIXES = .s .S .asm
+ 
+ 
+ # can be overridden during development, eg. "make RM_TMP=: mul_1.lo"
+-RM_TMP = rm -f
++RM_TMP = true
+ 
+ 
+ # .S assembler, preprocessed with cpp.
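The mpz/inp_raw.c change above (CVE-2021-43618) guards the byte-count-to-bit-count conversion: in the old code, a crafted stream could declare a size large enough that abs_csize*8 wrapped around in mp_bitcnt_t, so BITS_TO_LIMBS computed too few limbs. The standalone C sketch below is illustration only, not part of the patch; the bytes_to_bits helper is hypothetical and unsigned long stands in for GMP's mp_bitcnt_t, but the guard is the same one the patch adds before the multiplication.

#include <stdio.h>

/* Stand-in for GMP's mp_bitcnt_t (an unsigned integer type wide enough
   to count the bits of an operand). */
typedef unsigned long bitcnt_t;

/* Hypothetical helper mirroring the patched check in mpz/inp_raw.c:
   refuse byte counts whose bit count would not fit in bitcnt_t. */
static bitcnt_t
bytes_to_bits (size_t nbytes)
{
  if (nbytes > ~(bitcnt_t) 0 / 8)   /* nbytes * 8 would wrap around */
    return 0;                       /* caller treats 0 as "reject input" */
  return (bitcnt_t) nbytes * 8;     /* cast first, then multiply */
}

int
main (void)
{
  printf ("%lu\n", bytes_to_bits (1000));         /* 8000 */
  printf ("%lu\n", bytes_to_bits ((size_t) -1));  /* huge count: 0 (rejected) on LP64 */
  return 0;
}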
diff --git a/SOURCES/gmp-intel-cet.patch b/SOURCES/gmp-intel-cet.patch new file mode 100644 index 0000000..62e790d --- /dev/null +++ b/SOURCES/gmp-intel-cet.patch @@ -0,0 +1,3518 @@ +From 2db44789f76e93fb641fc0ced30f35c163ab89ba Mon Sep 17 00:00:00 2001 +From: rpm-build +Date: Tue, 17 Aug 2021 15:33:21 +0200 +Subject: [PATCH] Intel CET: Define the base macros + +--- + acinclude.m4 | 100 +++++++++++++++++++++++++ + configure.ac | 1 + + mpn/x86/aors_n.asm | 5 +- + mpn/x86/aorsmul_1.asm | 1 + + mpn/x86/atom/sse2/aorsmul_1.asm | 1 + + mpn/x86/atom/sse2/mul_basecase.asm | 1 + + mpn/x86/atom/sse2/sqr_basecase.asm | 1 + + mpn/x86/bdiv_dbm1c.asm | 1 + + mpn/x86/copyd.asm | 1 + + mpn/x86/copyi.asm | 1 + + mpn/x86/divrem_1.asm | 1 + + mpn/x86/divrem_2.asm | 1 + + mpn/x86/k6/aors_n.asm | 1 + + mpn/x86/k6/aorsmul_1.asm | 1 + + mpn/x86/k6/divrem_1.asm | 1 + + mpn/x86/k6/k62mmx/copyd.asm | 1 + + mpn/x86/k6/k62mmx/lshift.asm | 1 + + mpn/x86/k6/k62mmx/rshift.asm | 1 + + mpn/x86/k6/mmx/com.asm | 1 + + mpn/x86/k6/mmx/logops_n.asm | 1 + + mpn/x86/k6/mmx/lshift.asm | 1 + + mpn/x86/k6/mmx/popham.asm | 1 + + mpn/x86/k6/mmx/rshift.asm | 1 + + mpn/x86/k6/mod_34lsub1.asm | 1 + + mpn/x86/k6/mul_1.asm | 1 + + mpn/x86/k6/mul_basecase.asm | 1 + + mpn/x86/k6/pre_mod_1.asm | 1 + + mpn/x86/k6/sqr_basecase.asm | 1 + + mpn/x86/k7/aors_n.asm | 1 + + mpn/x86/k7/mmx/com.asm | 1 + + mpn/x86/k7/mmx/copyd.asm | 1 + + mpn/x86/k7/mmx/copyi.asm | 1 + + mpn/x86/k7/mmx/divrem_1.asm | 1 + + mpn/x86/k7/mmx/lshift.asm | 1 + + mpn/x86/k7/mmx/popham.asm | 1 + + mpn/x86/k7/mmx/rshift.asm | 1 + + mpn/x86/k7/mod_1_1.asm | 1 + + mpn/x86/k7/mod_1_4.asm | 1 + + mpn/x86/k7/mod_34lsub1.asm | 1 + + mpn/x86/k7/mul_basecase.asm | 1 + + mpn/x86/k7/sqr_basecase.asm | 1 + + mpn/x86/lshift.asm | 1 + + mpn/x86/mmx/sec_tabselect.asm | 1 + + mpn/x86/mod_34lsub1.asm | 1 + + mpn/x86/mul_1.asm | 1 + + mpn/x86/mul_basecase.asm | 1 + + mpn/x86/p6/aors_n.asm | 3 +- + mpn/x86/p6/aorsmul_1.asm | 3 +- + mpn/x86/p6/copyd.asm | 1 + + mpn/x86/p6/gcd_11.asm | 1 + + mpn/x86/p6/lshsub_n.asm | 3 +- + mpn/x86/p6/mmx/divrem_1.asm | 1 + + mpn/x86/p6/mod_34lsub1.asm | 1 + + mpn/x86/p6/mul_basecase.asm | 3 +- + mpn/x86/p6/sqr_basecase.asm | 3 +- + mpn/x86/pentium/aors_n.asm | 1 + + mpn/x86/pentium/aorsmul_1.asm | 1 + + mpn/x86/pentium/com.asm | 1 + + mpn/x86/pentium/copyd.asm | 1 + + mpn/x86/pentium/copyi.asm | 1 + + mpn/x86/pentium/logops_n.asm | 1 + + mpn/x86/pentium/lshift.asm | 1 + + mpn/x86/pentium/mmx/lshift.asm | 1 + + mpn/x86/pentium/mmx/mul_1.asm | 1 + + mpn/x86/pentium/mmx/rshift.asm | 1 + + mpn/x86/pentium/mod_34lsub1.asm | 1 + + mpn/x86/pentium/mul_1.asm | 1 + + mpn/x86/pentium/mul_2.asm | 1 + + mpn/x86/pentium/mul_basecase.asm | 1 + + mpn/x86/pentium/rshift.asm | 1 + + mpn/x86/pentium/sqr_basecase.asm | 1 + + mpn/x86/pentium4/copyd.asm | 1 + + mpn/x86/pentium4/copyi.asm | 1 + + mpn/x86/pentium4/mmx/popham.asm | 1 + + mpn/x86/pentium4/sse2/add_n.asm | 1 + + mpn/x86/pentium4/sse2/addlsh1_n.asm | 1 + + mpn/x86/pentium4/sse2/addmul_1.asm | 1 + + mpn/x86/pentium4/sse2/cnd_add_n.asm | 1 + + mpn/x86/pentium4/sse2/cnd_sub_n.asm | 1 + + mpn/x86/pentium4/sse2/divrem_1.asm | 1 + + mpn/x86/pentium4/sse2/mod_1_1.asm | 1 + + mpn/x86/pentium4/sse2/mod_1_4.asm | 1 + + mpn/x86/pentium4/sse2/mod_34lsub1.asm | 1 + + mpn/x86/pentium4/sse2/mul_1.asm | 1 + + mpn/x86/pentium4/sse2/mul_basecase.asm | 1 + + mpn/x86/pentium4/sse2/rsh1add_n.asm | 1 + + mpn/x86/pentium4/sse2/sqr_basecase.asm | 1 + + mpn/x86/pentium4/sse2/sub_n.asm | 1 + + mpn/x86/pentium4/sse2/submul_1.asm | 1 + + 
mpn/x86/rshift.asm | 1 + + mpn/x86/sec_tabselect.asm | 1 + + mpn/x86/sqr_basecase.asm | 1 + + mpn/x86/udiv.asm | 1 + + mpn/x86/umul.asm | 1 + + mpn/x86/x86-defs.m4 | 7 +- + mpn/x86_64/addaddmul_1msb0.asm | 1 + + mpn/x86_64/aorrlsh1_n.asm | 1 + + mpn/x86_64/aorrlshC_n.asm | 1 + + mpn/x86_64/aorrlsh_n.asm | 1 + + mpn/x86_64/aors_err1_n.asm | 1 + + mpn/x86_64/aors_err2_n.asm | 1 + + mpn/x86_64/aors_err3_n.asm | 1 + + mpn/x86_64/aors_n.asm | 1 + + mpn/x86_64/aorsmul_1.asm | 1 + + mpn/x86_64/atom/addmul_2.asm | 1 + + mpn/x86_64/atom/aorrlsh1_n.asm | 1 + + mpn/x86_64/atom/aorrlsh2_n.asm | 1 + + mpn/x86_64/atom/lshift.asm | 1 + + mpn/x86_64/atom/lshiftc.asm | 1 + + mpn/x86_64/atom/mul_2.asm | 1 + + mpn/x86_64/atom/rsh1aors_n.asm | 1 + + mpn/x86_64/atom/rshift.asm | 1 + + mpn/x86_64/atom/sublsh1_n.asm | 1 + + mpn/x86_64/bd1/addmul_2.asm | 1 + + mpn/x86_64/bd1/hamdist.asm | 1 + + mpn/x86_64/bd1/mul_2.asm | 1 + + mpn/x86_64/bd1/mul_basecase.asm | 1 + + mpn/x86_64/bd1/popcount.asm | 1 + + mpn/x86_64/bd2/gcd_11.asm | 1 + + mpn/x86_64/bd2/gcd_22.asm | 1 + + mpn/x86_64/bd4/gcd_11.asm | 1 + + mpn/x86_64/bdiv_dbm1c.asm | 1 + + mpn/x86_64/bdiv_q_1.asm | 1 + + mpn/x86_64/bt1/aors_n.asm | 1 + + mpn/x86_64/bt1/aorsmul_1.asm | 1 + + mpn/x86_64/bt1/copyd.asm | 1 + + mpn/x86_64/bt1/copyi.asm | 1 + + mpn/x86_64/bt1/gcd_11.asm | 1 + + mpn/x86_64/bt1/mul_1.asm | 1 + + mpn/x86_64/bt1/mul_basecase.asm | 1 + + mpn/x86_64/bt1/sqr_basecase.asm | 1 + + mpn/x86_64/cnd_aors_n.asm | 1 + + mpn/x86_64/com.asm | 1 + + mpn/x86_64/copyd.asm | 1 + + mpn/x86_64/copyi.asm | 1 + + mpn/x86_64/core2/aors_err1_n.asm | 1 + + mpn/x86_64/core2/aors_n.asm | 1 + + mpn/x86_64/core2/aorsmul_1.asm | 1 + + mpn/x86_64/core2/divrem_1.asm | 1 + + mpn/x86_64/core2/gcd_11.asm | 1 + + mpn/x86_64/core2/gcd_22.asm | 1 + + mpn/x86_64/core2/hamdist.asm | 1 + + mpn/x86_64/core2/logops_n.asm | 1 + + mpn/x86_64/core2/lshift.asm | 1 + + mpn/x86_64/core2/lshiftc.asm | 1 + + mpn/x86_64/core2/mul_basecase.asm | 5 ++ + mpn/x86_64/core2/mullo_basecase.asm | 1 + + mpn/x86_64/core2/popcount.asm | 1 + + mpn/x86_64/core2/rsh1aors_n.asm | 1 + + mpn/x86_64/core2/rshift.asm | 1 + + mpn/x86_64/core2/sqr_basecase.asm | 1 + + mpn/x86_64/core2/sublshC_n.asm | 1 + + mpn/x86_64/coreibwl/addmul_1.asm | 24 ++++-- + mpn/x86_64/coreibwl/mul_1.asm | 24 ++++-- + mpn/x86_64/coreibwl/mul_basecase.asm | 47 ++++++++---- + mpn/x86_64/coreibwl/mullo_basecase.asm | 1 + + mpn/x86_64/coreibwl/sqr_basecase.asm | 49 ++++++++---- + mpn/x86_64/coreihwl/addmul_2.asm | 1 + + mpn/x86_64/coreihwl/aors_n.asm | 1 + + mpn/x86_64/coreihwl/aorsmul_1.asm | 1 + + mpn/x86_64/coreihwl/gcd_22.asm | 1 + + mpn/x86_64/coreihwl/mul_2.asm | 1 + + mpn/x86_64/coreihwl/mul_basecase.asm | 1 + + mpn/x86_64/coreihwl/mullo_basecase.asm | 1 + + mpn/x86_64/coreihwl/redc_1.asm | 1 + + mpn/x86_64/coreihwl/sqr_basecase.asm | 1 + + mpn/x86_64/coreinhm/aorrlsh_n.asm | 1 + + mpn/x86_64/coreinhm/hamdist.asm | 1 + + mpn/x86_64/coreinhm/popcount.asm | 1 + + mpn/x86_64/coreisbr/addmul_2.asm | 1 + + mpn/x86_64/coreisbr/aorrlshC_n.asm | 1 + + mpn/x86_64/coreisbr/aorrlsh_n.asm | 1 + + mpn/x86_64/coreisbr/aors_n.asm | 1 + + mpn/x86_64/coreisbr/cnd_add_n.asm | 1 + + mpn/x86_64/coreisbr/cnd_sub_n.asm | 1 + + mpn/x86_64/coreisbr/mul_1.asm | 1 + + mpn/x86_64/coreisbr/mul_2.asm | 1 + + mpn/x86_64/coreisbr/mul_basecase.asm | 1 + + mpn/x86_64/coreisbr/mullo_basecase.asm | 1 + + mpn/x86_64/coreisbr/rsh1aors_n.asm | 1 + + mpn/x86_64/coreisbr/sqr_basecase.asm | 1 + + mpn/x86_64/div_qr_1n_pi1.asm | 1 + + mpn/x86_64/div_qr_2n_pi1.asm | 1 + + 
mpn/x86_64/div_qr_2u_pi1.asm | 1 + + mpn/x86_64/dive_1.asm | 1 + + mpn/x86_64/divrem_1.asm | 1 + + mpn/x86_64/divrem_2.asm | 1 + + mpn/x86_64/fastavx/copyd.asm | 1 + + mpn/x86_64/fastavx/copyi.asm | 1 + + mpn/x86_64/fastsse/com-palignr.asm | 1 + + mpn/x86_64/fastsse/com.asm | 1 + + mpn/x86_64/fastsse/copyd-palignr.asm | 1 + + mpn/x86_64/fastsse/copyd.asm | 1 + + mpn/x86_64/fastsse/copyi-palignr.asm | 1 + + mpn/x86_64/fastsse/copyi.asm | 1 + + mpn/x86_64/fastsse/lshift-movdqu2.asm | 1 + + mpn/x86_64/fastsse/lshift.asm | 1 + + mpn/x86_64/fastsse/lshiftc-movdqu2.asm | 1 + + mpn/x86_64/fastsse/lshiftc.asm | 1 + + mpn/x86_64/fastsse/rshift-movdqu2.asm | 1 + + mpn/x86_64/fastsse/sec_tabselect.asm | 1 + + mpn/x86_64/fat/fat_entry.asm | 1 + + mpn/x86_64/gcd_11.asm | 1 + + mpn/x86_64/gcd_22.asm | 1 + + mpn/x86_64/k10/gcd_22.asm | 1 + + mpn/x86_64/k10/hamdist.asm | 1 + + mpn/x86_64/k10/popcount.asm | 5 +- + mpn/x86_64/k8/addmul_2.asm | 1 + + mpn/x86_64/k8/aorrlsh_n.asm | 1 + + mpn/x86_64/k8/bdiv_q_1.asm | 1 + + mpn/x86_64/k8/div_qr_1n_pi1.asm | 1 + + mpn/x86_64/k8/mul_basecase.asm | 8 ++ + mpn/x86_64/k8/mullo_basecase.asm | 12 ++- + mpn/x86_64/k8/mulmid_basecase.asm | 9 +++ + mpn/x86_64/k8/redc_1.asm | 18 +++-- + mpn/x86_64/k8/sqr_basecase.asm | 18 +++-- + mpn/x86_64/logops_n.asm | 1 + + mpn/x86_64/lshift.asm | 1 + + mpn/x86_64/lshiftc.asm | 1 + + mpn/x86_64/lshsub_n.asm | 1 + + mpn/x86_64/missing.asm | 1 + + mpn/x86_64/mod_1_2.asm | 1 + + mpn/x86_64/mod_1_4.asm | 1 + + mpn/x86_64/mod_34lsub1.asm | 28 ++++--- + mpn/x86_64/mode1o.asm | 1 + + mpn/x86_64/mul_1.asm | 1 + + mpn/x86_64/mul_2.asm | 1 + + mpn/x86_64/nano/dive_1.asm | 1 + + mpn/x86_64/pentium4/aors_n.asm | 1 + + mpn/x86_64/pentium4/mod_34lsub1.asm | 1 + + mpn/x86_64/pentium4/rsh1aors_n.asm | 1 + + mpn/x86_64/pentium4/rshift.asm | 1 + + mpn/x86_64/popham.asm | 1 + + mpn/x86_64/rsh1aors_n.asm | 1 + + mpn/x86_64/rshift.asm | 1 + + mpn/x86_64/sec_tabselect.asm | 1 + + mpn/x86_64/sqr_diag_addlsh1.asm | 1 + + mpn/x86_64/sublsh1_n.asm | 1 + + mpn/x86_64/x86_64-defs.m4 | 6 +- + mpn/x86_64/zen/aorrlsh_n.asm | 25 +++++-- + mpn/x86_64/zen/mul_basecase.asm | 1 + + mpn/x86_64/zen/mullo_basecase.asm | 1 + + mpn/x86_64/zen/sbpi1_bdiv_r.asm | 1 + + mpn/x86_64/zen/sqr_basecase.asm | 1 + + 244 files changed, 537 insertions(+), 90 deletions(-) + +diff --git a/acinclude.m4 b/acinclude.m4 +index 86175ce..84e880b 100644 +--- a/acinclude.m4 ++++ b/acinclude.m4 +@@ -3135,6 +3135,106 @@ __sparc_get_pc_thunk.l7: + GMP_DEFINE_RAW(["define(,<$gmp_cv_asm_sparc_shared_thunks>)"]) + ]) + ++dnl GMP_ASM_X86_CET_MACROS(ABI) ++dnl ------------ ++dnl Define ++dnl 1. X86_ENDBR for endbr32/endbr64. ++dnl 2. X86_NOTRACK for notrack prefix. ++dnl 3. X86_GNU_PROPERTY to add a .note.gnu.property section to mark ++dnl Intel CET support if needed. 
++dnl .section ".note.gnu.property", "a" ++dnl .p2align POINTER-ALIGN ++dnl .long 1f - 0f ++dnl .long 4f - 1f ++dnl .long 5 ++dnl 0: ++dnl .asciz "GNU" ++dnl 1: ++dnl .p2align POINTER-ALIGN ++dnl .long 0xc0000002 ++dnl .long 3f - 2f ++dnl 2: ++dnl .long 3 ++dnl 3: ++dnl .p2align POINTER-ALIGN ++dnl 4: ++AC_DEFUN([GMP_ASM_X86_CET_MACROS],[ ++dnl AC_REQUIRE([AC_PROG_CC]) GMP uses something else ++AC_CACHE_CHECK([if Intel CET is enabled], ++ gmp_cv_asm_x86_intel_cet, [dnl ++ cat > conftest.c </dev/null]) ++ then ++ gmp_cv_asm_x86_intel_cet=yes ++ else ++ gmp_cv_asm_x86_intel_cet=no ++ fi ++ rm -f conftest*]) ++ if test "$gmp_cv_asm_x86_intel_cet" = yes; then ++ case $1 in ++ 32) ++ endbr=endbr32 ++ p2align=2 ++ ;; ++ 64) ++ endbr=endbr64 ++ p2align=3 ++ ;; ++ x32) ++ endbr=endbr64 ++ p2align=2 ++ ;; ++ esac ++ AC_CACHE_CHECK([if .note.gnu.property section is needed], ++ gmp_cv_asm_x86_gnu_property, [dnl ++ cat > conftest.c </dev/null]) ++ then ++ gmp_cv_asm_x86_gnu_property=yes ++ else ++ gmp_cv_asm_x86_gnu_property=no ++ fi ++ rm -f conftest*]) ++ echo ["define(,<$endbr>)"] >> $gmp_tmpconfigm4 ++ echo ["define(,)"] >> $gmp_tmpconfigm4 ++ else ++ gmp_cv_asm_x86_gnu_property=no ++ echo ["define(,<>)"] >> $gmp_tmpconfigm4 ++ echo ["define(,<>)"] >> $gmp_tmpconfigm4 ++ fi ++ if test "$gmp_cv_asm_x86_gnu_property" = yes; then ++ echo ["define(, < ++ .section \".note.gnu.property\", \"a\" ++ .p2align $p2align ++ .long 1f - 0f ++ .long 4f - 1f ++ .long 5 ++0: ++ .asciz \"GNU\" ++1: ++ .p2align $p2align ++ .long 0xc0000002 ++ .long 3f - 2f ++2: ++ .long 3 ++3: ++ .p2align $p2align ++4:>)"] >> $gmp_tmpconfigm4 ++ else ++ echo ["define(,<>)"] >> $gmp_tmpconfigm4 ++ fi ++]) ++ + + dnl GMP_C_ATTRIBUTE_CONST + dnl --------------------- +diff --git a/configure.ac b/configure.ac +index 024cacb..be314a6 100644 +--- a/configure.ac ++++ b/configure.ac +@@ -3804,6 +3804,7 @@ yes + esac + ;; + esac ++ GMP_ASM_X86_CET_MACROS($ABI) + ;; + esac + fi +diff --git a/mpn/x86/aors_n.asm b/mpn/x86/aors_n.asm +index 5d359f5..7ea7814 100644 +--- a/mpn/x86/aors_n.asm ++++ b/mpn/x86/aors_n.asm +@@ -112,7 +112,7 @@ L(0a): leal (%eax,%eax,8),%eax + shrl %ebp C shift bit 0 into carry + popl %ebp FRAME_popl() + +- jmp *%eax C jump into loop ++ X86_NOTRACK jmp *%eax C jump into loop + + EPILOGUE() + +@@ -153,7 +153,7 @@ L(0b): leal (%eax,%eax,8),%eax + C Calculate start address in loop for non-PIC. 
+ leal L(oop)-3(%eax,%eax,8),%eax + ') +- jmp *%eax C jump into loop ++ X86_NOTRACK jmp *%eax C jump into loop + + L(oopgo): + pushl %ebp FRAME_pushl() +@@ -200,3 +200,4 @@ L(oop): movl (%esi),%eax + ret + + EPILOGUE() ++ASM_END() +diff --git a/mpn/x86/aorsmul_1.asm b/mpn/x86/aorsmul_1.asm +index 54a8905..0ab1e01 100644 +--- a/mpn/x86/aorsmul_1.asm ++++ b/mpn/x86/aorsmul_1.asm +@@ -154,3 +154,4 @@ L(end): movl %ebx,%eax + ret + + EPILOGUE() ++ASM_END() +diff --git a/mpn/x86/atom/sse2/aorsmul_1.asm b/mpn/x86/atom/sse2/aorsmul_1.asm +index 969a14a..20658e1 100644 +--- a/mpn/x86/atom/sse2/aorsmul_1.asm ++++ b/mpn/x86/atom/sse2/aorsmul_1.asm +@@ -172,3 +172,4 @@ PROLOGUE(func_1c) + mov 20(%esp), %edx C carry + jmp L(ent) + EPILOGUE() ++ASM_END() +diff --git a/mpn/x86/atom/sse2/mul_basecase.asm b/mpn/x86/atom/sse2/mul_basecase.asm +index 97d3aeb..74171aa 100644 +--- a/mpn/x86/atom/sse2/mul_basecase.asm ++++ b/mpn/x86/atom/sse2/mul_basecase.asm +@@ -499,3 +499,4 @@ L(done): + pop %edi + ret + EPILOGUE() ++ASM_END() +diff --git a/mpn/x86/atom/sse2/sqr_basecase.asm b/mpn/x86/atom/sse2/sqr_basecase.asm +index af19ed8..0031812 100644 +--- a/mpn/x86/atom/sse2/sqr_basecase.asm ++++ b/mpn/x86/atom/sse2/sqr_basecase.asm +@@ -632,3 +632,4 @@ L(one): pmuludq %mm7, %mm7 + pop %edi + ret + EPILOGUE() ++ASM_END() +diff --git a/mpn/x86/bdiv_dbm1c.asm b/mpn/x86/bdiv_dbm1c.asm +index 0288c47..7a3b1a6 100644 +--- a/mpn/x86/bdiv_dbm1c.asm ++++ b/mpn/x86/bdiv_dbm1c.asm +@@ -127,3 +127,4 @@ L(b1): add $-4, %ebp + pop %esi + ret + EPILOGUE() ++ASM_END() +diff --git a/mpn/x86/copyd.asm b/mpn/x86/copyd.asm +index 51fa195..0e588d9 100644 +--- a/mpn/x86/copyd.asm ++++ b/mpn/x86/copyd.asm +@@ -89,3 +89,4 @@ PROLOGUE(mpn_copyd) + ret + + EPILOGUE() ++ASM_END() +diff --git a/mpn/x86/copyi.asm b/mpn/x86/copyi.asm +index f6b0354..6efbb90 100644 +--- a/mpn/x86/copyi.asm ++++ b/mpn/x86/copyi.asm +@@ -97,3 +97,4 @@ PROLOGUE(mpn_copyi) + ret + + EPILOGUE() ++ASM_END() +diff --git a/mpn/x86/divrem_1.asm b/mpn/x86/divrem_1.asm +index 255d493..b1af920 100644 +--- a/mpn/x86/divrem_1.asm ++++ b/mpn/x86/divrem_1.asm +@@ -231,3 +231,4 @@ deflit(`FRAME',8) + popl %edi + ret + EPILOGUE() ++ASM_END() +diff --git a/mpn/x86/divrem_2.asm b/mpn/x86/divrem_2.asm +index 4c38ad0..c2920c2 100644 +--- a/mpn/x86/divrem_2.asm ++++ b/mpn/x86/divrem_2.asm +@@ -197,3 +197,4 @@ L(35): sub 20(%esp), %ebp + movl $1, 32(%esp) + jmp L(8) + EPILOGUE() ++ASM_END() +diff --git a/mpn/x86/k6/aors_n.asm b/mpn/x86/k6/aors_n.asm +index 168f9b4..257ba59 100644 +--- a/mpn/x86/k6/aors_n.asm ++++ b/mpn/x86/k6/aors_n.asm +@@ -335,3 +335,4 @@ L(inplace_done): + ret + + EPILOGUE() ++ASM_END() +diff --git a/mpn/x86/k6/aorsmul_1.asm b/mpn/x86/k6/aorsmul_1.asm +index eaa92eb..78be9d2 100644 +--- a/mpn/x86/k6/aorsmul_1.asm ++++ b/mpn/x86/k6/aorsmul_1.asm +@@ -389,3 +389,4 @@ Zdisp( M4_inst,%ecx, disp0,(%edi)) + ret + + EPILOGUE() ++ASM_END() +diff --git a/mpn/x86/k6/divrem_1.asm b/mpn/x86/k6/divrem_1.asm +index b4cea4f..ca41a3f 100644 +--- a/mpn/x86/k6/divrem_1.asm ++++ b/mpn/x86/k6/divrem_1.asm +@@ -201,3 +201,4 @@ deflit(`FRAME',8) + popl %edi + ret + EPILOGUE() ++ASM_END() +diff --git a/mpn/x86/k6/k62mmx/copyd.asm b/mpn/x86/k6/k62mmx/copyd.asm +index f80a5a1..fc329f5 100644 +--- a/mpn/x86/k6/k62mmx/copyd.asm ++++ b/mpn/x86/k6/k62mmx/copyd.asm +@@ -116,3 +116,4 @@ L(zero): + ret + + EPILOGUE() ++ASM_END() +diff --git a/mpn/x86/k6/k62mmx/lshift.asm b/mpn/x86/k6/k62mmx/lshift.asm +index c86575f..728fb5b 100644 +--- a/mpn/x86/k6/k62mmx/lshift.asm ++++ 
b/mpn/x86/k6/k62mmx/lshift.asm +@@ -292,3 +292,4 @@ deflit(`FRAME',4) + ret + + EPILOGUE() ++ASM_END() +diff --git a/mpn/x86/k6/k62mmx/rshift.asm b/mpn/x86/k6/k62mmx/rshift.asm +index f604a7b..bd673f3 100644 +--- a/mpn/x86/k6/k62mmx/rshift.asm ++++ b/mpn/x86/k6/k62mmx/rshift.asm +@@ -291,3 +291,4 @@ L(finish_even): + ret + + EPILOGUE() ++ASM_END() +diff --git a/mpn/x86/k6/mmx/com.asm b/mpn/x86/k6/mmx/com.asm +index b747454..646d16b 100644 +--- a/mpn/x86/k6/mmx/com.asm ++++ b/mpn/x86/k6/mmx/com.asm +@@ -101,3 +101,4 @@ L(no_extra): + ret + + EPILOGUE() ++ASM_END() +diff --git a/mpn/x86/k6/mmx/logops_n.asm b/mpn/x86/k6/mmx/logops_n.asm +index e17930b..acfd7df 100644 +--- a/mpn/x86/k6/mmx/logops_n.asm ++++ b/mpn/x86/k6/mmx/logops_n.asm +@@ -224,3 +224,4 @@ L(no_extra): + ret + + EPILOGUE() ++ASM_END() +diff --git a/mpn/x86/k6/mmx/lshift.asm b/mpn/x86/k6/mmx/lshift.asm +index 45be582..eee1eb8 100644 +--- a/mpn/x86/k6/mmx/lshift.asm ++++ b/mpn/x86/k6/mmx/lshift.asm +@@ -128,3 +128,4 @@ L(top): + ret + + EPILOGUE() ++ASM_END() +diff --git a/mpn/x86/k6/mmx/popham.asm b/mpn/x86/k6/mmx/popham.asm +index 2b19d0b..efeb1b4 100644 +--- a/mpn/x86/k6/mmx/popham.asm ++++ b/mpn/x86/k6/mmx/popham.asm +@@ -234,3 +234,4 @@ HAM(` nop C code alignment') + ret + + EPILOGUE() ++ASM_END() +diff --git a/mpn/x86/k6/mmx/rshift.asm b/mpn/x86/k6/mmx/rshift.asm +index cd0382f..ae53711 100644 +--- a/mpn/x86/k6/mmx/rshift.asm ++++ b/mpn/x86/k6/mmx/rshift.asm +@@ -128,3 +128,4 @@ Zdisp( movd, %mm0, 0,(%ecx,%eax,4)) + ret + + EPILOGUE() ++ASM_END() +diff --git a/mpn/x86/k6/mod_34lsub1.asm b/mpn/x86/k6/mod_34lsub1.asm +index 7e30503..05f8979 100644 +--- a/mpn/x86/k6/mod_34lsub1.asm ++++ b/mpn/x86/k6/mod_34lsub1.asm +@@ -188,3 +188,4 @@ L(combine): + ret + + EPILOGUE() ++ASM_END() +diff --git a/mpn/x86/k6/mul_1.asm b/mpn/x86/k6/mul_1.asm +index 3ef7ec2..2139f36 100644 +--- a/mpn/x86/k6/mul_1.asm ++++ b/mpn/x86/k6/mul_1.asm +@@ -290,3 +290,4 @@ L(finish_not_one): + ret + + EPILOGUE() ++ASM_END() +diff --git a/mpn/x86/k6/mul_basecase.asm b/mpn/x86/k6/mul_basecase.asm +index 7030001..ab202a2 100644 +--- a/mpn/x86/k6/mul_basecase.asm ++++ b/mpn/x86/k6/mul_basecase.asm +@@ -610,3 +610,4 @@ Zdisp( addl, %ecx, disp0,(%edi)) + ret + + EPILOGUE() ++ASM_END() +diff --git a/mpn/x86/k6/pre_mod_1.asm b/mpn/x86/k6/pre_mod_1.asm +index 34db20d..1e4cb17 100644 +--- a/mpn/x86/k6/pre_mod_1.asm ++++ b/mpn/x86/k6/pre_mod_1.asm +@@ -144,3 +144,4 @@ L(q1_ff): + + + EPILOGUE() ++ASM_END() +diff --git a/mpn/x86/k6/sqr_basecase.asm b/mpn/x86/k6/sqr_basecase.asm +index b7ecb5c..f3a101a 100644 +--- a/mpn/x86/k6/sqr_basecase.asm ++++ b/mpn/x86/k6/sqr_basecase.asm +@@ -678,3 +678,4 @@ L(pic_calc): + + + EPILOGUE() ++ASM_END() +diff --git a/mpn/x86/k7/aors_n.asm b/mpn/x86/k7/aors_n.asm +index 1a08072..bfdf3d4 100644 +--- a/mpn/x86/k7/aors_n.asm ++++ b/mpn/x86/k7/aors_n.asm +@@ -256,3 +256,4 @@ L(even): + ret + + EPILOGUE() ++ASM_END() +diff --git a/mpn/x86/k7/mmx/com.asm b/mpn/x86/k7/mmx/com.asm +index a258c22..cf48fac 100644 +--- a/mpn/x86/k7/mmx/com.asm ++++ b/mpn/x86/k7/mmx/com.asm +@@ -123,3 +123,4 @@ L(done): + ret + + EPILOGUE() ++ASM_END() +diff --git a/mpn/x86/k7/mmx/copyd.asm b/mpn/x86/k7/mmx/copyd.asm +index 59ece40..3bc9ff8 100644 +--- a/mpn/x86/k7/mmx/copyd.asm ++++ b/mpn/x86/k7/mmx/copyd.asm +@@ -142,3 +142,4 @@ L(done): + + + EPILOGUE() ++ASM_END() +diff --git a/mpn/x86/k7/mmx/copyi.asm b/mpn/x86/k7/mmx/copyi.asm +index 9a28f92..f0648fa 100644 +--- a/mpn/x86/k7/mmx/copyi.asm ++++ b/mpn/x86/k7/mmx/copyi.asm +@@ -155,3 +155,4 @@ L(done): + ret 
+ + EPILOGUE() ++ASM_END() +diff --git a/mpn/x86/k7/mmx/divrem_1.asm b/mpn/x86/k7/mmx/divrem_1.asm +index cf34328..370bfbb 100644 +--- a/mpn/x86/k7/mmx/divrem_1.asm ++++ b/mpn/x86/k7/mmx/divrem_1.asm +@@ -830,3 +830,4 @@ L(fraction_entry): + jmp L(fraction_done) + + EPILOGUE() ++ASM_END() +diff --git a/mpn/x86/k7/mmx/lshift.asm b/mpn/x86/k7/mmx/lshift.asm +index b3383cf..4140e82 100644 +--- a/mpn/x86/k7/mmx/lshift.asm ++++ b/mpn/x86/k7/mmx/lshift.asm +@@ -479,3 +479,4 @@ L(end_even_unaligned): + ret + + EPILOGUE() ++ASM_END() +diff --git a/mpn/x86/k7/mmx/popham.asm b/mpn/x86/k7/mmx/popham.asm +index 95965b7..f29540a 100644 +--- a/mpn/x86/k7/mmx/popham.asm ++++ b/mpn/x86/k7/mmx/popham.asm +@@ -211,3 +211,4 @@ L(loaded): + ret + + EPILOGUE() ++ASM_END() +diff --git a/mpn/x86/k7/mmx/rshift.asm b/mpn/x86/k7/mmx/rshift.asm +index 345d23a..0da1f93 100644 +--- a/mpn/x86/k7/mmx/rshift.asm ++++ b/mpn/x86/k7/mmx/rshift.asm +@@ -478,3 +478,4 @@ L(end_even_unaligned): + ret + + EPILOGUE() ++ASM_END() +diff --git a/mpn/x86/k7/mod_1_1.asm b/mpn/x86/k7/mod_1_1.asm +index 1bbe6f9..8da9519 100644 +--- a/mpn/x86/k7/mod_1_1.asm ++++ b/mpn/x86/k7/mod_1_1.asm +@@ -219,3 +219,4 @@ PROLOGUE(mpn_mod_1_1p_cps) + pop %ebp + ret + EPILOGUE() ++ASM_END() +diff --git a/mpn/x86/k7/mod_1_4.asm b/mpn/x86/k7/mod_1_4.asm +index bb7597e..fe1da5b 100644 +--- a/mpn/x86/k7/mod_1_4.asm ++++ b/mpn/x86/k7/mod_1_4.asm +@@ -258,3 +258,4 @@ C CAUTION: This is the same code as in pentium4/sse2/mod_1_4.asm + pop %ebp + ret + EPILOGUE() ++ASM_END() +diff --git a/mpn/x86/k7/mod_34lsub1.asm b/mpn/x86/k7/mod_34lsub1.asm +index ee3ad04..0c1b8c8 100644 +--- a/mpn/x86/k7/mod_34lsub1.asm ++++ b/mpn/x86/k7/mod_34lsub1.asm +@@ -186,3 +186,4 @@ L(combine): + ret + + EPILOGUE() ++ASM_END() +diff --git a/mpn/x86/k7/mul_basecase.asm b/mpn/x86/k7/mul_basecase.asm +index 4dfb500..b96fda7 100644 +--- a/mpn/x86/k7/mul_basecase.asm ++++ b/mpn/x86/k7/mul_basecase.asm +@@ -600,3 +600,4 @@ deflit(`disp1', eval(disp0-0 + 4)) + ret + + EPILOGUE() ++ASM_END() +diff --git a/mpn/x86/k7/sqr_basecase.asm b/mpn/x86/k7/sqr_basecase.asm +index 7b6a97e..df47ee4 100644 +--- a/mpn/x86/k7/sqr_basecase.asm ++++ b/mpn/x86/k7/sqr_basecase.asm +@@ -633,3 +633,4 @@ L(diag): + ret + + EPILOGUE() ++ASM_END() +diff --git a/mpn/x86/lshift.asm b/mpn/x86/lshift.asm +index 6ee6153..95f5321 100644 +--- a/mpn/x86/lshift.asm ++++ b/mpn/x86/lshift.asm +@@ -104,3 +104,4 @@ L(end): shll %cl,%ebx C compute least significant limb + ret + + EPILOGUE() ++ASM_END() +diff --git a/mpn/x86/mmx/sec_tabselect.asm b/mpn/x86/mmx/sec_tabselect.asm +index aae158a..543dec1 100644 +--- a/mpn/x86/mmx/sec_tabselect.asm ++++ b/mpn/x86/mmx/sec_tabselect.asm +@@ -161,3 +161,4 @@ L(b00): pop %ebp + emms + ret + EPILOGUE() ++ASM_END() +diff --git a/mpn/x86/mod_34lsub1.asm b/mpn/x86/mod_34lsub1.asm +index e09e702..df52d37 100644 +--- a/mpn/x86/mod_34lsub1.asm ++++ b/mpn/x86/mod_34lsub1.asm +@@ -181,3 +181,4 @@ L(combine): + ret + + EPILOGUE() ++ASM_END() +diff --git a/mpn/x86/mul_1.asm b/mpn/x86/mul_1.asm +index 421de62..dbbc0e3 100644 +--- a/mpn/x86/mul_1.asm ++++ b/mpn/x86/mul_1.asm +@@ -138,3 +138,4 @@ L(end): movl %ebx,%eax + ret + + EPILOGUE() ++ASM_END() +diff --git a/mpn/x86/mul_basecase.asm b/mpn/x86/mul_basecase.asm +index 8339732..c32fd7e 100644 +--- a/mpn/x86/mul_basecase.asm ++++ b/mpn/x86/mul_basecase.asm +@@ -221,3 +221,4 @@ L(done): + ret + + EPILOGUE() ++ASM_END() +diff --git a/mpn/x86/p6/aors_n.asm b/mpn/x86/p6/aors_n.asm +index df51c2e..ab172df 100644 +--- a/mpn/x86/p6/aors_n.asm ++++ 
b/mpn/x86/p6/aors_n.asm +@@ -90,7 +90,7 @@ L(here): + ') + + shr %edx C set cy flag +- jmp *%eax ++ X86_NOTRACK jmp *%eax + + ifdef(`PIC',` + L(pic_calc): +@@ -154,3 +154,4 @@ PROLOGUE(func_nc) + movl 20(%esp), %edx + jmp L(start) + EPILOGUE() ++ASM_END() +diff --git a/mpn/x86/p6/aorsmul_1.asm b/mpn/x86/p6/aorsmul_1.asm +index bc8c49c..2a3b122 100644 +--- a/mpn/x86/p6/aorsmul_1.asm ++++ b/mpn/x86/p6/aorsmul_1.asm +@@ -240,7 +240,7 @@ L(here): + cmovnz( %ebx, %ecx) C high,low carry other way around + cmovnz( %eax, %ebx) + +- jmp *%edx ++ X86_NOTRACK jmp *%edx + + + ifdef(`PIC',` +@@ -318,3 +318,4 @@ deflit(`disp0', eval(UNROLL_BYTES ifelse(UNROLL_BYTES,256,-128))) + ret + + EPILOGUE() ++ASM_END() +diff --git a/mpn/x86/p6/copyd.asm b/mpn/x86/p6/copyd.asm +index 1be7636..bd42da1 100644 +--- a/mpn/x86/p6/copyd.asm ++++ b/mpn/x86/p6/copyd.asm +@@ -176,3 +176,4 @@ L(zero): + ret + + EPILOGUE() ++ASM_END() +diff --git a/mpn/x86/p6/gcd_11.asm b/mpn/x86/p6/gcd_11.asm +index 80e055e..a7fc6a8 100644 +--- a/mpn/x86/p6/gcd_11.asm ++++ b/mpn/x86/p6/gcd_11.asm +@@ -81,3 +81,4 @@ L(end): mov %edx, %eax + pop %edi + ret + EPILOGUE() ++ASM_END() +diff --git a/mpn/x86/p6/lshsub_n.asm b/mpn/x86/p6/lshsub_n.asm +index 7ada213..17db5d5 100644 +--- a/mpn/x86/p6/lshsub_n.asm ++++ b/mpn/x86/p6/lshsub_n.asm +@@ -82,7 +82,7 @@ L(here): + pxor %mm1, %mm1 + pxor %mm0, %mm0 + +- jmp *%eax ++ X86_NOTRACK jmp *%eax + + ifdef(`PIC',` + L(pic_calc): +@@ -167,3 +167,4 @@ L(ent): mov 0(up,n,4), %eax + jmp L(top) + + EPILOGUE() ++ASM_END() +diff --git a/mpn/x86/p6/mmx/divrem_1.asm b/mpn/x86/p6/mmx/divrem_1.asm +index 5300616..b6057dd 100644 +--- a/mpn/x86/p6/mmx/divrem_1.asm ++++ b/mpn/x86/p6/mmx/divrem_1.asm +@@ -765,3 +765,4 @@ L(fraction_top): + jmp L(fraction_done) + + EPILOGUE() ++ASM_END() +diff --git a/mpn/x86/p6/mod_34lsub1.asm b/mpn/x86/p6/mod_34lsub1.asm +index b88ab5d..46b3806 100644 +--- a/mpn/x86/p6/mod_34lsub1.asm ++++ b/mpn/x86/p6/mod_34lsub1.asm +@@ -188,3 +188,4 @@ L(done_0): + ret + + EPILOGUE() ++ASM_END() +diff --git a/mpn/x86/p6/mul_basecase.asm b/mpn/x86/p6/mul_basecase.asm +index d87bc12..521b31e 100644 +--- a/mpn/x86/p6/mul_basecase.asm ++++ b/mpn/x86/p6/mul_basecase.asm +@@ -524,7 +524,7 @@ L(unroll_outer_entry): + xorl %eax, %ebx C carries other way for odd index + xorl %eax, %ecx + +- jmp *%edx ++ X86_NOTRACK jmp *%edx + + + C ----------------------------------------------------------------------------- +@@ -605,3 +605,4 @@ deflit(`disp1', eval(disp0 + 4)) + ret + + EPILOGUE() ++ASM_END() +diff --git a/mpn/x86/p6/sqr_basecase.asm b/mpn/x86/p6/sqr_basecase.asm +index 8fc7fdf..f71304f 100644 +--- a/mpn/x86/p6/sqr_basecase.asm ++++ b/mpn/x86/p6/sqr_basecase.asm +@@ -447,7 +447,7 @@ define(cmovX,`ifelse(eval(UNROLL_COUNT%2),1,`cmovz($@)',`cmovnz($@)')') + cmovX( %ebx, %ecx) C high carry reverse + cmovX( %eax, %ebx) C low carry reverse + movl %edx, VAR_JMP +- jmp *%edx ++ X86_NOTRACK jmp *%edx + + + C Must be on an even address here so the low bit of the jump address +@@ -647,3 +647,4 @@ L(pic_calc): + + + EPILOGUE() ++ASM_END() +diff --git a/mpn/x86/pentium/aors_n.asm b/mpn/x86/pentium/aors_n.asm +index 01ebfb9..ca124a5 100644 +--- a/mpn/x86/pentium/aors_n.asm ++++ b/mpn/x86/pentium/aors_n.asm +@@ -201,3 +201,4 @@ L(end2): + ret + + EPILOGUE() ++ASM_END() +diff --git a/mpn/x86/pentium/aorsmul_1.asm b/mpn/x86/pentium/aorsmul_1.asm +index d83cc45..5cec8b3 100644 +--- a/mpn/x86/pentium/aorsmul_1.asm ++++ b/mpn/x86/pentium/aorsmul_1.asm +@@ -142,3 +142,4 @@ L(top): + ret + + EPILOGUE() ++ASM_END() +diff 
--git a/mpn/x86/pentium/com.asm b/mpn/x86/pentium/com.asm +index b080545..00064ff 100644 +--- a/mpn/x86/pentium/com.asm ++++ b/mpn/x86/pentium/com.asm +@@ -179,3 +179,4 @@ L(done): + ret + + EPILOGUE() ++ASM_END() +diff --git a/mpn/x86/pentium/copyd.asm b/mpn/x86/pentium/copyd.asm +index 72a543b..c7f74b5 100644 +--- a/mpn/x86/pentium/copyd.asm ++++ b/mpn/x86/pentium/copyd.asm +@@ -144,3 +144,4 @@ L(done): + ret + + EPILOGUE() ++ASM_END() +diff --git a/mpn/x86/pentium/copyi.asm b/mpn/x86/pentium/copyi.asm +index d983d6b..bc7744e 100644 +--- a/mpn/x86/pentium/copyi.asm ++++ b/mpn/x86/pentium/copyi.asm +@@ -162,3 +162,4 @@ L(done): + ret + + EPILOGUE() ++ASM_END() +diff --git a/mpn/x86/pentium/logops_n.asm b/mpn/x86/pentium/logops_n.asm +index 1877317..41a9477 100644 +--- a/mpn/x86/pentium/logops_n.asm ++++ b/mpn/x86/pentium/logops_n.asm +@@ -174,3 +174,4 @@ L(done): + ret + + EPILOGUE() ++ASM_END() +diff --git a/mpn/x86/pentium/lshift.asm b/mpn/x86/pentium/lshift.asm +index 2a31f36..68cba52 100644 +--- a/mpn/x86/pentium/lshift.asm ++++ b/mpn/x86/pentium/lshift.asm +@@ -241,3 +241,4 @@ L(L1): movl %edx,(%edi) C store last limb + ret + + EPILOGUE() ++ASM_END() +diff --git a/mpn/x86/pentium/mmx/lshift.asm b/mpn/x86/pentium/mmx/lshift.asm +index 04b0ddc..9e18c86 100644 +--- a/mpn/x86/pentium/mmx/lshift.asm ++++ b/mpn/x86/pentium/mmx/lshift.asm +@@ -461,3 +461,4 @@ L(finish_zero_unaligned): + ret + + EPILOGUE() ++ASM_END() +diff --git a/mpn/x86/pentium/mmx/mul_1.asm b/mpn/x86/pentium/mmx/mul_1.asm +index 4ced577..b04a718 100644 +--- a/mpn/x86/pentium/mmx/mul_1.asm ++++ b/mpn/x86/pentium/mmx/mul_1.asm +@@ -369,3 +369,4 @@ L(small_done): + ret + + EPILOGUE() ++ASM_END() +diff --git a/mpn/x86/pentium/mmx/rshift.asm b/mpn/x86/pentium/mmx/rshift.asm +index e3b274b..5493d20 100644 +--- a/mpn/x86/pentium/mmx/rshift.asm ++++ b/mpn/x86/pentium/mmx/rshift.asm +@@ -466,3 +466,4 @@ L(finish_zero_unaligned): + ret + + EPILOGUE() ++ASM_END() +diff --git a/mpn/x86/pentium/mod_34lsub1.asm b/mpn/x86/pentium/mod_34lsub1.asm +index 2d88223..0945de8 100644 +--- a/mpn/x86/pentium/mod_34lsub1.asm ++++ b/mpn/x86/pentium/mod_34lsub1.asm +@@ -190,3 +190,4 @@ L(combine): + ret + + EPILOGUE() ++ASM_END() +diff --git a/mpn/x86/pentium/mul_1.asm b/mpn/x86/pentium/mul_1.asm +index a0858af..2c49130 100644 +--- a/mpn/x86/pentium/mul_1.asm ++++ b/mpn/x86/pentium/mul_1.asm +@@ -175,3 +175,4 @@ L(top): + ret + + EPILOGUE() ++ASM_END() +diff --git a/mpn/x86/pentium/mul_2.asm b/mpn/x86/pentium/mul_2.asm +index 4c7beb5..e94e071 100644 +--- a/mpn/x86/pentium/mul_2.asm ++++ b/mpn/x86/pentium/mul_2.asm +@@ -148,3 +148,4 @@ L(done): + ret + + EPILOGUE() ++ASM_END() +diff --git a/mpn/x86/pentium/mul_basecase.asm b/mpn/x86/pentium/mul_basecase.asm +index e1d0f05..ff269bb 100644 +--- a/mpn/x86/pentium/mul_basecase.asm ++++ b/mpn/x86/pentium/mul_basecase.asm +@@ -140,3 +140,4 @@ L(done): + ret + + EPILOGUE() ++ASM_END() +diff --git a/mpn/x86/pentium/rshift.asm b/mpn/x86/pentium/rshift.asm +index 2105c4c..d98080d 100644 +--- a/mpn/x86/pentium/rshift.asm ++++ b/mpn/x86/pentium/rshift.asm +@@ -241,3 +241,4 @@ L(L1): movl %edx,(%edi) C store last limb + ret + + EPILOGUE() ++ASM_END() +diff --git a/mpn/x86/pentium/sqr_basecase.asm b/mpn/x86/pentium/sqr_basecase.asm +index b11d767..ee64eb3 100644 +--- a/mpn/x86/pentium/sqr_basecase.asm ++++ b/mpn/x86/pentium/sqr_basecase.asm +@@ -526,3 +526,4 @@ L(diag): + ret + + EPILOGUE() ++ASM_END() +diff --git a/mpn/x86/pentium4/copyd.asm b/mpn/x86/pentium4/copyd.asm +index 82af81c..bf06a05 100644 +--- 
a/mpn/x86/pentium4/copyd.asm ++++ b/mpn/x86/pentium4/copyd.asm +@@ -69,3 +69,4 @@ L(end): + ret + + EPILOGUE() ++ASM_END() +diff --git a/mpn/x86/pentium4/copyi.asm b/mpn/x86/pentium4/copyi.asm +index b614887..acbb3f4 100644 +--- a/mpn/x86/pentium4/copyi.asm ++++ b/mpn/x86/pentium4/copyi.asm +@@ -91,3 +91,4 @@ L(replmovs): + ret + + EPILOGUE() ++ASM_END() +diff --git a/mpn/x86/pentium4/mmx/popham.asm b/mpn/x86/pentium4/mmx/popham.asm +index 9563cb5..f7a6124 100644 +--- a/mpn/x86/pentium4/mmx/popham.asm ++++ b/mpn/x86/pentium4/mmx/popham.asm +@@ -201,3 +201,4 @@ L(loaded): + ret + + EPILOGUE() ++ASM_END() +diff --git a/mpn/x86/pentium4/sse2/add_n.asm b/mpn/x86/pentium4/sse2/add_n.asm +index 8e2380e..e329635 100644 +--- a/mpn/x86/pentium4/sse2/add_n.asm ++++ b/mpn/x86/pentium4/sse2/add_n.asm +@@ -99,3 +99,4 @@ L(top): + ret + + EPILOGUE() ++ASM_END() +diff --git a/mpn/x86/pentium4/sse2/addlsh1_n.asm b/mpn/x86/pentium4/sse2/addlsh1_n.asm +index 93b63b2..e801f7b 100644 +--- a/mpn/x86/pentium4/sse2/addlsh1_n.asm ++++ b/mpn/x86/pentium4/sse2/addlsh1_n.asm +@@ -106,3 +106,4 @@ L(top): + ret + + EPILOGUE() ++ASM_END() +diff --git a/mpn/x86/pentium4/sse2/addmul_1.asm b/mpn/x86/pentium4/sse2/addmul_1.asm +index 7810207..62a7675 100644 +--- a/mpn/x86/pentium4/sse2/addmul_1.asm ++++ b/mpn/x86/pentium4/sse2/addmul_1.asm +@@ -187,3 +187,4 @@ PROLOGUE(mpn_addmul_1c) + movd 20(%esp), %mm6 + jmp L(ent) + EPILOGUE() ++ASM_END() +diff --git a/mpn/x86/pentium4/sse2/cnd_add_n.asm b/mpn/x86/pentium4/sse2/cnd_add_n.asm +index b3f3474..7183b94 100644 +--- a/mpn/x86/pentium4/sse2/cnd_add_n.asm ++++ b/mpn/x86/pentium4/sse2/cnd_add_n.asm +@@ -93,3 +93,4 @@ L(top): movd (%ebx,%ecx,4), %mm2 + ret + + EPILOGUE() ++ASM_END() +diff --git a/mpn/x86/pentium4/sse2/cnd_sub_n.asm b/mpn/x86/pentium4/sse2/cnd_sub_n.asm +index 339a23e..ba0fc47 100644 +--- a/mpn/x86/pentium4/sse2/cnd_sub_n.asm ++++ b/mpn/x86/pentium4/sse2/cnd_sub_n.asm +@@ -112,3 +112,4 @@ L(done_mm1): + ret + + EPILOGUE() ++ASM_END() +diff --git a/mpn/x86/pentium4/sse2/divrem_1.asm b/mpn/x86/pentium4/sse2/divrem_1.asm +index 0146fab..d8619e0 100644 +--- a/mpn/x86/pentium4/sse2/divrem_1.asm ++++ b/mpn/x86/pentium4/sse2/divrem_1.asm +@@ -643,3 +643,4 @@ L(fraction_top): + jmp L(fraction_done) + + EPILOGUE() ++ASM_END() +diff --git a/mpn/x86/pentium4/sse2/mod_1_1.asm b/mpn/x86/pentium4/sse2/mod_1_1.asm +index ee88bab..2e5a514 100644 +--- a/mpn/x86/pentium4/sse2/mod_1_1.asm ++++ b/mpn/x86/pentium4/sse2/mod_1_1.asm +@@ -164,3 +164,4 @@ C CAUTION: This is the same code as in k7/mod_1_1.asm + pop %ebp + ret + EPILOGUE() ++ASM_END() +diff --git a/mpn/x86/pentium4/sse2/mod_1_4.asm b/mpn/x86/pentium4/sse2/mod_1_4.asm +index eb2edb6..5ef3c4a 100644 +--- a/mpn/x86/pentium4/sse2/mod_1_4.asm ++++ b/mpn/x86/pentium4/sse2/mod_1_4.asm +@@ -267,3 +267,4 @@ C CAUTION: This is the same code as in k7/mod_1_4.asm + pop %ebp + ret + EPILOGUE() ++ASM_END() +diff --git a/mpn/x86/pentium4/sse2/mod_34lsub1.asm b/mpn/x86/pentium4/sse2/mod_34lsub1.asm +index 31e25b7..5b6b9a7 100644 +--- a/mpn/x86/pentium4/sse2/mod_34lsub1.asm ++++ b/mpn/x86/pentium4/sse2/mod_34lsub1.asm +@@ -173,3 +173,4 @@ L(combine): + ret + + EPILOGUE() ++ASM_END() +diff --git a/mpn/x86/pentium4/sse2/mul_1.asm b/mpn/x86/pentium4/sse2/mul_1.asm +index 6347b8b..9e4f3fc 100644 +--- a/mpn/x86/pentium4/sse2/mul_1.asm ++++ b/mpn/x86/pentium4/sse2/mul_1.asm +@@ -162,3 +162,4 @@ PROLOGUE(mpn_mul_1c) + movd 20(%esp), %mm6 + jmp L(ent) + EPILOGUE() ++ASM_END() +diff --git a/mpn/x86/pentium4/sse2/mul_basecase.asm 
b/mpn/x86/pentium4/sse2/mul_basecase.asm +index 6e3775a..0bad756 100644 +--- a/mpn/x86/pentium4/sse2/mul_basecase.asm ++++ b/mpn/x86/pentium4/sse2/mul_basecase.asm +@@ -660,3 +660,4 @@ L(oel3): + pop %esi C 3 + ret C 3 + EPILOGUE() ++ASM_END() +diff --git a/mpn/x86/pentium4/sse2/rsh1add_n.asm b/mpn/x86/pentium4/sse2/rsh1add_n.asm +index f421d13..543a637 100644 +--- a/mpn/x86/pentium4/sse2/rsh1add_n.asm ++++ b/mpn/x86/pentium4/sse2/rsh1add_n.asm +@@ -124,3 +124,4 @@ L(done): + ret + + EPILOGUE() ++ASM_END() +diff --git a/mpn/x86/pentium4/sse2/sqr_basecase.asm b/mpn/x86/pentium4/sse2/sqr_basecase.asm +index 2dd57d2..9695d42 100644 +--- a/mpn/x86/pentium4/sse2/sqr_basecase.asm ++++ b/mpn/x86/pentium4/sse2/sqr_basecase.asm +@@ -703,3 +703,4 @@ L(diag): + ret + + EPILOGUE() ++ASM_END() +diff --git a/mpn/x86/pentium4/sse2/sub_n.asm b/mpn/x86/pentium4/sse2/sub_n.asm +index 5ba1c01..2cd5b22 100644 +--- a/mpn/x86/pentium4/sse2/sub_n.asm ++++ b/mpn/x86/pentium4/sse2/sub_n.asm +@@ -117,3 +117,4 @@ L(done_mm1): + ret + + EPILOGUE() ++ASM_END() +diff --git a/mpn/x86/pentium4/sse2/submul_1.asm b/mpn/x86/pentium4/sse2/submul_1.asm +index 020675b..1172f0a 100644 +--- a/mpn/x86/pentium4/sse2/submul_1.asm ++++ b/mpn/x86/pentium4/sse2/submul_1.asm +@@ -180,3 +180,4 @@ L(eod): paddq %mm6, %mm4 C add 0xFFFFFFFE00000001 + movd %mm0, 8(%edx) C result + jmp L(rt) + EPILOGUE() ++ASM_END() +diff --git a/mpn/x86/rshift.asm b/mpn/x86/rshift.asm +index a60dcaa..1cedc0d 100644 +--- a/mpn/x86/rshift.asm ++++ b/mpn/x86/rshift.asm +@@ -106,3 +106,4 @@ L(end): shrl %cl,%ebx C compute most significant limb + ret + + EPILOGUE() ++ASM_END() +diff --git a/mpn/x86/sec_tabselect.asm b/mpn/x86/sec_tabselect.asm +index c7c2e05..3a8fa17 100644 +--- a/mpn/x86/sec_tabselect.asm ++++ b/mpn/x86/sec_tabselect.asm +@@ -113,3 +113,4 @@ L(outer_end): + pop %edi + ret + EPILOGUE() ++ASM_END() +diff --git a/mpn/x86/sqr_basecase.asm b/mpn/x86/sqr_basecase.asm +index 39f8a89..3414b05 100644 +--- a/mpn/x86/sqr_basecase.asm ++++ b/mpn/x86/sqr_basecase.asm +@@ -357,3 +357,4 @@ L(diag): + ret + + EPILOGUE() ++ASM_END() +diff --git a/mpn/x86/udiv.asm b/mpn/x86/udiv.asm +index a3ee088..2531ef7 100644 +--- a/mpn/x86/udiv.asm ++++ b/mpn/x86/udiv.asm +@@ -50,3 +50,4 @@ deflit(`FRAME',0) + movl %edx, (%ecx) + ret + EPILOGUE() ++ASM_END() +diff --git a/mpn/x86/umul.asm b/mpn/x86/umul.asm +index 34fe434..5c1da35 100644 +--- a/mpn/x86/umul.asm ++++ b/mpn/x86/umul.asm +@@ -49,3 +49,4 @@ deflit(`FRAME',0) + movl %edx, %eax + ret + EPILOGUE() ++ASM_END() +diff --git a/mpn/x86/x86-defs.m4 b/mpn/x86/x86-defs.m4 +index 81309b2..b3520d2 100644 +--- a/mpn/x86/x86-defs.m4 ++++ b/mpn/x86/x86-defs.m4 +@@ -123,6 +123,7 @@ m4_assert_defined(`WANT_PROFILING') + TYPE($1,`function') + COFF_TYPE($1) + $1: ++ X86_ENDBR + ifelse(WANT_PROFILING,`prof', ` call_mcount') + ifelse(WANT_PROFILING,`gprof', ` call_mcount') + ifelse(WANT_PROFILING,`instrument',` call_instrument(enter)') +@@ -992,7 +993,11 @@ L(movl_eip_`'substr($2,1)): + + dnl ASM_END + +-define(`ASM_END',`load_eip') ++define(`ASM_END', ++`load_eip ++X86_GNU_PROPERTY ++') ++ + + define(`load_eip', `') dnl updated in LEA/LEAL + +diff --git a/mpn/x86_64/addaddmul_1msb0.asm b/mpn/x86_64/addaddmul_1msb0.asm +index 87c21b4..2d03ddb 100644 +--- a/mpn/x86_64/addaddmul_1msb0.asm ++++ b/mpn/x86_64/addaddmul_1msb0.asm +@@ -168,3 +168,4 @@ L(end): cmp $1, R32(n) + pop %r12 + ret + EPILOGUE() ++ASM_END() +diff --git a/mpn/x86_64/aorrlsh1_n.asm b/mpn/x86_64/aorrlsh1_n.asm +index 6ee0872..1441a6c 100644 +--- 
a/mpn/x86_64/aorrlsh1_n.asm ++++ b/mpn/x86_64/aorrlsh1_n.asm +@@ -168,3 +168,4 @@ ifdef(`OPERATION_rsblsh1_n',` + FUNC_EXIT() + ret + EPILOGUE() ++ASM_END() +diff --git a/mpn/x86_64/aorrlshC_n.asm b/mpn/x86_64/aorrlshC_n.asm +index de00154..691abde 100644 +--- a/mpn/x86_64/aorrlshC_n.asm ++++ b/mpn/x86_64/aorrlshC_n.asm +@@ -170,3 +170,4 @@ ifelse(ADDSUB,add,` + FUNC_EXIT() + ret + EPILOGUE() ++ASM_END() +diff --git a/mpn/x86_64/aorrlsh_n.asm b/mpn/x86_64/aorrlsh_n.asm +index 5ca128f..57f0e77 100644 +--- a/mpn/x86_64/aorrlsh_n.asm ++++ b/mpn/x86_64/aorrlsh_n.asm +@@ -174,3 +174,4 @@ L(end): add R32(%rbx), R32(%rbx) + FUNC_EXIT() + ret + EPILOGUE() ++ASM_END() +diff --git a/mpn/x86_64/aors_err1_n.asm b/mpn/x86_64/aors_err1_n.asm +index 54d0b3f..8c42ea1 100644 +--- a/mpn/x86_64/aors_err1_n.asm ++++ b/mpn/x86_64/aors_err1_n.asm +@@ -223,3 +223,4 @@ L(end): + pop %rbx + ret + EPILOGUE() ++ASM_END() +diff --git a/mpn/x86_64/aors_err2_n.asm b/mpn/x86_64/aors_err2_n.asm +index ce5c2a4..0227e5d 100644 +--- a/mpn/x86_64/aors_err2_n.asm ++++ b/mpn/x86_64/aors_err2_n.asm +@@ -170,3 +170,4 @@ L(end): + pop %rbx + ret + EPILOGUE() ++ASM_END() +diff --git a/mpn/x86_64/aors_err3_n.asm b/mpn/x86_64/aors_err3_n.asm +index bb6d0c5..37047db 100644 +--- a/mpn/x86_64/aors_err3_n.asm ++++ b/mpn/x86_64/aors_err3_n.asm +@@ -154,3 +154,4 @@ L(end): + pop %rbx + ret + EPILOGUE() ++ASM_END() +diff --git a/mpn/x86_64/aors_n.asm b/mpn/x86_64/aors_n.asm +index d5a314a..b516c4d 100644 +--- a/mpn/x86_64/aors_n.asm ++++ b/mpn/x86_64/aors_n.asm +@@ -176,3 +176,4 @@ L(end): lea 32(up), up + FUNC_EXIT() + ret + EPILOGUE() ++ASM_END() +diff --git a/mpn/x86_64/aorsmul_1.asm b/mpn/x86_64/aorsmul_1.asm +index dfe4dc4..e3bb2f9 100644 +--- a/mpn/x86_64/aorsmul_1.asm ++++ b/mpn/x86_64/aorsmul_1.asm +@@ -188,3 +188,4 @@ IFDOS(``pop %rdi '') + IFDOS(``pop %rsi '') + ret + EPILOGUE() ++ASM_END() +diff --git a/mpn/x86_64/atom/addmul_2.asm b/mpn/x86_64/atom/addmul_2.asm +index c1dcdc4..c1d9451 100644 +--- a/mpn/x86_64/atom/addmul_2.asm ++++ b/mpn/x86_64/atom/addmul_2.asm +@@ -184,3 +184,4 @@ L(end): mul v1 + FUNC_EXIT() + ret + EPILOGUE() ++ASM_END() +diff --git a/mpn/x86_64/atom/aorrlsh1_n.asm b/mpn/x86_64/atom/aorrlsh1_n.asm +index f44de19..693a302 100644 +--- a/mpn/x86_64/atom/aorrlsh1_n.asm ++++ b/mpn/x86_64/atom/aorrlsh1_n.asm +@@ -236,3 +236,4 @@ IFDOS(` mov 56(%rsp), %r8 ') + sbb R32(%rbp), R32(%rbp) C save acy + jmp L(ent) + EPILOGUE() ++ASM_END() +diff --git a/mpn/x86_64/atom/aorrlsh2_n.asm b/mpn/x86_64/atom/aorrlsh2_n.asm +index 02fb29d..c6ded74 100644 +--- a/mpn/x86_64/atom/aorrlsh2_n.asm ++++ b/mpn/x86_64/atom/aorrlsh2_n.asm +@@ -189,3 +189,4 @@ ifdef(`OPERATION_rsblsh2_n',` + FUNC_EXIT() + ret + EPILOGUE() ++ASM_END() +diff --git a/mpn/x86_64/atom/lshift.asm b/mpn/x86_64/atom/lshift.asm +index 1b37d5d..894b912 100644 +--- a/mpn/x86_64/atom/lshift.asm ++++ b/mpn/x86_64/atom/lshift.asm +@@ -121,3 +121,4 @@ L(end): shl R8(%rcx), %r10 + FUNC_EXIT() + ret + EPILOGUE() ++ASM_END() +diff --git a/mpn/x86_64/atom/lshiftc.asm b/mpn/x86_64/atom/lshiftc.asm +index 7385f8f..40d8fff 100644 +--- a/mpn/x86_64/atom/lshiftc.asm ++++ b/mpn/x86_64/atom/lshiftc.asm +@@ -125,3 +125,4 @@ L(end): shl R8(%rcx), %r10 + FUNC_EXIT() + ret + EPILOGUE() ++ASM_END() +diff --git a/mpn/x86_64/atom/mul_2.asm b/mpn/x86_64/atom/mul_2.asm +index 4bc22cd..87414d9 100644 +--- a/mpn/x86_64/atom/mul_2.asm ++++ b/mpn/x86_64/atom/mul_2.asm +@@ -188,3 +188,4 @@ L(end): mul v1 + FUNC_EXIT() + ret + EPILOGUE() ++ASM_END() +diff --git a/mpn/x86_64/atom/rsh1aors_n.asm 
b/mpn/x86_64/atom/rsh1aors_n.asm +index 6f5f638..f3952c0 100644 +--- a/mpn/x86_64/atom/rsh1aors_n.asm ++++ b/mpn/x86_64/atom/rsh1aors_n.asm +@@ -285,3 +285,4 @@ L(cj1): pop %r15 + FUNC_EXIT() + ret + EPILOGUE() ++ASM_END() +diff --git a/mpn/x86_64/atom/rshift.asm b/mpn/x86_64/atom/rshift.asm +index 29c027d..f4c59e1 100644 +--- a/mpn/x86_64/atom/rshift.asm ++++ b/mpn/x86_64/atom/rshift.asm +@@ -119,3 +119,4 @@ L(end): shr R8(cnt), %r10 + FUNC_EXIT() + ret + EPILOGUE() ++ASM_END() +diff --git a/mpn/x86_64/atom/sublsh1_n.asm b/mpn/x86_64/atom/sublsh1_n.asm +index 1306acd..762e1ee 100644 +--- a/mpn/x86_64/atom/sublsh1_n.asm ++++ b/mpn/x86_64/atom/sublsh1_n.asm +@@ -240,3 +240,4 @@ IFDOS(` mov 56(%rsp), %r8 ') + sbb R32(%rbp), R32(%rbp) C save acy + jmp L(ent) + EPILOGUE() ++ASM_END() +diff --git a/mpn/x86_64/bd1/addmul_2.asm b/mpn/x86_64/bd1/addmul_2.asm +index b54e91a..b1c149b 100644 +--- a/mpn/x86_64/bd1/addmul_2.asm ++++ b/mpn/x86_64/bd1/addmul_2.asm +@@ -233,3 +233,4 @@ L(end): mul v0 + FUNC_EXIT() + ret + EPILOGUE() ++ASM_END() +diff --git a/mpn/x86_64/bd1/hamdist.asm b/mpn/x86_64/bd1/hamdist.asm +index 29e78a3..f93ce4d 100644 +--- a/mpn/x86_64/bd1/hamdist.asm ++++ b/mpn/x86_64/bd1/hamdist.asm +@@ -204,3 +204,4 @@ DEF_OBJECT(L(cnsts),16,`JUMPTABSECT') + .byte 0x0f,0x0f,0x0f,0x0f,0x0f,0x0f,0x0f,0x0f + END_OBJECT(L(cnsts)) + ') ++ASM_END() +diff --git a/mpn/x86_64/bd1/mul_2.asm b/mpn/x86_64/bd1/mul_2.asm +index 85fa7aa..e910cee 100644 +--- a/mpn/x86_64/bd1/mul_2.asm ++++ b/mpn/x86_64/bd1/mul_2.asm +@@ -193,3 +193,4 @@ L(end): mov -8(up), %rax + FUNC_EXIT() + ret + EPILOGUE() ++ASM_END() +diff --git a/mpn/x86_64/bd1/mul_basecase.asm b/mpn/x86_64/bd1/mul_basecase.asm +index e47ba58..ebae74d 100644 +--- a/mpn/x86_64/bd1/mul_basecase.asm ++++ b/mpn/x86_64/bd1/mul_basecase.asm +@@ -414,3 +414,4 @@ L(ret2):pop %rbp + FUNC_EXIT() + ret + EPILOGUE() ++ASM_END() +diff --git a/mpn/x86_64/bd1/popcount.asm b/mpn/x86_64/bd1/popcount.asm +index 28ce461..063c2cc 100644 +--- a/mpn/x86_64/bd1/popcount.asm ++++ b/mpn/x86_64/bd1/popcount.asm +@@ -189,3 +189,4 @@ DEF_OBJECT(L(cnsts),16,`JUMPTABSECT') + .byte 0x0f,0x0f,0x0f,0x0f,0x0f,0x0f,0x0f,0x0f + END_OBJECT(L(cnsts)) + ') ++ASM_END() +diff --git a/mpn/x86_64/bd2/gcd_11.asm b/mpn/x86_64/bd2/gcd_11.asm +index b167077..3d1c788 100644 +--- a/mpn/x86_64/bd2/gcd_11.asm ++++ b/mpn/x86_64/bd2/gcd_11.asm +@@ -94,3 +94,4 @@ L(end): mov v0, %rax + FUNC_EXIT() + ret + EPILOGUE() ++ASM_END() +diff --git a/mpn/x86_64/bd2/gcd_22.asm b/mpn/x86_64/bd2/gcd_22.asm +index a4f30ea..b886678 100644 +--- a/mpn/x86_64/bd2/gcd_22.asm ++++ b/mpn/x86_64/bd2/gcd_22.asm +@@ -140,3 +140,4 @@ L(end): C mov v0, %rax + FUNC_EXIT() + ret + EPILOGUE() ++ASM_END() +diff --git a/mpn/x86_64/bd4/gcd_11.asm b/mpn/x86_64/bd4/gcd_11.asm +index 4176b85..d172e32 100644 +--- a/mpn/x86_64/bd4/gcd_11.asm ++++ b/mpn/x86_64/bd4/gcd_11.asm +@@ -94,3 +94,4 @@ L(end): C rax = result + FUNC_EXIT() + ret + EPILOGUE() ++ASM_END() +diff --git a/mpn/x86_64/bdiv_dbm1c.asm b/mpn/x86_64/bdiv_dbm1c.asm +index a53bd52..c383ee3 100644 +--- a/mpn/x86_64/bdiv_dbm1c.asm ++++ b/mpn/x86_64/bdiv_dbm1c.asm +@@ -104,3 +104,4 @@ L(lo1): sub %rax, %r8 + FUNC_EXIT() + ret + EPILOGUE() ++ASM_END() +diff --git a/mpn/x86_64/bdiv_q_1.asm b/mpn/x86_64/bdiv_q_1.asm +index 85538c9..c983c7f 100644 +--- a/mpn/x86_64/bdiv_q_1.asm ++++ b/mpn/x86_64/bdiv_q_1.asm +@@ -193,3 +193,4 @@ L(one): shr R8(%rcx), %rax + FUNC_EXIT() + ret + EPILOGUE() ++ASM_END() +diff --git a/mpn/x86_64/bt1/aors_n.asm b/mpn/x86_64/bt1/aors_n.asm +index 
9b6b5c7..04d81dd 100644 +--- a/mpn/x86_64/bt1/aors_n.asm ++++ b/mpn/x86_64/bt1/aors_n.asm +@@ -157,3 +157,4 @@ PROLOGUE(func_nc) + IFDOS(` mov 56(%rsp), %r8 ') + jmp L(ent) + EPILOGUE() ++ASM_END() +diff --git a/mpn/x86_64/bt1/aorsmul_1.asm b/mpn/x86_64/bt1/aorsmul_1.asm +index 41e1d8a..d309321 100644 +--- a/mpn/x86_64/bt1/aorsmul_1.asm ++++ b/mpn/x86_64/bt1/aorsmul_1.asm +@@ -189,3 +189,4 @@ IFDOS(` pop %rdi ') + IFDOS(` pop %rsi ') + ret + EPILOGUE() ++ASM_END() +diff --git a/mpn/x86_64/bt1/copyd.asm b/mpn/x86_64/bt1/copyd.asm +index 877714e..23fb80b 100644 +--- a/mpn/x86_64/bt1/copyd.asm ++++ b/mpn/x86_64/bt1/copyd.asm +@@ -89,3 +89,4 @@ L(end): cmp $-4, R32(n) + L(ret): FUNC_EXIT() + ret + EPILOGUE() ++ASM_END() +diff --git a/mpn/x86_64/bt1/copyi.asm b/mpn/x86_64/bt1/copyi.asm +index ee0f578..25718e6 100644 +--- a/mpn/x86_64/bt1/copyi.asm ++++ b/mpn/x86_64/bt1/copyi.asm +@@ -92,3 +92,4 @@ L(end): cmp $4, R32(n) + L(ret): FUNC_EXIT() + ret + EPILOGUE() ++ASM_END() +diff --git a/mpn/x86_64/bt1/gcd_11.asm b/mpn/x86_64/bt1/gcd_11.asm +index ef53392..03bc06d 100644 +--- a/mpn/x86_64/bt1/gcd_11.asm ++++ b/mpn/x86_64/bt1/gcd_11.asm +@@ -117,3 +117,4 @@ L(count_better): + bsf u0, cnt + jmp L(shr) + EPILOGUE() ++ASM_END() +diff --git a/mpn/x86_64/bt1/mul_1.asm b/mpn/x86_64/bt1/mul_1.asm +index 4394d6e..634cb35 100644 +--- a/mpn/x86_64/bt1/mul_1.asm ++++ b/mpn/x86_64/bt1/mul_1.asm +@@ -239,3 +239,4 @@ IFDOS(` pop %rdi ') + IFDOS(` pop %rsi ') + ret + EPILOGUE() ++ASM_END() +diff --git a/mpn/x86_64/bt1/mul_basecase.asm b/mpn/x86_64/bt1/mul_basecase.asm +index e7d46bf..1726190 100644 +--- a/mpn/x86_64/bt1/mul_basecase.asm ++++ b/mpn/x86_64/bt1/mul_basecase.asm +@@ -484,3 +484,4 @@ L(ret): pop %r13 + FUNC_EXIT() + ret + EPILOGUE() ++ASM_END() +diff --git a/mpn/x86_64/bt1/sqr_basecase.asm b/mpn/x86_64/bt1/sqr_basecase.asm +index 0e417a1..8f665d1 100644 +--- a/mpn/x86_64/bt1/sqr_basecase.asm ++++ b/mpn/x86_64/bt1/sqr_basecase.asm +@@ -563,3 +563,4 @@ L(esd): add %rbx, w0 + FUNC_EXIT() + ret + EPILOGUE() ++ASM_END() +diff --git a/mpn/x86_64/cnd_aors_n.asm b/mpn/x86_64/cnd_aors_n.asm +index 13a2ab3..b720ecb 100644 +--- a/mpn/x86_64/cnd_aors_n.asm ++++ b/mpn/x86_64/cnd_aors_n.asm +@@ -181,3 +181,4 @@ L(end): neg R32(%rax) + FUNC_EXIT() + ret + EPILOGUE() ++ASM_END() +diff --git a/mpn/x86_64/com.asm b/mpn/x86_64/com.asm +index 006acaf..ec72e19 100644 +--- a/mpn/x86_64/com.asm ++++ b/mpn/x86_64/com.asm +@@ -93,3 +93,4 @@ L(e10): movq 24(up,n,8), %r9 + L(ret): FUNC_EXIT() + ret + EPILOGUE() ++ASM_END() +diff --git a/mpn/x86_64/copyd.asm b/mpn/x86_64/copyd.asm +index a5e6e59..02ab53f 100644 +--- a/mpn/x86_64/copyd.asm ++++ b/mpn/x86_64/copyd.asm +@@ -91,3 +91,4 @@ L(end): shr R32(n) + mov %r9, -16(rp) + 1: ret + EPILOGUE() ++ASM_END() +diff --git a/mpn/x86_64/copyi.asm b/mpn/x86_64/copyi.asm +index bafce7a..8c6dbdc 100644 +--- a/mpn/x86_64/copyi.asm ++++ b/mpn/x86_64/copyi.asm +@@ -90,3 +90,4 @@ L(end): shr R32(n) + mov %r9, 16(rp) + 1: ret + EPILOGUE() ++ASM_END() +diff --git a/mpn/x86_64/core2/aors_err1_n.asm b/mpn/x86_64/core2/aors_err1_n.asm +index 3f875ae..c9c6c36 100644 +--- a/mpn/x86_64/core2/aors_err1_n.asm ++++ b/mpn/x86_64/core2/aors_err1_n.asm +@@ -223,3 +223,4 @@ L(end): + pop %rbx + ret + EPILOGUE() ++ASM_END() +diff --git a/mpn/x86_64/core2/aors_n.asm b/mpn/x86_64/core2/aors_n.asm +index f9e0039..7981b7f 100644 +--- a/mpn/x86_64/core2/aors_n.asm ++++ b/mpn/x86_64/core2/aors_n.asm +@@ -148,3 +148,4 @@ PROLOGUE(func_nc) + IFDOS(` mov 56(%rsp), %r8 ') + jmp L(start) + EPILOGUE() ++ASM_END() +diff 
--git a/mpn/x86_64/core2/aorsmul_1.asm b/mpn/x86_64/core2/aorsmul_1.asm +index a7a5d6e..b2b067a 100644 +--- a/mpn/x86_64/core2/aorsmul_1.asm ++++ b/mpn/x86_64/core2/aorsmul_1.asm +@@ -186,3 +186,4 @@ L(n1): mov 8(rp), %r10 + FUNC_EXIT() + ret + EPILOGUE() ++ASM_END() +diff --git a/mpn/x86_64/core2/divrem_1.asm b/mpn/x86_64/core2/divrem_1.asm +index 1b3f139..d41c494 100644 +--- a/mpn/x86_64/core2/divrem_1.asm ++++ b/mpn/x86_64/core2/divrem_1.asm +@@ -241,3 +241,4 @@ L(ret): pop %rbx + FUNC_EXIT() + ret + EPILOGUE() ++ASM_END() +diff --git a/mpn/x86_64/core2/gcd_11.asm b/mpn/x86_64/core2/gcd_11.asm +index b00451f..b730a55 100644 +--- a/mpn/x86_64/core2/gcd_11.asm ++++ b/mpn/x86_64/core2/gcd_11.asm +@@ -91,3 +91,4 @@ L(end): C rax = result + FUNC_EXIT() + ret + EPILOGUE() ++ASM_END() +diff --git a/mpn/x86_64/core2/gcd_22.asm b/mpn/x86_64/core2/gcd_22.asm +index b5aa73b..0ccde8a 100644 +--- a/mpn/x86_64/core2/gcd_22.asm ++++ b/mpn/x86_64/core2/gcd_22.asm +@@ -135,3 +135,4 @@ L(end): C mov v0, %rax + FUNC_EXIT() + ret + EPILOGUE() ++ASM_END() +diff --git a/mpn/x86_64/core2/hamdist.asm b/mpn/x86_64/core2/hamdist.asm +index a78753d..be451d7 100644 +--- a/mpn/x86_64/core2/hamdist.asm ++++ b/mpn/x86_64/core2/hamdist.asm +@@ -208,3 +208,4 @@ DEF_OBJECT(L(cnsts),16,`JUMPTABSECT') + .byte 0x0f,0x0f,0x0f,0x0f,0x0f,0x0f,0x0f,0x0f + .byte 0x0f,0x0f,0x0f,0x0f,0x0f,0x0f,0x0f,0x0f + END_OBJECT(L(cnsts)) ++ASM_END() +diff --git a/mpn/x86_64/core2/logops_n.asm b/mpn/x86_64/core2/logops_n.asm +index 5ff174c..451d556 100644 +--- a/mpn/x86_64/core2/logops_n.asm ++++ b/mpn/x86_64/core2/logops_n.asm +@@ -283,3 +283,4 @@ L(ret): FUNC_EXIT() + ret + EPILOGUE() + ') ++ASM_END() +diff --git a/mpn/x86_64/core2/lshift.asm b/mpn/x86_64/core2/lshift.asm +index 9016a71..62053c2 100644 +--- a/mpn/x86_64/core2/lshift.asm ++++ b/mpn/x86_64/core2/lshift.asm +@@ -143,3 +143,4 @@ L(1): shl R8(cnt), %r9 + FUNC_EXIT() + ret + EPILOGUE() ++ASM_END() +diff --git a/mpn/x86_64/core2/lshiftc.asm b/mpn/x86_64/core2/lshiftc.asm +index c428f13..cdd4e11 100644 +--- a/mpn/x86_64/core2/lshiftc.asm ++++ b/mpn/x86_64/core2/lshiftc.asm +@@ -157,3 +157,4 @@ L(1): shl R8(cnt), %r9 + FUNC_EXIT() + ret + EPILOGUE() ++ASM_END() +diff --git a/mpn/x86_64/core2/mul_basecase.asm b/mpn/x86_64/core2/mul_basecase.asm +index d16be85..0dcf0f8 100644 +--- a/mpn/x86_64/core2/mul_basecase.asm ++++ b/mpn/x86_64/core2/mul_basecase.asm +@@ -347,6 +347,7 @@ L(m2e0):mul v1 + jz L(ret2) + + L(do_am0): ++ X86_ENDBR + push %r15 + push vn_param + +@@ -520,6 +521,7 @@ L(m2e1):mul v1 + jz L(ret2) + + L(do_am1): ++ X86_ENDBR + push %r15 + push vn_param + +@@ -693,6 +695,7 @@ L(m2e2):mul v1 + jz L(ret2) + + L(do_am2): ++ X86_ENDBR + push %r15 + push vn_param + +@@ -866,6 +869,7 @@ L(m2e3):mul v1 + jz L(ret2) + + L(do_am3): ++ X86_ENDBR + push %r15 + push vn_param + +@@ -973,3 +977,4 @@ L(lo3): mul v0 + FUNC_EXIT() + ret + EPILOGUE() ++ASM_END() +diff --git a/mpn/x86_64/core2/mullo_basecase.asm b/mpn/x86_64/core2/mullo_basecase.asm +index 0f03d86..11814d5 100644 +--- a/mpn/x86_64/core2/mullo_basecase.asm ++++ b/mpn/x86_64/core2/mullo_basecase.asm +@@ -425,3 +425,4 @@ L(n3): mov (vp_param), %r9 + FUNC_EXIT() + ret + EPILOGUE() ++ASM_END() +diff --git a/mpn/x86_64/core2/popcount.asm b/mpn/x86_64/core2/popcount.asm +index 39d8c5d..5e03ef3 100644 +--- a/mpn/x86_64/core2/popcount.asm ++++ b/mpn/x86_64/core2/popcount.asm +@@ -183,3 +183,4 @@ DEF_OBJECT(L(cnsts),16,`JUMPTABSECT') + .byte 0x0f,0x0f,0x0f,0x0f,0x0f,0x0f,0x0f,0x0f + .byte 0x0f,0x0f,0x0f,0x0f,0x0f,0x0f,0x0f,0x0f + 
END_OBJECT(L(cnsts)) ++ASM_END() +diff --git a/mpn/x86_64/core2/rsh1aors_n.asm b/mpn/x86_64/core2/rsh1aors_n.asm +index 27eed37..5b4fe7e 100644 +--- a/mpn/x86_64/core2/rsh1aors_n.asm ++++ b/mpn/x86_64/core2/rsh1aors_n.asm +@@ -167,3 +167,4 @@ L(end): shrd $1, %rbx, %rbp + FUNC_EXIT() + ret + EPILOGUE() ++ASM_END() +diff --git a/mpn/x86_64/core2/rshift.asm b/mpn/x86_64/core2/rshift.asm +index 7578a53..86cc804 100644 +--- a/mpn/x86_64/core2/rshift.asm ++++ b/mpn/x86_64/core2/rshift.asm +@@ -141,3 +141,4 @@ L(1): shr R8(cnt), %r9 + FUNC_EXIT() + ret + EPILOGUE() ++ASM_END() +diff --git a/mpn/x86_64/core2/sqr_basecase.asm b/mpn/x86_64/core2/sqr_basecase.asm +index a112c1b..65286b0 100644 +--- a/mpn/x86_64/core2/sqr_basecase.asm ++++ b/mpn/x86_64/core2/sqr_basecase.asm +@@ -982,3 +982,4 @@ L(n3): mov %rax, %r10 + FUNC_EXIT() + ret + EPILOGUE() ++ASM_END() +diff --git a/mpn/x86_64/core2/sublshC_n.asm b/mpn/x86_64/core2/sublshC_n.asm +index 272700d..e30562b 100644 +--- a/mpn/x86_64/core2/sublshC_n.asm ++++ b/mpn/x86_64/core2/sublshC_n.asm +@@ -156,3 +156,4 @@ L(end): shr $RSH, %r11 + FUNC_EXIT() + ret + EPILOGUE() ++ASM_END() +diff --git a/mpn/x86_64/coreibwl/addmul_1.asm b/mpn/x86_64/coreibwl/addmul_1.asm +index ee7e4ee..4ea5580 100644 +--- a/mpn/x86_64/coreibwl/addmul_1.asm ++++ b/mpn/x86_64/coreibwl/addmul_1.asm +@@ -110,33 +110,39 @@ L(tab): JMPENT( L(f0), L(tab)) + JMPENT( L(f7), L(tab)) + TEXT + +-L(f0): mulx( (up), %r10, %r8) ++L(f0): X86_ENDBR ++ mulx( (up), %r10, %r8) + lea -8(up), up + lea -8(rp), rp + lea -1(n), n + jmp L(b0) + +-L(f3): mulx( (up), %r9, %rax) ++L(f3): X86_ENDBR ++ mulx( (up), %r9, %rax) + lea 16(up), up + lea -48(rp), rp + jmp L(b3) + +-L(f4): mulx( (up), %r10, %r8) ++L(f4): X86_ENDBR ++ mulx( (up), %r10, %r8) + lea 24(up), up + lea -40(rp), rp + jmp L(b4) + +-L(f5): mulx( (up), %r9, %rax) ++L(f5): X86_ENDBR ++ mulx( (up), %r9, %rax) + lea 32(up), up + lea -32(rp), rp + jmp L(b5) + +-L(f6): mulx( (up), %r10, %r8) ++L(f6): X86_ENDBR ++ mulx( (up), %r10, %r8) + lea 40(up), up + lea -24(rp), rp + jmp L(b6) + +-L(f1): mulx( (up), %r9, %rax) ++L(f1): X86_ENDBR ++ mulx( (up), %r9, %rax) + jrcxz L(1) + jmp L(b1) + L(1): add (rp), %r9 +@@ -156,7 +162,8 @@ ifdef(`PIC', + ` nop;nop;nop;nop', + ` nop;nop;nop;nop;nop;nop;nop;nop;nop;nop;nop') + +-L(f2): mulx( (up), %r10, %r8) ++L(f2): X86_ENDBR ++ mulx( (up), %r10, %r8) + lea 8(up), up + lea 8(rp), rp + mulx( (up), %r9, %rax) +@@ -200,7 +207,8 @@ L(b3): adox( 48,(rp), %r9) + mulx( (up), %r9, %rax) + jmp L(top) + +-L(f7): mulx( (up), %r9, %rax) ++L(f7): X86_ENDBR ++ mulx( (up), %r9, %rax) + lea -16(up), up + lea -16(rp), rp + jmp L(b7) +diff --git a/mpn/x86_64/coreibwl/mul_1.asm b/mpn/x86_64/coreibwl/mul_1.asm +index b7fae2f..77121a5 100644 +--- a/mpn/x86_64/coreibwl/mul_1.asm ++++ b/mpn/x86_64/coreibwl/mul_1.asm +@@ -108,48 +108,56 @@ L(tab): JMPENT( L(f0), L(tab)) + JMPENT( L(f7), L(tab)) + TEXT + +-L(f0): mulx( (up), %r10, %r8) ++L(f0): X86_ENDBR ++ mulx( (up), %r10, %r8) + lea 56(up), up + lea -8(rp), rp + jmp L(b0) + +-L(f3): mulx( (up), %r9, %rax) ++L(f3): X86_ENDBR ++ mulx( (up), %r9, %rax) + lea 16(up), up + lea 16(rp), rp + inc n + jmp L(b3) + +-L(f4): mulx( (up), %r10, %r8) ++L(f4): X86_ENDBR ++ mulx( (up), %r10, %r8) + lea 24(up), up + lea 24(rp), rp + inc n + jmp L(b4) + +-L(f5): mulx( (up), %r9, %rax) ++L(f5): X86_ENDBR ++ mulx( (up), %r9, %rax) + lea 32(up), up + lea 32(rp), rp + inc n + jmp L(b5) + +-L(f6): mulx( (up), %r10, %r8) ++L(f6): X86_ENDBR ++ mulx( (up), %r10, %r8) + lea 40(up), up + lea 40(rp), rp + inc n + 
jmp L(b6) + +-L(f7): mulx( (up), %r9, %rax) ++L(f7): X86_ENDBR ++ mulx( (up), %r9, %rax) + lea 48(up), up + lea 48(rp), rp + inc n + jmp L(b7) + +-L(f1): mulx( (up), %r9, %rax) ++L(f1): X86_ENDBR ++ mulx( (up), %r9, %rax) + test n, n + jnz L(b1) + L(1): mov %r9, (rp) + ret + +-L(f2): mulx( (up), %r10, %r8) ++L(f2): X86_ENDBR ++ mulx( (up), %r10, %r8) + lea 8(up), up + lea 8(rp), rp + mulx( (up), %r9, %rax) +diff --git a/mpn/x86_64/coreibwl/mul_basecase.asm b/mpn/x86_64/coreibwl/mul_basecase.asm +index 42ca976..c5e60e7 100644 +--- a/mpn/x86_64/coreibwl/mul_basecase.asm ++++ b/mpn/x86_64/coreibwl/mul_basecase.asm +@@ -157,45 +157,53 @@ ifdef(`PIC', + jmp *(%r10,%rax,8) + ') + +-L(mf0): mulx( (up), w2, w3) ++L(mf0): X86_ENDBR ++ mulx( (up), w2, w3) + lea 56(up), up + lea -8(rp), rp + jmp L(mb0) + +-L(mf3): mulx( (up), w0, w1) ++L(mf3): X86_ENDBR ++ mulx( (up), w0, w1) + lea 16(up), up + lea 16(rp), rp + inc n + jmp L(mb3) + +-L(mf4): mulx( (up), w2, w3) ++L(mf4): X86_ENDBR ++ mulx( (up), w2, w3) + lea 24(up), up + lea 24(rp), rp + inc n + jmp L(mb4) + +-L(mf5): mulx( (up), w0, w1) ++L(mf5): X86_ENDBR ++ mulx( (up), w0, w1) + lea 32(up), up + lea 32(rp), rp + inc n + jmp L(mb5) + +-L(mf6): mulx( (up), w2, w3) ++L(mf6): X86_ENDBR ++ mulx( (up), w2, w3) + lea 40(up), up + lea 40(rp), rp + inc n + jmp L(mb6) + +-L(mf7): mulx( (up), w0, w1) ++L(mf7): X86_ENDBR ++ mulx( (up), w0, w1) + lea 48(up), up + lea 48(rp), rp + inc n + jmp L(mb7) + +-L(mf1): mulx( (up), w0, w1) ++L(mf1): X86_ENDBR ++ mulx( (up), w0, w1) + jmp L(mb1) + +-L(mf2): mulx( (up), w2, w3) ++L(mf2): X86_ENDBR ++ mulx( (up), w2, w3) + lea 8(up), up + lea 8(rp), rp + mulx( (up), w0, w1) +@@ -256,32 +264,39 @@ L(outer): + lea 8(vp), vp + jmp *jaddr + +-L(f0): mulx( 8,(up), w2, w3) ++L(f0): X86_ENDBR ++ mulx( 8,(up), w2, w3) + lea 8(rp,unneg,8), rp + lea -1(n), n + jmp L(b0) + +-L(f3): mulx( -16,(up), w0, w1) ++L(f3): X86_ENDBR ++ mulx( -16,(up), w0, w1) + lea -56(rp,unneg,8), rp + jmp L(b3) + +-L(f4): mulx( -24,(up), w2, w3) ++L(f4): X86_ENDBR ++ mulx( -24,(up), w2, w3) + lea -56(rp,unneg,8), rp + jmp L(b4) + +-L(f5): mulx( -32,(up), w0, w1) ++L(f5): X86_ENDBR ++ mulx( -32,(up), w0, w1) + lea -56(rp,unneg,8), rp + jmp L(b5) + +-L(f6): mulx( -40,(up), w2, w3) ++L(f6): X86_ENDBR ++ mulx( -40,(up), w2, w3) + lea -56(rp,unneg,8), rp + jmp L(b6) + +-L(f7): mulx( 16,(up), w0, w1) ++L(f7): X86_ENDBR ++ mulx( 16,(up), w0, w1) + lea 8(rp,unneg,8), rp + jmp L(b7) + +-L(f1): mulx( (up), w0, w1) ++L(f1): X86_ENDBR ++ mulx( (up), w0, w1) + lea 8(rp,unneg,8), rp + jmp L(b1) + +@@ -303,6 +318,7 @@ L(done): + ret + + L(f2): ++ X86_ENDBR + mulx( -8,(up), w2, w3) + lea 8(rp,unneg,8), rp + mulx( (up), w0, w1) +@@ -367,3 +383,4 @@ L(atab):JMPENT( L(f0), L(atab)) + JMPENT( L(f7), L(atab)) + TEXT + EPILOGUE() ++ASM_END() +diff --git a/mpn/x86_64/coreibwl/mullo_basecase.asm b/mpn/x86_64/coreibwl/mullo_basecase.asm +index 5cdb209..b3e435b 100644 +--- a/mpn/x86_64/coreibwl/mullo_basecase.asm ++++ b/mpn/x86_64/coreibwl/mullo_basecase.asm +@@ -393,3 +393,4 @@ L(mtab):JMPENT( L(mf7), L(mtab)) + JMPENT( L(mf4), L(mtab)) + JMPENT( L(mf5), L(mtab)) + JMPENT( L(mf6), L(mtab)) ++ASM_END() +diff --git a/mpn/x86_64/coreibwl/sqr_basecase.asm b/mpn/x86_64/coreibwl/sqr_basecase.asm +index e81b01b..cd523cf 100644 +--- a/mpn/x86_64/coreibwl/sqr_basecase.asm ++++ b/mpn/x86_64/coreibwl/sqr_basecase.asm +@@ -181,14 +181,16 @@ ifdef(`PIC', + jmp *(%r10,%rax,8) + ') + +-L(mf0): mulx( u0, w0, w1) C up[0]^2 ++L(mf0): X86_ENDBR ++ mulx( u0, w0, w1) C up[0]^2 + add u0, u0 + mulx( 
8,(up), w2, w3) + lea 64(up), up + add w1, w2 + jmp L(mb0) + +-L(mf3): mulx( u0, w2, w3) C up[0]^2 ++L(mf3): X86_ENDBR ++ mulx( u0, w2, w3) C up[0]^2 + add u0, u0 + mov w2, (rp) + mulx( 8,(up), w0, w1) +@@ -197,7 +199,8 @@ L(mf3): mulx( u0, w2, w3) C up[0]^2 + add w3, w0 + jmp L(mb3) + +-L(mf4): mulx( u0, w0, w1) C up[0]^2 ++L(mf4): X86_ENDBR ++ mulx( u0, w0, w1) C up[0]^2 + add u0, u0 + mulx( 8,(up), w2, w3) + mov w0, (rp) +@@ -206,7 +209,8 @@ L(mf4): mulx( u0, w0, w1) C up[0]^2 + add w1, w2 + jmp L(mb4) + +-L(mf5): mulx( u0, w2, w3) C up[0]^2 ++L(mf5): X86_ENDBR ++ mulx( u0, w2, w3) C up[0]^2 + add u0, u0 + mulx( 8,(up), w0, w1) + mov w2, (rp) +@@ -215,7 +219,8 @@ L(mf5): mulx( u0, w2, w3) C up[0]^2 + add w3, w0 + jmp L(mb5) + +-L(mf6): mulx( u0, w0, w1) C up[0]^2 ++L(mf6): X86_ENDBR ++ mulx( u0, w0, w1) C up[0]^2 + add u0, u0 + mulx( 8,(up), w2, w3) + mov w0, (rp) +@@ -224,7 +229,8 @@ L(mf6): mulx( u0, w0, w1) C up[0]^2 + add w1, w2 + jmp L(mb6) + +-L(mf7): mulx( u0, w2, w3) C up[0]^2 ++L(mf7): X86_ENDBR ++ mulx( u0, w2, w3) C up[0]^2 + add u0, u0 + mulx( 8,(up), w0, w1) + mov w2, (rp) +@@ -233,7 +239,8 @@ L(mf7): mulx( u0, w2, w3) C up[0]^2 + add w3, w0 + jmp L(mb7) + +-L(mf1): mulx( u0, w2, w3) C up[0]^2 ++L(mf1): X86_ENDBR ++ mulx( u0, w2, w3) C up[0]^2 + add u0, u0 + mulx( 8,(up), w0, w1) + mov w2, (rp) +@@ -242,7 +249,8 @@ L(mf1): mulx( u0, w2, w3) C up[0]^2 + add w3, w0 + jmp L(mb1) + +-L(mf2): mulx( u0, w0, w1) C up[0]^2 ++L(mf2): X86_ENDBR ++ mulx( u0, w0, w1) C up[0]^2 + add u0, u0 + mulx( 8,(up), w2, w3) + mov w0, (rp) +@@ -300,7 +308,8 @@ ifdef(`PIC', + + L(ed0): adox( (rp), w0) + adox( %rcx, w1) C relies on rcx = 0 +-L(f7): mov w0, (rp) ++L(f7): X86_ENDBR ++ mov w0, (rp) + adc %rcx, w1 C relies on rcx = 0 + mov w1, 8(rp) + lea -64(up,un_save,8), up +@@ -356,7 +365,8 @@ L(b0): mov w0, (rp) + + L(ed1): adox( (rp), w0) + adox( %rcx, w1) C relies on rcx = 0 +-L(f0): mov w0, (rp) ++L(f0): X86_ENDBR ++ mov w0, (rp) + adc %rcx, w1 C relies on rcx = 0 + mov w1, 8(rp) + lea -64(up,un_save,8), up +@@ -415,7 +425,8 @@ L(b1): mulx( 8,(up), w2, w3) + + L(ed2): adox( (rp), w0) + adox( %rcx, w1) C relies on rcx = 0 +-L(f1): mov w0, (rp) ++L(f1): X86_ENDBR ++ mov w0, (rp) + adc %rcx, w1 C relies on rcx = 0 + mov w1, 8(rp) + lea (up,un_save,8), up +@@ -477,7 +488,8 @@ L(b2): adox( 48,(rp), w0) + + L(ed3): adox( (rp), w0) + adox( %rcx, w1) C relies on rcx = 0 +-L(f2): mov w0, (rp) ++L(f2): X86_ENDBR ++ mov w0, (rp) + adc %rcx, w1 C relies on rcx = 0 + mov w1, 8(rp) + lea (up,un_save,8), up +@@ -535,7 +547,8 @@ L(b3): mulx( -16,(up), w0, w1) + + L(ed4): adox( (rp), w0) + adox( %rcx, w1) C relies on rcx = 0 +-L(f3): mov w0, (rp) ++L(f3): X86_ENDBR ++ mov w0, (rp) + adc %rcx, w1 C relies on rcx = 0 + mov w1, 8(rp) + lea (up,un_save,8), up +@@ -592,7 +605,8 @@ L(b4): mulx( -24,(up), w2, w3) + + L(ed5): adox( (rp), w0) + adox( %rcx, w1) C relies on rcx = 0 +-L(f4): mov w0, (rp) ++L(f4): X86_ENDBR ++ mov w0, (rp) + adc %rcx, w1 C relies on rcx = 0 + mov w1, 8(rp) + lea (up,un_save,8), up +@@ -649,7 +663,8 @@ L(b5): mulx( -32,(up), w0, w1) + + L(ed6): adox( (rp), w0) + adox( %rcx, w1) C relies on rcx = 0 +-L(f5): mov w0, (rp) ++L(f5): X86_ENDBR ++ mov w0, (rp) + adc %rcx, w1 C relies on rcx = 0 + mov w1, 8(rp) + lea (up,un_save,8), up +@@ -706,7 +721,8 @@ L(b6): adcx( w1, w2) + + L(ed7): adox( (rp), w0) + adox( %rcx, w1) C relies on rcx = 0 +-L(f6): mov w0, (rp) ++L(f6): X86_ENDBR ++ mov w0, (rp) + adc %rcx, w1 C relies on rcx = 0 + mov w1, 8(rp) + lea (up,un_save,8), up +@@ -837,3 +853,4 @@ 
L(atab):JMPENT( L(f6), L(atab)) + JMPENT( L(f5), L(atab)) + TEXT + EPILOGUE() ++ASM_END() +diff --git a/mpn/x86_64/coreihwl/addmul_2.asm b/mpn/x86_64/coreihwl/addmul_2.asm +index 9d1c405..322037e 100644 +--- a/mpn/x86_64/coreihwl/addmul_2.asm ++++ b/mpn/x86_64/coreihwl/addmul_2.asm +@@ -239,3 +239,4 @@ L(end): mulx( v0, %rax, w3) + FUNC_EXIT() + ret + EPILOGUE() ++ASM_END() +diff --git a/mpn/x86_64/coreihwl/aors_n.asm b/mpn/x86_64/coreihwl/aors_n.asm +index fc99627..f9d89f7 100644 +--- a/mpn/x86_64/coreihwl/aors_n.asm ++++ b/mpn/x86_64/coreihwl/aors_n.asm +@@ -259,3 +259,4 @@ L(tab): JMPENT( L(0), L(tab)) + JMPENT( L(5), L(tab)) + JMPENT( L(6), L(tab)) + JMPENT( L(7), L(tab)) ++ASM_END() +diff --git a/mpn/x86_64/coreihwl/aorsmul_1.asm b/mpn/x86_64/coreihwl/aorsmul_1.asm +index 3f43afa..d01c941 100644 +--- a/mpn/x86_64/coreihwl/aorsmul_1.asm ++++ b/mpn/x86_64/coreihwl/aorsmul_1.asm +@@ -199,3 +199,4 @@ L(ret): pop %r13 + FUNC_EXIT() + ret + EPILOGUE() ++ASM_END() +diff --git a/mpn/x86_64/coreihwl/gcd_22.asm b/mpn/x86_64/coreihwl/gcd_22.asm +index b5863b6..e41731e 100644 +--- a/mpn/x86_64/coreihwl/gcd_22.asm ++++ b/mpn/x86_64/coreihwl/gcd_22.asm +@@ -136,3 +136,4 @@ L(end): mov v0, %rax + L(ret): FUNC_EXIT() + ret + EPILOGUE() ++ASM_END() +diff --git a/mpn/x86_64/coreihwl/mul_2.asm b/mpn/x86_64/coreihwl/mul_2.asm +index f1f044f..f48e5d8 100644 +--- a/mpn/x86_64/coreihwl/mul_2.asm ++++ b/mpn/x86_64/coreihwl/mul_2.asm +@@ -174,3 +174,4 @@ L(end): mulx( v1, %rdx, %rax) + FUNC_EXIT() + ret + EPILOGUE() ++ASM_END() +diff --git a/mpn/x86_64/coreihwl/mul_basecase.asm b/mpn/x86_64/coreihwl/mul_basecase.asm +index b2656c8..14826e8 100644 +--- a/mpn/x86_64/coreihwl/mul_basecase.asm ++++ b/mpn/x86_64/coreihwl/mul_basecase.asm +@@ -439,3 +439,4 @@ L(ret2):pop %rbp + FUNC_EXIT() + ret + EPILOGUE() ++ASM_END() +diff --git a/mpn/x86_64/coreihwl/mullo_basecase.asm b/mpn/x86_64/coreihwl/mullo_basecase.asm +index e65559b..b29352c 100644 +--- a/mpn/x86_64/coreihwl/mullo_basecase.asm ++++ b/mpn/x86_64/coreihwl/mullo_basecase.asm +@@ -420,3 +420,4 @@ L(n3): mov (vp), %r9 + FUNC_EXIT() + ret + EPILOGUE() ++ASM_END() +diff --git a/mpn/x86_64/coreihwl/redc_1.asm b/mpn/x86_64/coreihwl/redc_1.asm +index b1d6c0a..3b09a73 100644 +--- a/mpn/x86_64/coreihwl/redc_1.asm ++++ b/mpn/x86_64/coreihwl/redc_1.asm +@@ -435,3 +435,4 @@ L(ret): pop %r15 + FUNC_EXIT() + ret + EPILOGUE() ++ASM_END() +diff --git a/mpn/x86_64/coreihwl/sqr_basecase.asm b/mpn/x86_64/coreihwl/sqr_basecase.asm +index 641cdf3..b6ea890 100644 +--- a/mpn/x86_64/coreihwl/sqr_basecase.asm ++++ b/mpn/x86_64/coreihwl/sqr_basecase.asm +@@ -504,3 +504,4 @@ L(dend):adc %rbx, %rdx + FUNC_EXIT() + ret + EPILOGUE() ++ASM_END() +diff --git a/mpn/x86_64/coreinhm/aorrlsh_n.asm b/mpn/x86_64/coreinhm/aorrlsh_n.asm +index eed64e7..3f25eea 100644 +--- a/mpn/x86_64/coreinhm/aorrlsh_n.asm ++++ b/mpn/x86_64/coreinhm/aorrlsh_n.asm +@@ -198,3 +198,4 @@ IFDOS(` mov 64(%rsp), %r9 ') C cy + sbb R32(%rbx), R32(%rbx) C initialise CF save register + jmp L(ent) + EPILOGUE() ++ASM_END() +diff --git a/mpn/x86_64/coreinhm/hamdist.asm b/mpn/x86_64/coreinhm/hamdist.asm +index a5a63e4..a84bcbc 100644 +--- a/mpn/x86_64/coreinhm/hamdist.asm ++++ b/mpn/x86_64/coreinhm/hamdist.asm +@@ -194,3 +194,4 @@ L(tab): JMPENT( L(0), L(tab)) + JMPENT( L(1), L(tab)) + JMPENT( L(2), L(tab)) + JMPENT( L(3), L(tab)) ++ASM_END() +diff --git a/mpn/x86_64/coreinhm/popcount.asm b/mpn/x86_64/coreinhm/popcount.asm +index 0a3c867..24c4ebc 100644 +--- a/mpn/x86_64/coreinhm/popcount.asm ++++ 
b/mpn/x86_64/coreinhm/popcount.asm +@@ -180,3 +180,4 @@ L(tab): JMPENT( L(0), L(tab)) + JMPENT( L(5), L(tab)) + JMPENT( L(6), L(tab)) + JMPENT( L(7), L(tab)) ++ASM_END() +diff --git a/mpn/x86_64/coreisbr/addmul_2.asm b/mpn/x86_64/coreisbr/addmul_2.asm +index 21f0bf4..45c7b15 100644 +--- a/mpn/x86_64/coreisbr/addmul_2.asm ++++ b/mpn/x86_64/coreisbr/addmul_2.asm +@@ -222,3 +222,4 @@ L(end): mul v1 + FUNC_EXIT() + ret + EPILOGUE() ++ASM_END() +diff --git a/mpn/x86_64/coreisbr/aorrlshC_n.asm b/mpn/x86_64/coreisbr/aorrlshC_n.asm +index 23ace41..6af7da8 100644 +--- a/mpn/x86_64/coreisbr/aorrlshC_n.asm ++++ b/mpn/x86_64/coreisbr/aorrlshC_n.asm +@@ -171,3 +171,4 @@ L(end): shr $RSH, %rbp + FUNC_EXIT() + ret + EPILOGUE() ++ASM_END() +diff --git a/mpn/x86_64/coreisbr/aorrlsh_n.asm b/mpn/x86_64/coreisbr/aorrlsh_n.asm +index db8ee68..56ca497 100644 +--- a/mpn/x86_64/coreisbr/aorrlsh_n.asm ++++ b/mpn/x86_64/coreisbr/aorrlsh_n.asm +@@ -213,3 +213,4 @@ IFDOS(` mov 64(%rsp), %r9 ') C cy + sbb R32(%rbx), R32(%rbx) C initialise CF save register + jmp L(ent) + EPILOGUE() ++ASM_END() +diff --git a/mpn/x86_64/coreisbr/aors_n.asm b/mpn/x86_64/coreisbr/aors_n.asm +index 61fee3e..d466248 100644 +--- a/mpn/x86_64/coreisbr/aors_n.asm ++++ b/mpn/x86_64/coreisbr/aors_n.asm +@@ -201,3 +201,4 @@ PROLOGUE(func_nc) + IFDOS(` mov 56(%rsp), %r8 ') + jmp L(ent) + EPILOGUE() ++ASM_END() +diff --git a/mpn/x86_64/coreisbr/cnd_add_n.asm b/mpn/x86_64/coreisbr/cnd_add_n.asm +index 43abcc8..3d72bf8 100644 +--- a/mpn/x86_64/coreisbr/cnd_add_n.asm ++++ b/mpn/x86_64/coreisbr/cnd_add_n.asm +@@ -172,3 +172,4 @@ L(end): neg R32(%rax) + FUNC_EXIT() + ret + EPILOGUE() ++ASM_END() +diff --git a/mpn/x86_64/coreisbr/cnd_sub_n.asm b/mpn/x86_64/coreisbr/cnd_sub_n.asm +index f55492b..3371269 100644 +--- a/mpn/x86_64/coreisbr/cnd_sub_n.asm ++++ b/mpn/x86_64/coreisbr/cnd_sub_n.asm +@@ -198,3 +198,4 @@ L(end): neg R32(%rax) + FUNC_EXIT() + ret + EPILOGUE() ++ASM_END() +diff --git a/mpn/x86_64/coreisbr/mul_1.asm b/mpn/x86_64/coreisbr/mul_1.asm +index a43a117..1f17293 100644 +--- a/mpn/x86_64/coreisbr/mul_1.asm ++++ b/mpn/x86_64/coreisbr/mul_1.asm +@@ -197,3 +197,4 @@ L(00c): add cin, %r10 + mov 8(up,n,8), %rax + jmp L(L0c) + EPILOGUE() ++ASM_END() +diff --git a/mpn/x86_64/coreisbr/mul_2.asm b/mpn/x86_64/coreisbr/mul_2.asm +index 781534d..10f1769 100644 +--- a/mpn/x86_64/coreisbr/mul_2.asm ++++ b/mpn/x86_64/coreisbr/mul_2.asm +@@ -165,3 +165,4 @@ L(end): mul v0 + FUNC_EXIT() + ret + EPILOGUE() ++ASM_END() +diff --git a/mpn/x86_64/coreisbr/mul_basecase.asm b/mpn/x86_64/coreisbr/mul_basecase.asm +index 35fd1cc..d5c7e5b 100644 +--- a/mpn/x86_64/coreisbr/mul_basecase.asm ++++ b/mpn/x86_64/coreisbr/mul_basecase.asm +@@ -405,3 +405,4 @@ L(ret2):pop %rbp + FUNC_EXIT() + ret + EPILOGUE() ++ASM_END() +diff --git a/mpn/x86_64/coreisbr/mullo_basecase.asm b/mpn/x86_64/coreisbr/mullo_basecase.asm +index a41a8ac..acf7776 100644 +--- a/mpn/x86_64/coreisbr/mullo_basecase.asm ++++ b/mpn/x86_64/coreisbr/mullo_basecase.asm +@@ -382,3 +382,4 @@ L(n3): mov (vp_param), %r9 + FUNC_EXIT() + ret + EPILOGUE() ++ASM_END() +diff --git a/mpn/x86_64/coreisbr/rsh1aors_n.asm b/mpn/x86_64/coreisbr/rsh1aors_n.asm +index fd2eaea..eefad99 100644 +--- a/mpn/x86_64/coreisbr/rsh1aors_n.asm ++++ b/mpn/x86_64/coreisbr/rsh1aors_n.asm +@@ -191,3 +191,4 @@ L(end): shrd $1, %rbx, %rbp + FUNC_EXIT() + ret + EPILOGUE() ++ASM_END() +diff --git a/mpn/x86_64/coreisbr/sqr_basecase.asm b/mpn/x86_64/coreisbr/sqr_basecase.asm +index 46a3612..1600e25 100644 +--- a/mpn/x86_64/coreisbr/sqr_basecase.asm 
++++ b/mpn/x86_64/coreisbr/sqr_basecase.asm +@@ -482,3 +482,4 @@ L(dend):add %r8, %r10 + FUNC_EXIT() + ret + EPILOGUE() ++ASM_END() +diff --git a/mpn/x86_64/div_qr_1n_pi1.asm b/mpn/x86_64/div_qr_1n_pi1.asm +index b3d45e2..9fd2633 100644 +--- a/mpn/x86_64/div_qr_1n_pi1.asm ++++ b/mpn/x86_64/div_qr_1n_pi1.asm +@@ -245,3 +245,4 @@ L(q_incr_loop): + lea 8(U1), U1 + jmp L(q_incr_loop) + EPILOGUE() ++ASM_END() +diff --git a/mpn/x86_64/div_qr_2n_pi1.asm b/mpn/x86_64/div_qr_2n_pi1.asm +index 5e59a0a..c189c33 100644 +--- a/mpn/x86_64/div_qr_2n_pi1.asm ++++ b/mpn/x86_64/div_qr_2n_pi1.asm +@@ -156,3 +156,4 @@ L(fix): C Unlikely update. u2 >= d1 + sbb d1, u2 + jmp L(bck) + EPILOGUE() ++ASM_END() +diff --git a/mpn/x86_64/div_qr_2u_pi1.asm b/mpn/x86_64/div_qr_2u_pi1.asm +index 85af96f..f2ac526 100644 +--- a/mpn/x86_64/div_qr_2u_pi1.asm ++++ b/mpn/x86_64/div_qr_2u_pi1.asm +@@ -198,3 +198,4 @@ L(fix_qh): C Unlikely update. u2 >= d1 + sbb d1, u2 + jmp L(bck_qh) + EPILOGUE() ++ASM_END() +diff --git a/mpn/x86_64/dive_1.asm b/mpn/x86_64/dive_1.asm +index 988bdab..1929091 100644 +--- a/mpn/x86_64/dive_1.asm ++++ b/mpn/x86_64/dive_1.asm +@@ -156,3 +156,4 @@ L(one): shr R8(%rcx), %rax + ret + + EPILOGUE() ++ASM_END() +diff --git a/mpn/x86_64/divrem_1.asm b/mpn/x86_64/divrem_1.asm +index d4d61ad..edfd893 100644 +--- a/mpn/x86_64/divrem_1.asm ++++ b/mpn/x86_64/divrem_1.asm +@@ -312,3 +312,4 @@ L(ret): pop %rbx + FUNC_EXIT() + ret + EPILOGUE() ++ASM_END() +diff --git a/mpn/x86_64/divrem_2.asm b/mpn/x86_64/divrem_2.asm +index 20811cc..e10f328 100644 +--- a/mpn/x86_64/divrem_2.asm ++++ b/mpn/x86_64/divrem_2.asm +@@ -190,3 +190,4 @@ L(fix): seta %dl + sbb %r11, %rbx + jmp L(bck) + EPILOGUE() ++ASM_END() +diff --git a/mpn/x86_64/fastavx/copyd.asm b/mpn/x86_64/fastavx/copyd.asm +index 56d472f..a69a624 100644 +--- a/mpn/x86_64/fastavx/copyd.asm ++++ b/mpn/x86_64/fastavx/copyd.asm +@@ -170,3 +170,4 @@ L(bc): test $4, R8(n) + FUNC_EXIT() + ret + EPILOGUE() ++ASM_END() +diff --git a/mpn/x86_64/fastavx/copyi.asm b/mpn/x86_64/fastavx/copyi.asm +index 7607747..f50aa47 100644 +--- a/mpn/x86_64/fastavx/copyi.asm ++++ b/mpn/x86_64/fastavx/copyi.asm +@@ -167,3 +167,4 @@ L(bc): test $4, R8(n) + FUNC_EXIT() + ret + EPILOGUE() ++ASM_END() +diff --git a/mpn/x86_64/fastsse/com-palignr.asm b/mpn/x86_64/fastsse/com-palignr.asm +index 69027bc..50cd40f 100644 +--- a/mpn/x86_64/fastsse/com-palignr.asm ++++ b/mpn/x86_64/fastsse/com-palignr.asm +@@ -309,3 +309,4 @@ L(end): test $1, R8(n) + 1: FUNC_EXIT() + ret + EPILOGUE() ++ASM_END() +diff --git a/mpn/x86_64/fastsse/com.asm b/mpn/x86_64/fastsse/com.asm +index c867222..aec7d25 100644 +--- a/mpn/x86_64/fastsse/com.asm ++++ b/mpn/x86_64/fastsse/com.asm +@@ -173,3 +173,4 @@ IFDOS(` add $56, %rsp ') + FUNC_EXIT() + ret + EPILOGUE() ++ASM_END() +diff --git a/mpn/x86_64/fastsse/copyd-palignr.asm b/mpn/x86_64/fastsse/copyd-palignr.asm +index fac6f8a..fa1e4a4 100644 +--- a/mpn/x86_64/fastsse/copyd-palignr.asm ++++ b/mpn/x86_64/fastsse/copyd-palignr.asm +@@ -252,3 +252,4 @@ L(end): test $1, R8(n) + 1: FUNC_EXIT() + ret + EPILOGUE() ++ASM_END() +diff --git a/mpn/x86_64/fastsse/copyd.asm b/mpn/x86_64/fastsse/copyd.asm +index b3c4706..ce820c5 100644 +--- a/mpn/x86_64/fastsse/copyd.asm ++++ b/mpn/x86_64/fastsse/copyd.asm +@@ -164,3 +164,4 @@ L(sma): test $8, R8(n) + L(don): FUNC_EXIT() + ret + EPILOGUE() ++ASM_END() +diff --git a/mpn/x86_64/fastsse/copyi-palignr.asm b/mpn/x86_64/fastsse/copyi-palignr.asm +index 9876a47..fb4655f 100644 +--- a/mpn/x86_64/fastsse/copyi-palignr.asm ++++ 
b/mpn/x86_64/fastsse/copyi-palignr.asm +@@ -298,3 +298,4 @@ L(end): test $1, R8(n) + 1: FUNC_EXIT() + ret + EPILOGUE() ++ASM_END() +diff --git a/mpn/x86_64/fastsse/copyi.asm b/mpn/x86_64/fastsse/copyi.asm +index 97f7865..826caad 100644 +--- a/mpn/x86_64/fastsse/copyi.asm ++++ b/mpn/x86_64/fastsse/copyi.asm +@@ -183,3 +183,4 @@ dnl jnc 1b + L(ret): FUNC_EXIT() + ret + EPILOGUE() ++ASM_END() +diff --git a/mpn/x86_64/fastsse/lshift-movdqu2.asm b/mpn/x86_64/fastsse/lshift-movdqu2.asm +index a05e850..217f2cd 100644 +--- a/mpn/x86_64/fastsse/lshift-movdqu2.asm ++++ b/mpn/x86_64/fastsse/lshift-movdqu2.asm +@@ -180,3 +180,4 @@ L(end8):movq (ap), %xmm0 + FUNC_EXIT() + ret + EPILOGUE() ++ASM_END() +diff --git a/mpn/x86_64/fastsse/lshift.asm b/mpn/x86_64/fastsse/lshift.asm +index 6a17b93..79a5554 100644 +--- a/mpn/x86_64/fastsse/lshift.asm ++++ b/mpn/x86_64/fastsse/lshift.asm +@@ -171,3 +171,4 @@ L(end8):movq (ap), %xmm0 + FUNC_EXIT() + ret + EPILOGUE() ++ASM_END() +diff --git a/mpn/x86_64/fastsse/lshiftc-movdqu2.asm b/mpn/x86_64/fastsse/lshiftc-movdqu2.asm +index 8250910..9f14435 100644 +--- a/mpn/x86_64/fastsse/lshiftc-movdqu2.asm ++++ b/mpn/x86_64/fastsse/lshiftc-movdqu2.asm +@@ -191,3 +191,4 @@ L(end8):movq (ap), %xmm0 + FUNC_EXIT() + ret + EPILOGUE() ++ASM_END() +diff --git a/mpn/x86_64/fastsse/lshiftc.asm b/mpn/x86_64/fastsse/lshiftc.asm +index a616075..a6630cb 100644 +--- a/mpn/x86_64/fastsse/lshiftc.asm ++++ b/mpn/x86_64/fastsse/lshiftc.asm +@@ -181,3 +181,4 @@ L(end8):movq (ap), %xmm0 + FUNC_EXIT() + ret + EPILOGUE() ++ASM_END() +diff --git a/mpn/x86_64/fastsse/rshift-movdqu2.asm b/mpn/x86_64/fastsse/rshift-movdqu2.asm +index 1e270b1..15bcc02 100644 +--- a/mpn/x86_64/fastsse/rshift-movdqu2.asm ++++ b/mpn/x86_64/fastsse/rshift-movdqu2.asm +@@ -199,3 +199,4 @@ L(bc): dec R32(n) + FUNC_EXIT() + ret + EPILOGUE() ++ASM_END() +diff --git a/mpn/x86_64/fastsse/sec_tabselect.asm b/mpn/x86_64/fastsse/sec_tabselect.asm +index e7b7feb..f3b76eb 100644 +--- a/mpn/x86_64/fastsse/sec_tabselect.asm ++++ b/mpn/x86_64/fastsse/sec_tabselect.asm +@@ -202,3 +202,4 @@ IFDOS(` add $88, %rsp ') + FUNC_EXIT() + ret + EPILOGUE() ++ASM_END() +diff --git a/mpn/x86_64/fat/fat_entry.asm b/mpn/x86_64/fat/fat_entry.asm +index 5f244ac..2322be8 100644 +--- a/mpn/x86_64/fat/fat_entry.asm ++++ b/mpn/x86_64/fat/fat_entry.asm +@@ -207,3 +207,4 @@ PROLOGUE(__gmpn_cpuid) + FUNC_EXIT() + ret + EPILOGUE() ++ASM_END() +diff --git a/mpn/x86_64/gcd_11.asm b/mpn/x86_64/gcd_11.asm +index f9b3bcc..1e5ac68 100644 +--- a/mpn/x86_64/gcd_11.asm ++++ b/mpn/x86_64/gcd_11.asm +@@ -112,3 +112,4 @@ L(shift_alot): + mov u0, %rdx + jmp L(mid) + EPILOGUE() ++ASM_END() +diff --git a/mpn/x86_64/gcd_22.asm b/mpn/x86_64/gcd_22.asm +index 78f985f..c3b0b89 100644 +--- a/mpn/x86_64/gcd_22.asm ++++ b/mpn/x86_64/gcd_22.asm +@@ -161,3 +161,4 @@ L(end): C mov v0, %rax + FUNC_EXIT() + ret + EPILOGUE() ++ASM_END() +diff --git a/mpn/x86_64/k10/gcd_22.asm b/mpn/x86_64/k10/gcd_22.asm +index f58b4cc..c7fe668 100644 +--- a/mpn/x86_64/k10/gcd_22.asm ++++ b/mpn/x86_64/k10/gcd_22.asm +@@ -140,3 +140,4 @@ L(end): C mov v0, %rax + FUNC_EXIT() + ret + EPILOGUE() ++ASM_END() +diff --git a/mpn/x86_64/k10/hamdist.asm b/mpn/x86_64/k10/hamdist.asm +index f70494a..d885e2d 100644 +--- a/mpn/x86_64/k10/hamdist.asm ++++ b/mpn/x86_64/k10/hamdist.asm +@@ -107,3 +107,4 @@ L(top): mov (ap,n,8), %r8 + FUNC_EXIT() + ret + EPILOGUE() ++ASM_END() +diff --git a/mpn/x86_64/k10/popcount.asm b/mpn/x86_64/k10/popcount.asm +index 3814aea..45bcba5 100644 +--- a/mpn/x86_64/k10/popcount.asm ++++ 
b/mpn/x86_64/k10/popcount.asm +@@ -79,7 +79,7 @@ C neg R32(%rcx) + + lea L(top)(%rip), %rdx + lea (%rdx,%rcx,2), %rdx +- jmp *%rdx ++ X86_NOTRACK jmp *%rdx + ',` + lea (up,n,8), up + +@@ -101,7 +101,7 @@ C lea (%rcx,%rcx,4), %rcx C 10x + + lea L(top)(%rip), %rdx + add %rcx, %rdx +- jmp *%rdx ++ X86_NOTRACK jmp *%rdx + ') + + ALIGN(32) +@@ -136,3 +136,4 @@ C 1 = n mod 8 + FUNC_EXIT() + ret + EPILOGUE() ++ASM_END() +diff --git a/mpn/x86_64/k8/addmul_2.asm b/mpn/x86_64/k8/addmul_2.asm +index 78bcba1..38caa4d 100644 +--- a/mpn/x86_64/k8/addmul_2.asm ++++ b/mpn/x86_64/k8/addmul_2.asm +@@ -193,3 +193,4 @@ L(end): xor R32(w1), R32(w1) + FUNC_EXIT() + ret + EPILOGUE() ++ASM_END() +diff --git a/mpn/x86_64/k8/aorrlsh_n.asm b/mpn/x86_64/k8/aorrlsh_n.asm +index ff3a184..3ab7050 100644 +--- a/mpn/x86_64/k8/aorrlsh_n.asm ++++ b/mpn/x86_64/k8/aorrlsh_n.asm +@@ -215,3 +215,4 @@ L(cj1): mov %r9, 8(rp,n,8) + FUNC_EXIT() + ret + EPILOGUE() ++ASM_END() +diff --git a/mpn/x86_64/k8/bdiv_q_1.asm b/mpn/x86_64/k8/bdiv_q_1.asm +index 1172b0d..606d54f 100644 +--- a/mpn/x86_64/k8/bdiv_q_1.asm ++++ b/mpn/x86_64/k8/bdiv_q_1.asm +@@ -177,3 +177,4 @@ L(one): shr R8(%rcx), %rax + FUNC_EXIT() + ret + EPILOGUE() ++ASM_END() +diff --git a/mpn/x86_64/k8/div_qr_1n_pi1.asm b/mpn/x86_64/k8/div_qr_1n_pi1.asm +index 86de08c..e91b809 100644 +--- a/mpn/x86_64/k8/div_qr_1n_pi1.asm ++++ b/mpn/x86_64/k8/div_qr_1n_pi1.asm +@@ -247,3 +247,4 @@ L(q_incr_loop): + lea 8(U1), U1 + jmp L(q_incr_loop) + EPILOGUE() ++ASM_END() +diff --git a/mpn/x86_64/k8/mul_basecase.asm b/mpn/x86_64/k8/mul_basecase.asm +index ca2efb9..9126c2b 100644 +--- a/mpn/x86_64/k8/mul_basecase.asm ++++ b/mpn/x86_64/k8/mul_basecase.asm +@@ -335,8 +335,10 @@ C addmul_2 for remaining vp's + C adjusted value of n that is reloaded on each iteration + + L(addmul_outer_0): ++ X86_ENDBR + add $3, un + lea 0(%rip), outer_addr ++ X86_ENDBR + + mov un, n + mov -24(up,un,8), %rax +@@ -348,6 +350,7 @@ L(addmul_outer_0): + jmp L(addmul_entry_0) + + L(addmul_outer_1): ++ X86_ENDBR + mov un, n + mov (up,un,8), %rax + mul v0 +@@ -358,8 +361,10 @@ L(addmul_outer_1): + jmp L(addmul_entry_1) + + L(addmul_outer_2): ++ X86_ENDBR + add $1, un + lea 0(%rip), outer_addr ++ X86_ENDBR + + mov un, n + mov -8(up,un,8), %rax +@@ -372,8 +377,10 @@ L(addmul_outer_2): + jmp L(addmul_entry_2) + + L(addmul_outer_3): ++ X86_ENDBR + add $2, un + lea 0(%rip), outer_addr ++ X86_ENDBR + + mov un, n + mov -16(up,un,8), %rax +@@ -467,3 +474,4 @@ L(ret): pop %r15 + ret + + EPILOGUE() ++ASM_END() +diff --git a/mpn/x86_64/k8/mullo_basecase.asm b/mpn/x86_64/k8/mullo_basecase.asm +index fa00f42..4a931a5 100644 +--- a/mpn/x86_64/k8/mullo_basecase.asm ++++ b/mpn/x86_64/k8/mullo_basecase.asm +@@ -99,12 +99,14 @@ dnl JMPENT( L(2m4), L(tab)) C 10 + dnl JMPENT( L(3m4), L(tab)) C 11 + TEXT + +-L(1): imul %r8, %rax ++L(1): X86_ENDBR ++ imul %r8, %rax + mov %rax, (rp) + FUNC_EXIT() + ret + +-L(2): mov 8(vp_param), %r11 ++L(2): X86_ENDBR ++ mov 8(vp_param), %r11 + imul %rax, %r11 C u0 x v1 + mul %r8 C u0 x v0 + mov %rax, (rp) +@@ -115,7 +117,8 @@ L(2): mov 8(vp_param), %r11 + FUNC_EXIT() + ret + +-L(3): mov 8(vp_param), %r9 C v1 ++L(3): X86_ENDBR ++ mov 8(vp_param), %r9 C v1 + mov 16(vp_param), %r11 + mul %r8 C u0 x v0 -> + mov %rax, (rp) C r0 +@@ -335,6 +338,7 @@ L(mul_2_entry_1): + + + L(addmul_outer_1): ++ X86_ENDBR + lea -2(n), j + mov -16(up,n,8), %rax + mul v0 +@@ -346,6 +350,7 @@ L(addmul_outer_1): + jmp L(addmul_entry_1) + + L(addmul_outer_3): ++ X86_ENDBR + lea 0(n), j + mov -16(up,n,8), %rax + xor R32(w3), R32(w3) 
+@@ -434,3 +439,4 @@ L(ret): pop %r15 + FUNC_EXIT() + ret + EPILOGUE() ++ASM_END() +diff --git a/mpn/x86_64/k8/mulmid_basecase.asm b/mpn/x86_64/k8/mulmid_basecase.asm +index 86f1414..7d5f158 100644 +--- a/mpn/x86_64/k8/mulmid_basecase.asm ++++ b/mpn/x86_64/k8/mulmid_basecase.asm +@@ -329,6 +329,7 @@ C addmul_2 for remaining vp's + + ALIGN(16) + L(addmul_prologue_0): ++ X86_ENDBR + mov -8(up,n,8), %rax + mul v1 + mov %rax, w1 +@@ -338,6 +339,7 @@ L(addmul_prologue_0): + + ALIGN(16) + L(addmul_prologue_1): ++ X86_ENDBR + mov 16(up,n,8), %rax + mul v1 + mov %rax, w0 +@@ -348,6 +350,7 @@ L(addmul_prologue_1): + + ALIGN(16) + L(addmul_prologue_2): ++ X86_ENDBR + mov 8(up,n,8), %rax + mul v1 + mov %rax, w3 +@@ -357,6 +360,7 @@ L(addmul_prologue_2): + + ALIGN(16) + L(addmul_prologue_3): ++ X86_ENDBR + mov (up,n,8), %rax + mul v1 + mov %rax, w2 +@@ -471,6 +475,7 @@ L(diag_prologue_0): + mov vp, vp_inner + mov vn, n + lea 0(%rip), outer_addr ++ X86_ENDBR + mov -8(up,n,8), %rax + jmp L(diag_entry_0) + +@@ -480,6 +485,7 @@ L(diag_prologue_1): + add $3, vn + mov vn, n + lea 0(%rip), outer_addr ++ X86_ENDBR + mov -8(vp_inner), %rax + jmp L(diag_entry_1) + +@@ -489,6 +495,7 @@ L(diag_prologue_2): + add $2, vn + mov vn, n + lea 0(%rip), outer_addr ++ X86_ENDBR + mov 16(vp_inner), %rax + jmp L(diag_entry_2) + +@@ -507,6 +514,7 @@ L(diag_entry_0): + adc %rdx, w1 + adc $0, w2 + L(diag_entry_3): ++ X86_ENDBR + mov -16(up,n,8), %rax + mulq 8(vp_inner) + add %rax, w0 +@@ -557,3 +565,4 @@ L(ret): pop %r15 + FUNC_EXIT() + ret + EPILOGUE() ++ASM_END() +diff --git a/mpn/x86_64/k8/redc_1.asm b/mpn/x86_64/k8/redc_1.asm +index 9327b21..3e241af 100644 +--- a/mpn/x86_64/k8/redc_1.asm ++++ b/mpn/x86_64/k8/redc_1.asm +@@ -125,7 +125,8 @@ L(tab): JMPENT( L(0), L(tab)) + TEXT + + ALIGN(16) +-L(1): mov (mp_param), %rax ++L(1): X86_ENDBR ++ mov (mp_param), %rax + mul q0 + add 8(up), %rax + adc 16(up), %rdx +@@ -136,7 +137,8 @@ L(1): mov (mp_param), %rax + + + ALIGN(16) +-L(2): mov (mp_param), %rax ++L(2): X86_ENDBR ++ mov (mp_param), %rax + mul q0 + xor R32(%r14), R32(%r14) + mov %rax, %r10 +@@ -171,7 +173,8 @@ L(2): mov (mp_param), %rax + jmp L(ret) + + +-L(3): mov (mp_param), %rax ++L(3): X86_ENDBR ++ mov (mp_param), %rax + mul q0 + mov %rax, %rbx + mov %rdx, %r10 +@@ -248,7 +251,7 @@ L(3): mov (mp_param), %rax + + + ALIGN(16) +-L(2m4): ++L(2m4): X86_ENDBR + L(lo2): mov (mp,nneg,8), %rax + mul q0 + xor R32(%r14), R32(%r14) +@@ -324,7 +327,7 @@ L(le2): add %r10, (up) + + + ALIGN(16) +-L(1m4): ++L(1m4): X86_ENDBR + L(lo1): mov (mp,nneg,8), %rax + xor %r9, %r9 + xor R32(%rbx), R32(%rbx) +@@ -398,7 +401,7 @@ L(le1): add %r10, (up) + + ALIGN(16) + L(0): +-L(0m4): ++L(0m4): X86_ENDBR + L(lo0): mov (mp,nneg,8), %rax + mov nneg, i + mul q0 +@@ -463,7 +466,7 @@ L(le0): add %r10, (up) + + + ALIGN(16) +-L(3m4): ++L(3m4): X86_ENDBR + L(lo3): mov (mp,nneg,8), %rax + mul q0 + mov %rax, %rbx +@@ -589,3 +592,4 @@ L(ret): pop %r15 + FUNC_EXIT() + ret + EPILOGUE() ++ASM_END() +diff --git a/mpn/x86_64/k8/sqr_basecase.asm b/mpn/x86_64/k8/sqr_basecase.asm +index 60cf945..37858b4 100644 +--- a/mpn/x86_64/k8/sqr_basecase.asm ++++ b/mpn/x86_64/k8/sqr_basecase.asm +@@ -131,7 +131,8 @@ L(tab): JMPENT( L(4), L(tab)) + JMPENT( L(3m4), L(tab)) + TEXT + +-L(1): mov (up), %rax ++L(1): X86_ENDBR ++ mov (up), %rax + mul %rax + add $40, %rsp + mov %rax, (rp) +@@ -139,7 +140,8 @@ L(1): mov (up), %rax + FUNC_EXIT() + ret + +-L(2): mov (up), %rax ++L(2): X86_ENDBR ++ mov (up), %rax + mov %rax, %r8 + mul %rax + mov 8(up), %r11 +@@ -165,7 +167,8 @@ L(2): mov 
(up), %rax + FUNC_EXIT() + ret + +-L(3): mov (up), %rax ++L(3): X86_ENDBR ++ mov (up), %rax + mov %rax, %r10 + mul %rax + mov 8(up), %r11 +@@ -210,7 +213,8 @@ L(3): mov (up), %rax + FUNC_EXIT() + ret + +-L(4): mov (up), %rax ++L(4): X86_ENDBR ++ mov (up), %rax + mov %rax, %r11 + mul %rax + mov 8(up), %rbx +@@ -282,6 +286,7 @@ L(4): mov (up), %rax + + + L(0m4): ++ X86_ENDBR + lea -16(rp,n,8), tp C point tp in middle of result operand + mov (up), v0 + mov 8(up), %rax +@@ -340,6 +345,7 @@ L(L3): xor R32(w1), R32(w1) + + + L(1m4): ++ X86_ENDBR + lea 8(rp,n,8), tp C point tp in middle of result operand + mov (up), v0 C u0 + mov 8(up), %rax C u1 +@@ -418,6 +424,7 @@ L(m2x): mov (up,j,8), %rax + + + L(2m4): ++ X86_ENDBR + lea -16(rp,n,8), tp C point tp in middle of result operand + mov (up), v0 + mov 8(up), %rax +@@ -474,7 +481,7 @@ L(L1): xor R32(w0), R32(w0) + jmp L(dowhile_mid) + + +-L(3m4): ++L(3m4): X86_ENDBR + lea 8(rp,n,8), tp C point tp in middle of result operand + mov (up), v0 C u0 + mov 8(up), %rax C u1 +@@ -805,3 +812,4 @@ L(d1): mov %r11, 24(rp,j,8) + FUNC_EXIT() + ret + EPILOGUE() ++ASM_END() +diff --git a/mpn/x86_64/logops_n.asm b/mpn/x86_64/logops_n.asm +index e25854d..b3969ba 100644 +--- a/mpn/x86_64/logops_n.asm ++++ b/mpn/x86_64/logops_n.asm +@@ -258,3 +258,4 @@ L(ret): FUNC_EXIT() + ret + EPILOGUE() + ') ++ASM_END() +diff --git a/mpn/x86_64/lshift.asm b/mpn/x86_64/lshift.asm +index fff3152..4187bdc 100644 +--- a/mpn/x86_64/lshift.asm ++++ b/mpn/x86_64/lshift.asm +@@ -170,3 +170,4 @@ L(ast): mov (up), %r10 + FUNC_EXIT() + ret + EPILOGUE() ++ASM_END() +diff --git a/mpn/x86_64/lshiftc.asm b/mpn/x86_64/lshiftc.asm +index c4ba04a..f6fe4c9 100644 +--- a/mpn/x86_64/lshiftc.asm ++++ b/mpn/x86_64/lshiftc.asm +@@ -180,3 +180,4 @@ L(ast): mov (up), %r10 + FUNC_EXIT() + ret + EPILOGUE() ++ASM_END() +diff --git a/mpn/x86_64/lshsub_n.asm b/mpn/x86_64/lshsub_n.asm +index 4d428c0..62877d7 100644 +--- a/mpn/x86_64/lshsub_n.asm ++++ b/mpn/x86_64/lshsub_n.asm +@@ -170,3 +170,4 @@ L(end): + FUNC_EXIT() + ret + EPILOGUE() ++ASM_END() +diff --git a/mpn/x86_64/missing.asm b/mpn/x86_64/missing.asm +index 9b65c89..22dac17 100644 +--- a/mpn/x86_64/missing.asm ++++ b/mpn/x86_64/missing.asm +@@ -128,3 +128,4 @@ PROLOGUE(__gmp_adcx) + ret + EPILOGUE() + PROTECT(__gmp_adcx) ++ASM_END() +diff --git a/mpn/x86_64/mod_1_2.asm b/mpn/x86_64/mod_1_2.asm +index 40fcaeb..fbaae3b 100644 +--- a/mpn/x86_64/mod_1_2.asm ++++ b/mpn/x86_64/mod_1_2.asm +@@ -239,3 +239,4 @@ ifdef(`SHLD_SLOW',` + FUNC_EXIT() + ret + EPILOGUE() ++ASM_END() +diff --git a/mpn/x86_64/mod_1_4.asm b/mpn/x86_64/mod_1_4.asm +index 6cf304c..8969e42 100644 +--- a/mpn/x86_64/mod_1_4.asm ++++ b/mpn/x86_64/mod_1_4.asm +@@ -270,3 +270,4 @@ ifdef(`SHLD_SLOW',` + FUNC_EXIT() + ret + EPILOGUE() ++ASM_END() +diff --git a/mpn/x86_64/mod_34lsub1.asm b/mpn/x86_64/mod_34lsub1.asm +index 75421a6..70282b6 100644 +--- a/mpn/x86_64/mod_34lsub1.asm ++++ b/mpn/x86_64/mod_34lsub1.asm +@@ -145,46 +145,55 @@ L(tab): JMPENT( L(0), L(tab)) + JMPENT( L(8), L(tab)) + TEXT + +-L(6): add (ap), %rax ++L(6): X86_ENDBR ++ add (ap), %rax + adc 8(ap), %rcx + adc 16(ap), %rdx + adc $0, %r9 + add $24, ap +-L(3): add (ap), %rax ++L(3): X86_ENDBR ++ add (ap), %rax + adc 8(ap), %rcx + adc 16(ap), %rdx + jmp L(cj1) + +-L(7): add (ap), %rax ++L(7): X86_ENDBR ++ add (ap), %rax + adc 8(ap), %rcx + adc 16(ap), %rdx + adc $0, %r9 + add $24, ap +-L(4): add (ap), %rax ++L(4): X86_ENDBR ++ add (ap), %rax + adc 8(ap), %rcx + adc 16(ap), %rdx + adc $0, %r9 + add $24, ap +-L(1): add (ap), %rax 
++L(1): X86_ENDBR ++ add (ap), %rax + adc $0, %rcx + jmp L(cj2) + +-L(8): add (ap), %rax ++L(8): X86_ENDBR ++ add (ap), %rax + adc 8(ap), %rcx + adc 16(ap), %rdx + adc $0, %r9 + add $24, ap +-L(5): add (ap), %rax ++L(5): X86_ENDBR ++ add (ap), %rax + adc 8(ap), %rcx + adc 16(ap), %rdx + adc $0, %r9 + add $24, ap +-L(2): add (ap), %rax ++L(2): X86_ENDBR ++ add (ap), %rax + adc 8(ap), %rcx + + L(cj2): adc $0, %rdx + L(cj1): adc $0, %r9 +-L(0): add %r9, %rax ++L(0): X86_ENDBR ++ add %r9, %rax + adc $0, %rcx + adc $0, %rdx + adc $0, %rax +@@ -213,3 +222,4 @@ L(0): add %r9, %rax + FUNC_EXIT() + ret + EPILOGUE() ++ASM_END() +diff --git a/mpn/x86_64/mode1o.asm b/mpn/x86_64/mode1o.asm +index 2cd2b08..3377435 100644 +--- a/mpn/x86_64/mode1o.asm ++++ b/mpn/x86_64/mode1o.asm +@@ -169,3 +169,4 @@ L(one): + + EPILOGUE(mpn_modexact_1c_odd) + EPILOGUE(mpn_modexact_1_odd) ++ASM_END() +diff --git a/mpn/x86_64/mul_1.asm b/mpn/x86_64/mul_1.asm +index e1ba89b..44764dd 100644 +--- a/mpn/x86_64/mul_1.asm ++++ b/mpn/x86_64/mul_1.asm +@@ -190,3 +190,4 @@ IFDOS(``pop %rdi '') + IFDOS(``pop %rsi '') + ret + EPILOGUE() ++ASM_END() +diff --git a/mpn/x86_64/mul_2.asm b/mpn/x86_64/mul_2.asm +index d64313b..b6c6bf1 100644 +--- a/mpn/x86_64/mul_2.asm ++++ b/mpn/x86_64/mul_2.asm +@@ -202,3 +202,4 @@ L(m22): mul v1 + FUNC_EXIT() + ret + EPILOGUE() ++ASM_END() +diff --git a/mpn/x86_64/nano/dive_1.asm b/mpn/x86_64/nano/dive_1.asm +index e9a0763..aead4d5 100644 +--- a/mpn/x86_64/nano/dive_1.asm ++++ b/mpn/x86_64/nano/dive_1.asm +@@ -164,3 +164,4 @@ L(one): shr R8(%rcx), %rax + FUNC_EXIT() + ret + EPILOGUE() ++ASM_END() +diff --git a/mpn/x86_64/pentium4/aors_n.asm b/mpn/x86_64/pentium4/aors_n.asm +index 8e6ee1b..3751e38 100644 +--- a/mpn/x86_64/pentium4/aors_n.asm ++++ b/mpn/x86_64/pentium4/aors_n.asm +@@ -194,3 +194,4 @@ L(ret): mov R32(%rbx), R32(%rax) + FUNC_EXIT() + ret + EPILOGUE() ++ASM_END() +diff --git a/mpn/x86_64/pentium4/mod_34lsub1.asm b/mpn/x86_64/pentium4/mod_34lsub1.asm +index f34b3f0..bf83f62 100644 +--- a/mpn/x86_64/pentium4/mod_34lsub1.asm ++++ b/mpn/x86_64/pentium4/mod_34lsub1.asm +@@ -165,3 +165,4 @@ L(combine): + FUNC_EXIT() + ret + EPILOGUE() ++ASM_END() +diff --git a/mpn/x86_64/pentium4/rsh1aors_n.asm b/mpn/x86_64/pentium4/rsh1aors_n.asm +index 5528ce4..219a809 100644 +--- a/mpn/x86_64/pentium4/rsh1aors_n.asm ++++ b/mpn/x86_64/pentium4/rsh1aors_n.asm +@@ -332,3 +332,4 @@ L(cj1): or %r14, %rbx + L(c3): mov $1, R8(%rax) + jmp L(rc3) + EPILOGUE() ++ASM_END() +diff --git a/mpn/x86_64/pentium4/rshift.asm b/mpn/x86_64/pentium4/rshift.asm +index b7c1ee2..848045f 100644 +--- a/mpn/x86_64/pentium4/rshift.asm ++++ b/mpn/x86_64/pentium4/rshift.asm +@@ -167,3 +167,4 @@ L(ast): movq (up), %mm2 + FUNC_EXIT() + ret + EPILOGUE() ++ASM_END() +diff --git a/mpn/x86_64/popham.asm b/mpn/x86_64/popham.asm +index 3a29b2e..b7ceb17 100644 +--- a/mpn/x86_64/popham.asm ++++ b/mpn/x86_64/popham.asm +@@ -161,3 +161,4 @@ L(end): + FUNC_EXIT() + ret + EPILOGUE() ++ASM_END() +diff --git a/mpn/x86_64/rsh1aors_n.asm b/mpn/x86_64/rsh1aors_n.asm +index a3e9cc5..797e250 100644 +--- a/mpn/x86_64/rsh1aors_n.asm ++++ b/mpn/x86_64/rsh1aors_n.asm +@@ -187,3 +187,4 @@ L(end): mov %rbx, (rp) + FUNC_EXIT() + ret + EPILOGUE() ++ASM_END() +diff --git a/mpn/x86_64/rshift.asm b/mpn/x86_64/rshift.asm +index 3f344f1..0fc5877 100644 +--- a/mpn/x86_64/rshift.asm ++++ b/mpn/x86_64/rshift.asm +@@ -174,3 +174,4 @@ L(ast): mov (up), %r10 + FUNC_EXIT() + ret + EPILOGUE() ++ASM_END() +diff --git a/mpn/x86_64/sec_tabselect.asm b/mpn/x86_64/sec_tabselect.asm 
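The additions in this patch follow one pattern: every label reachable through a computed jump (the JMPENT tables and `jmp *(...)` dispatches) gains an X86_ENDBR, and each file ends with ASM_END(), which presumably emits the GNU property note advertising CET support. As a hedged C-level illustration only (not part of the patch), this mirrors what a compiler does on its own for code built with -fcf-protection=branch: indirect calls or jumps may only land on an endbr64 marker, so every indirect-branch target must start with one.

/* Illustrative sketch, assuming gcc/clang with -fcf-protection=branch on
   x86-64: the compiler places endbr64 at the entry of op_add and op_mul
   because they are reached through a function-pointer table, analogous to
   the hand-written X86_ENDBR at the L(...) jump-table targets above. */
#include <stdio.h>

static void op_add (int a, int b) { printf ("%d\n", a + b); }
static void op_mul (int a, int b) { printf ("%d\n", a * b); }

int
main (void)
{
  /* table of indirect-branch targets, analogous to L(tab)/JMPENT */
  void (*tab[2]) (int, int) = { op_add, op_mul };
  tab[0] (2, 3);   /* prints 5 */
  tab[1] (2, 3);   /* prints 6 */
  return 0;
}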
+index e8aed26..5dce3c1 100644 +--- a/mpn/x86_64/sec_tabselect.asm ++++ b/mpn/x86_64/sec_tabselect.asm +@@ -174,3 +174,4 @@ L(b00): pop %r15 + FUNC_EXIT() + ret + EPILOGUE() ++ASM_END()
+diff --git a/mpn/x86_64/sqr_diag_addlsh1.asm b/mpn/x86_64/sqr_diag_addlsh1.asm +index f486125..a1d8767 100644 +--- a/mpn/x86_64/sqr_diag_addlsh1.asm ++++ b/mpn/x86_64/sqr_diag_addlsh1.asm +@@ -114,3 +114,4 @@ L(end): add %r10, %r8 + FUNC_EXIT() + ret + EPILOGUE() ++ASM_END()
+diff --git a/mpn/x86_64/sublsh1_n.asm b/mpn/x86_64/sublsh1_n.asm +index c6d829f..c18f32a 100644 +--- a/mpn/x86_64/sublsh1_n.asm ++++ b/mpn/x86_64/sublsh1_n.asm +@@ -158,3 +158,4 @@ L(end): add R32(%rbp), R32(%rax) + FUNC_EXIT() + ret + EPILOGUE() ++ASM_END()
+diff --git a/mpn/x86_64/x86_64-defs.m4 b/mpn/x86_64/x86_64-defs.m4 +index 64e3729..2a1c1b0 100644 +--- a/mpn/x86_64/x86_64-defs.m4 ++++ b/mpn/x86_64/x86_64-defs.m4 +@@ -94,9 +94,9 @@ m4_assert_numargs(1) + ` GLOBL $1 + TYPE($1,`function') + $1: ++ X86_ENDBR + ') + +- + dnl Usage: ASSERT([cond][,instructions]) + dnl + dnl If WANT_ASSERT is 1, output the given instructions and expect the given +@@ -149,6 +149,10 @@ ifdef(`PIC', + `lea $1(%rip), $2') + ') + ++dnl ASM_END ++ ++define(`ASM_END', `X86_GNU_PROPERTY') ++ + + define(`DEF_OBJECT', + m4_assert_numargs_range(2,3)
+diff --git a/mpn/x86_64/zen/aorrlsh_n.asm b/mpn/x86_64/zen/aorrlsh_n.asm +index e049b2f..6e6783f 100644 +--- a/mpn/x86_64/zen/aorrlsh_n.asm ++++ b/mpn/x86_64/zen/aorrlsh_n.asm +@@ -102,26 +102,30 @@ ifdef(`PIC',` + jmp *(%r11,%rax,8) + ') + +-L(0): lea 32(up), up ++L(0): X86_ENDBR ++ lea 32(up), up + lea 32(vp), vp + lea 32(rp), rp + xor R32(%r11), R32(%r11) + jmp L(e0) + +-L(7): mov %r10, %r11 ++L(7): X86_ENDBR ++ mov %r10, %r11 + lea 24(up), up + lea 24(vp), vp + lea 24(rp), rp + xor R32(%r10), R32(%r10) + jmp L(e7) + +-L(6): lea 16(up), up ++L(6): X86_ENDBR ++ lea 16(up), up + lea 16(vp), vp + lea 16(rp), rp + xor R32(%r11), R32(%r11) + jmp L(e6) + +-L(5): mov %r10, %r11 ++L(5): X86_ENDBR ++ mov %r10, %r11 + lea 8(up), up + lea 8(vp), vp + lea 8(rp), rp +@@ -191,23 +195,27 @@ L(e1): shlx( cnt, %r11, %rax) + lea (%r10,%rax), %rax + jmp L(top) + +-L(4): xor R32(%r11), R32(%r11) ++L(4): X86_ENDBR ++ xor R32(%r11), R32(%r11) + jmp L(e4) + +-L(3): mov %r10, %r11 ++L(3): X86_ENDBR ++ mov %r10, %r11 + lea -8(up), up + lea -8(vp), vp + lea -8(rp), rp + xor R32(%r10), R32(%r10) + jmp L(e3) + +-L(2): lea -16(up), up ++L(2): X86_ENDBR ++ lea -16(up), up + lea -16(vp), vp + lea -16(rp), rp + xor R32(%r11), R32(%r11) + jmp L(e2) + +-L(1): mov %r10, %r11 ++L(1): X86_ENDBR ++ mov %r10, %r11 + lea -24(up), up + lea 40(vp), vp + lea 40(rp), rp +@@ -224,3 +232,4 @@ L(tab): JMPENT( L(0), L(tab)) + JMPENT( L(5), L(tab)) + JMPENT( L(6), L(tab)) + JMPENT( L(7), L(tab)) ++ASM_END()
+diff --git a/mpn/x86_64/zen/mul_basecase.asm b/mpn/x86_64/zen/mul_basecase.asm +index affa3b6..c70d548 100644 +--- a/mpn/x86_64/zen/mul_basecase.asm ++++ b/mpn/x86_64/zen/mul_basecase.asm +@@ -453,3 +453,4 @@ L(wd3): adc %r11, 8(rp) + jne L(3) + jmp L(end) + EPILOGUE() ++ASM_END()
+diff --git a/mpn/x86_64/zen/mullo_basecase.asm b/mpn/x86_64/zen/mullo_basecase.asm +index 2ae729a..c081698 100644 +--- a/mpn/x86_64/zen/mullo_basecase.asm ++++ b/mpn/x86_64/zen/mullo_basecase.asm +@@ -297,3 +297,4 @@ L(lo0): .byte 0xc4,0xe2,0xe3,0xf6,0x44,0xce,0x18 C mulx 24(up,n,8), %rbx, %rax + inc %r14 + jmp L(outer) + EPILOGUE() ++ASM_END()
+diff --git a/mpn/x86_64/zen/sbpi1_bdiv_r.asm b/mpn/x86_64/zen/sbpi1_bdiv_r.asm +index f6e8f9c..277b3c3 100644 +---
a/mpn/x86_64/zen/sbpi1_bdiv_r.asm ++++ b/mpn/x86_64/zen/sbpi1_bdiv_r.asm +@@ -505,3 +505,4 @@ L(ret): mov %rbp, %rax + pop %r15 + ret + EPILOGUE() ++ASM_END() +diff --git a/mpn/x86_64/zen/sqr_basecase.asm b/mpn/x86_64/zen/sqr_basecase.asm +index a7c6127..d185deb 100644 +--- a/mpn/x86_64/zen/sqr_basecase.asm ++++ b/mpn/x86_64/zen/sqr_basecase.asm +@@ -480,3 +480,4 @@ C pop %r14 + FUNC_EXIT() + ret + EPILOGUE() ++ASM_END() +-- +2.32.0 + diff --git a/SOURCES/gmp-mparam.h b/SOURCES/gmp-mparam.h new file mode 100644 index 0000000..1d4e087 --- /dev/null +++ b/SOURCES/gmp-mparam.h @@ -0,0 +1,88 @@ +/* Generic x86 gmp-mparam.h -- Compiler/machine parameter header file. + +Copyright 1991, 1993, 1994, 1995, 1996, 1997, 1999, 2000, 2001, 2002, 2003, +2004, 2005, 2006, 2007, 2008, 2009 Free Software Foundation, Inc. + +This file is part of the GNU MP Library. + +The GNU MP Library is free software; you can redistribute it and/or modify +it under the terms of the GNU Lesser General Public License as published by +the Free Software Foundation; either version 3 of the License, or (at your +option) any later version. + +The GNU MP Library is distributed in the hope that it will be useful, but +WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY +or FITNESS FOR A PARTICULAR PURPOSE. See the GNU Lesser General Public +License for more details. + +You should have received a copy of the GNU Lesser General Public License +along with the GNU MP Library. If not, see http://www.gnu.org/licenses/. */ + +/* + * This gmp-mparam.h is a wrapper include file for the original gmp-mparam.h, + * which has been renamed to gmp-mparam-.h. There are conflicts for the + * original gmp-mparam.h on multilib systems, which result from arch-specific + * configuration options. Please do not use the arch-specific file directly. + * + * Copyright (C) 2006 Red Hat, Inc. + * Thomas Woerner + */ + +#ifdef gmp_mparam_wrapper_h +#error "gmp_mparam_wrapper_h should not be defined!" +#endif +#define gmp_mparam_wrapper_h + +#if defined(__arm__) +#include "gmp-mparam-arm.h" +#elif defined(__i386__) +#include "gmp-mparam-i386.h" +#elif defined(__ia64__) +#include "gmp-mparam-ia64.h" +#elif defined(__powerpc64__) +# if __BYTE_ORDER__ == __ORDER_BIG_ENDIAN__ +#include "gmp-mparam-ppc64.h" +# else +#include "gmp-mparam-ppc64le.h" +# endif +#elif defined(__powerpc__) +# if __BYTE_ORDER__ == __ORDER_BIG_ENDIAN__ +#include "gmp-mparam-ppc.h" +# else +#include "gmp-mparam-ppcle.h" +# endif +#elif defined(__s390x__) +#include "gmp-mparam-s390x.h" +#elif defined(__s390__) +#include "gmp-mparam-s390.h" +#elif defined(__x86_64__) +#include "gmp-mparam-x86_64.h" +#elif defined(__alpha__) +#include "gmp-mparam-alpha.h" +#elif defined(__sh__) +#include "gmp-mparam-sh.h" +#elif defined(__sparc__) && defined (__arch64__) +#include "gmp-mparam-sparc64.h" +#elif defined(__sparc__) +#include "gmp-mparam-sparc.h" +#elif defined(__aarch64__) +#include "gmp-mparam-aarch64.h" +#elif defined(__mips64) && defined(__MIPSEL__) +#include "gmp-mparam-mips64el.h" +#elif defined(__mips64) +#include "gmp-mparam-mips64.h" +#elif defined(__mips) && defined(__MIPSEL__) +#include "gmp-mparam-mipsel.h" +#elif defined(__mips) +#include "gmp-mparam-mips.h" +#elif defined(__riscv) +#if __riscv_xlen == 64 +#include "gmp-mparam-riscv64.h" +#else +#error "No support for riscv32" +#endif +#else +#error "The gmp-devel package is not usable with the architecture." 
+#endif + +#undef gmp_mparam_wrapper_h diff --git a/SOURCES/gmp.h b/SOURCES/gmp.h new file mode 100644 index 0000000..0a91606 --- /dev/null +++ b/SOURCES/gmp.h @@ -0,0 +1,88 @@ +/* Definitions for GNU multiple precision functions. -*- mode: c -*- + +Copyright 1991, 1993, 1994, 1995, 1996, 1997, 1999, 2000, 2001, 2002, 2003, +2004, 2005, 2006, 2007, 2008, 2009 Free Software Foundation, Inc. + +This file is part of the GNU MP Library. + +The GNU MP Library is free software; you can redistribute it and/or modify +it under the terms of the GNU Lesser General Public License as published by +the Free Software Foundation; either version 3 of the License, or (at your +option) any later version. + +The GNU MP Library is distributed in the hope that it will be useful, but +WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY +or FITNESS FOR A PARTICULAR PURPOSE. See the GNU Lesser General Public +License for more details. + +You should have received a copy of the GNU Lesser General Public License +along with the GNU MP Library. If not, see http://www.gnu.org/licenses/. */ + +/* + * This gmp.h is a wrapper include file for the original gmp.h, which has been + * renamed to gmp-.h. There are conflicts for the original gmp.h on + * multilib systems, which result from arch-specific configuration options. + * Please do not use the arch-specific file directly. + * + * Copyright (C) 2006 Red Hat, Inc. + * Thomas Woerner + */ + +#ifdef gmp_wrapper_h +#error "gmp_wrapper_h should not be defined!" +#endif +#define gmp_wrapper_h + +#if defined(__arm__) +#include "gmp-arm.h" +#elif defined(__i386__) +#include "gmp-i386.h" +#elif defined(__ia64__) +#include "gmp-ia64.h" +#elif defined(__powerpc64__) +# if __BYTE_ORDER__ == __ORDER_BIG_ENDIAN__ +#include "gmp-ppc64.h" +# else +#include "gmp-ppc64le.h" +# endif +#elif defined(__powerpc__) +# if __BYTE_ORDER__ == __ORDER_BIG_ENDIAN__ +#include "gmp-ppc.h" +# else +#include "gmp-ppcle.h" +# endif +#elif defined(__s390x__) +#include "gmp-s390x.h" +#elif defined(__s390__) +#include "gmp-s390.h" +#elif defined(__x86_64__) +#include "gmp-x86_64.h" +#elif defined(__alpha__) +#include "gmp-alpha.h" +#elif defined(__sh__) +#include "gmp-sh.h" +#elif defined(__sparc__) && defined (__arch64__) +#include "gmp-sparc64.h" +#elif defined(__sparc__) +#include "gmp-sparc.h" +#elif defined(__aarch64__) +#include "gmp-aarch64.h" +#elif defined(__mips64) && defined(__MIPSEL__) +#include "gmp-mips64el.h" +#elif defined(__mips64) +#include "gmp-mips64.h" +#elif defined(__mips) && defined(__MIPSEL__) +#include "gmp-mipsel.h" +#elif defined(__mips) +#include "gmp-mips.h" +#elif defined(__riscv) +#if __riscv_xlen == 64 +#include "gmp-riscv64.h" +#else +#error "No support for riscv32" +#endif +#else +#error "The gmp-devel package is not usable with the architecture." 
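The gmp.h wrapper uses the same multilib dispatch as the gmp-mparam.h wrapper: the arch-specific headers are installed under per-architecture names and the wrapper selects one from compiler-predefined macros. As a hedged usage sketch (not part of the packaged headers), a client program only ever includes the wrapper, so a single gmp-devel layout can serve, for example, both `gcc -m32` and `gcc -m64` builds on x86:

/* Minimal client of the wrapper header; the #if chain above resolves it to
   gmp-i386.h or gmp-x86_64.h (etc.) at compile time. */
#include <stdio.h>
#include <gmp.h>

int
main (void)
{
  mpz_t x;
  mpz_init_set_ui (x, 1);
  mpz_mul_2exp (x, x, 100);      /* x = 2^100 */
  gmp_printf ("%Zd\n", x);
  mpz_clear (x);
  return 0;
}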
+#endif + +#undef gmp_wrapper_h diff --git a/SOURCES/ibm_z13_simd_part1.patch b/SOURCES/ibm_z13_simd_part1.patch new file mode 100644 index 0000000..73f6c83 --- /dev/null +++ b/SOURCES/ibm_z13_simd_part1.patch @@ -0,0 +1,595 @@ +Co-authored-by: Stefan Liebler +--- + mpn/s390_64/z13/addmul_1.c | 358 +++++++++++++++++++++++++++++++++++ + mpn/s390_64/z13/common-vec.h | 175 +++++++++++++++++ + mpn/s390_64/z13/mul_1.c | 31 +++ + 3 files changed, 564 insertions(+) + create mode 100644 mpn/s390_64/z13/addmul_1.c + create mode 100644 mpn/s390_64/z13/common-vec.h + create mode 100644 mpn/s390_64/z13/mul_1.c + +diff --git a/mpn/s390_64/z13/addmul_1.c b/mpn/s390_64/z13/addmul_1.c +new file mode 100644 +index 000000000..022e5edcc +--- /dev/null ++++ b/mpn/s390_64/z13/addmul_1.c +@@ -0,0 +1,358 @@ ++/* Addmul_1 / mul_1 for IBM z13 and later ++ Contributed by Marius Hillenbrand ++ ++Copyright 2021 Free Software Foundation, Inc. ++ ++This file is part of the GNU MP Library. ++ ++The GNU MP Library is free software; you can redistribute it and/or modify ++it under the terms of either: ++ ++ * the GNU Lesser General Public License as published by the Free ++ Software Foundation; either version 3 of the License, or (at your ++ option) any later version. ++ ++or ++ ++ * the GNU General Public License as published by the Free Software ++ Foundation; either version 2 of the License, or (at your option) any ++ later version. ++ ++or both in parallel, as here. ++ ++The GNU MP Library is distributed in the hope that it will be useful, but ++WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY ++or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License ++for more details. ++ ++You should have received copies of the GNU General Public License and the ++GNU Lesser General Public License along with the GNU MP Library. If not, ++see https://www.gnu.org/licenses/. */ ++ ++#include "gmp-impl.h" ++#include "s390_64/z13/common-vec.h" ++ ++#undef FUNCNAME ++ ++#ifdef DO_INLINE ++# ifdef OPERATION_addmul_1 ++# define ADD ++# define FUNCNAME inline_addmul_1 ++# elif defined(OPERATION_mul_1) ++# define FUNCNAME inline_mul_1 ++# endif ++ ++#else ++# ifdef OPERATION_addmul_1 ++# define ADD ++# define FUNCNAME mpn_addmul_1 ++# elif defined(OPERATION_mul_1) ++# define FUNCNAME mpn_mul_1 ++# endif ++#endif ++ ++#ifdef DO_INLINE ++static inline mp_limb_t ++FUNCNAME (mp_ptr rp, mp_srcptr s1p, mp_size_t n, mp_limb_t s2limb) ++ __attribute__ ((always_inline)); ++ ++static inline ++#endif ++mp_limb_t ++FUNCNAME (mp_ptr rp, mp_srcptr s1p, mp_size_t n, mp_limb_t s2limb) ++{ ++ ASSERT (n >= 1); ++ ASSERT (MPN_SAME_OR_INCR_P(rp, s1p, n)); ++ ++ /* Combine 64x64 multiplication into GPR pairs (MLGR) with 128-bit adds in ++ VRs (using each VR as a single 128-bit accumulator). ++ The inner loop is unrolled to four limbs, with two blocks of four ++ multiplications each. Since the MLGR operation operates on even/odd GPR ++ pairs, pin the products appropriately. 
*/ ++ ++ /* products as GPR pairs */ ++ register mp_limb_t p0_high asm("r0"); ++ register mp_limb_t p0_low asm("r1"); ++ ++ register mp_limb_t p1_high asm("r8"); ++ register mp_limb_t p1_low asm("r9"); ++ ++ register mp_limb_t p2_high asm("r6"); ++ register mp_limb_t p2_low asm("r7"); ++ ++ register mp_limb_t p3_high asm("r10"); ++ register mp_limb_t p3_low asm("r11"); ++ ++ /* carry flag for 128-bit add in VR for first carry chain */ ++ vec_t carry_vec0 = { .dw = vec_splat_u64 (0) }; ++ mp_limb_t carry_limb = 0; ++ ++#ifdef ADD ++ /* 2nd carry flag for 2nd carry chain with addmul */ ++ vec_t carry_vec1 = { .dw = vec_splat_u64 (0) }; ++ vec_t sum0; ++ vec_t rp0_addend, rp1_addend; ++ rp0_addend.dw = vec_splat_u64 (0); ++ rp1_addend.dw = vec_splat_u64 (0); ++#endif ++ vec_t sum1; ++ ++ vec_t carry_prod = { .dw = vec_splat_u64 (0) }; ++ ++ /* The scalar multiplications compete with pointer and index increments for ++ * issue ports. Thus, increment the loop index in the middle of the loop so ++ * that the operations for the next iteration's multiplications can be ++ * loaded in time (looks horrible, yet helps performance) and make sure we ++ * use addressing with base reg + index reg + immediate displacement ++ * (so that only the single index needs incrementing, instead of multiple ++ * pointers). */ ++#undef LOOP_ADVANCE ++#undef IDX_OFFSET ++ ++#define LOOP_ADVANCE 4 * sizeof (mp_limb_t) ++#define IDX_OFFSET (LOOP_ADVANCE) ++ register ssize_t idx = 0 - IDX_OFFSET; ++ ++ /* ++ * branch-on-count implicitly hint to the branch prediction as taken, while ++ * compare-and-branch hints as not taken. currently, using branch-on-count ++ * has a performance advantage, but it is not clear that it is generally the ++ * better choice (e.g., branch-on-count requires decrementing the separate ++ * counter). so, allow switching the loop condition to enable either ++ * category of branch instructions: ++ * - idx is less than an upper bound, for compare-and-branch ++ * - iteration counter greater than zero, for branch-on-count ++ */ ++#define BRCTG ++#ifdef BRCTG ++ ssize_t iterations = (size_t)n / 4; ++#else ++ ssize_t const idx_bound = n * sizeof (mp_limb_t) - IDX_OFFSET; ++#endif ++ ++ /* products will be transferred into VRs before adding up. ++ * see main loop below for comments on accumulation scheme. 
*/ ++ vec_t product0, product1, product2; ++ ++ product0.dw = vec_splat_u64 (0); ++ ++ switch ((size_t)n % 4) ++ { ++ case 0: ++ break; ++ ++ case 1: ++ idx = 1 * sizeof (mp_limb_t) - IDX_OFFSET; ++ ++ p3_low = s1p[0]; ++ s390_umul_ppmm (p3_high, p3_low, s2limb); ++ ++#ifdef ADD ++ rp0_addend.dw[1] = rp[0]; ++ product0.dw[1] = p3_low; ++ ++ sum0.sw = vec_add_u128 (product0.sw, rp0_addend.sw); ++ carry_vec1.dw = vec_permi (sum0.dw, sum0.dw, 0); ++ ++ rp[0] = sum0.dw[1]; ++#else ++ rp[0] = p3_low; ++#endif ++ ++ carry_limb = p3_high; ++ break; ++ ++ case 2: ++ p0_low = s1p[0]; ++ p3_low = s1p[1]; ++ idx = 2 * sizeof (mp_limb_t) - IDX_OFFSET; ++ ++ s390_double_umul_ppmm (p0_high, p0_low, p3_high, p3_low, s2limb); ++ ++ carry_prod.dw[0] = p3_low; ++ ++ product0.dw = vec_load_2di_as_pair (p0_high, p0_low); ++ ++ carry_limb = p3_high; ++ ++#ifdef ADD ++ rp0_addend = vec_load_elements_reversed (rp, 0); ++ sum0.sw = vec_add_u128 (carry_prod.sw, rp0_addend.sw); ++ carry_vec0.sw = vec_addc_u128 (carry_prod.sw, rp0_addend.sw); ++ ++ sum1.sw = vec_add_u128 (sum0.sw, product0.sw); ++ carry_vec1.sw = vec_addc_u128 (sum0.sw, product0.sw); ++#else ++ sum1.sw = vec_add_u128 (carry_prod.sw, product0.sw); ++ carry_vec0.sw = vec_addc_u128 (carry_prod.sw, product0.sw); ++#endif ++ ++ vec_store_elements_reversed (rp, 0, sum1); ++ ++ break; ++ ++ case 3: ++ idx = 3 * sizeof (mp_limb_t) - IDX_OFFSET; ++ ++ p0_low = s1p[0]; ++ s390_umul_ppmm (p0_high, p0_low, s2limb); ++ ++#ifdef ADD ++ rp0_addend.dw[1] = rp[0]; ++ product0.dw[1] = p0_low; ++ ++ sum0.sw = vec_add_u128 (product0.sw, rp0_addend.sw); ++ carry_vec1.dw = vec_permi (sum0.dw, sum0.dw, 0); ++ ++ rp[0] = sum0.dw[1]; ++#else ++ rp[0] = p0_low; ++#endif ++ carry_limb = p0_high; ++ ++ p1_low = s1p[1]; ++ p3_low = s1p[2]; ++ ++ s390_double_umul_ppmm (p1_high, p1_low, p3_high, p3_low, s2limb); ++ ++ carry_prod.dw = vec_load_2di_as_pair (p3_low, carry_limb); ++ product1.dw = vec_load_2di_as_pair (p1_high, p1_low); ++ carry_limb = p3_high; ++ ++#ifdef ADD ++ rp0_addend = vec_load_elements_reversed (rp, 8); ++ sum0.sw = vec_add_u128 (carry_prod.sw, rp0_addend.sw); ++ carry_vec0.sw = vec_addc_u128 (carry_prod.sw, rp0_addend.sw); ++ ++ sum1.sw = vec_adde_u128 (sum0.sw, product1.sw, carry_vec1.sw); ++ carry_vec1.sw = vec_addec_u128 (sum0.sw, product1.sw, carry_vec1.sw); ++#else ++ sum1.sw = vec_adde_u128 (carry_prod.sw, product1.sw, carry_vec0.sw); ++ carry_vec0.sw ++ = vec_addec_u128 (carry_prod.sw, product1.sw, carry_vec0.sw); ++#endif ++ vec_store_elements_reversed (rp, 8, sum1); ++ break; ++ } ++ ++#ifdef BRCTG ++ for (; iterations > 0; iterations--) ++ { ++#else ++ while (idx < idx_bound) ++ { ++#endif ++ vec_t overlap_addend0; ++ vec_t overlap_addend1; ++ ++ /* The 64x64->128 MLGR multiplies two factors in GPRs and stores the ++ * result in a GPR pair. One of the factors is taken from the GPR pair ++ * and overwritten. ++ * To reuse factors, it turned out cheaper to load limbs multiple times ++ * than copying GPR contents. Enforce that and the use of addressing by ++ * base + index gpr + immediate displacement via inline asm. 
++ */ ++ ASM_LOADGPR (p0_low, s1p, idx, 0 + IDX_OFFSET); ++ ASM_LOADGPR (p1_low, s1p, idx, 8 + IDX_OFFSET); ++ ASM_LOADGPR (p2_low, s1p, idx, 16 + IDX_OFFSET); ++ ASM_LOADGPR (p3_low, s1p, idx, 24 + IDX_OFFSET); ++ ++ /* ++ * accumulate products as follows (for addmul): ++ * | rp[i+3] | rp[i+2] | rp[i+1] | rp[i] | ++ * p0_high | p0_low | ++ * p1_high | p1_low | carry-limb in ++ * p2_high | p2_low | ++ * c-limb out <- p3_high | p3_low | ++ * | < 128-bit VR > < 128-bit VR > ++ * ++ * < rp1_addend > < rp0_addend > ++ * carry-chain 0 <- + <- + <- carry_vec0[127] ++ * < product1 > < product0 > ++ * carry-chain 1 <- + <- + <- carry_vec1[127] ++ * < overlap_addend1 > < overlap_addend0 > ++ * ++ * note that a 128-bit add with carry in + out is built from two insns ++ * - vec_adde_u128 (vacq) provides sum ++ * - vec_addec_u128 (vacccq) provides the new carry bit ++ */ ++ ++ s390_double_umul_ppmm (p0_high, p0_low, p1_high, p1_low, s2limb); ++ ++ /* ++ * "barrier" to enforce scheduling loads for all limbs and first round ++ * of MLGR before anything else. ++ */ ++ asm volatile(""); ++ ++ product0.dw = vec_load_2di_as_pair (p0_high, p0_low); ++ ++#ifdef ADD ++ rp0_addend = vec_load_elements_reversed_idx (rp, idx, 0 + IDX_OFFSET); ++ rp1_addend = vec_load_elements_reversed_idx (rp, idx, 16 + IDX_OFFSET); ++#endif ++ /* increment loop index to unblock dependant loads of limbs for the next ++ * iteration (see above at #define LOOP_ADVANCE) */ ++ idx += LOOP_ADVANCE; ++ ++ s390_double_umul_ppmm (p2_high, p2_low, p3_high, p3_low, s2limb); ++ ++ overlap_addend0.dw = vec_load_2di_as_pair (p1_low, carry_limb); ++ asm volatile(""); ++ ++#ifdef ADD ++ sum0.sw = vec_adde_u128 (product0.sw, rp0_addend.sw, carry_vec0.sw); ++ sum1.sw = vec_adde_u128 (sum0.sw, overlap_addend0.sw, carry_vec1.sw); ++ ++ carry_vec0.sw ++ = vec_addec_u128 (product0.sw, rp0_addend.sw, carry_vec0.sw); ++ carry_vec1.sw ++ = vec_addec_u128 (sum0.sw, overlap_addend0.sw, carry_vec1.sw); ++#else ++ sum1.sw = vec_adde_u128 (product0.sw, overlap_addend0.sw, carry_vec0.sw); ++ carry_vec0.sw ++ = vec_addec_u128 (product0.sw, overlap_addend0.sw, carry_vec0.sw); ++#endif ++ ++ asm volatile(""); ++ product2.dw = vec_load_2di_as_pair (p2_high, p2_low); ++ overlap_addend1.dw = vec_load_2di_as_pair (p3_low, p1_high); ++ ++ vec_t sum4; ++ ++#ifdef ADD ++ vec_t sum3; ++ sum3.sw = vec_adde_u128 (product2.sw, rp1_addend.sw, carry_vec0.sw); ++ sum4.sw = vec_adde_u128 (sum3.sw, overlap_addend1.sw, carry_vec1.sw); ++ ++ carry_vec0.sw ++ = vec_addec_u128 (product2.sw, rp1_addend.sw, carry_vec0.sw); ++ carry_vec1.sw ++ = vec_addec_u128 (sum3.sw, overlap_addend1.sw, carry_vec1.sw); ++#else ++ sum4.sw = vec_adde_u128 (product2.sw, overlap_addend1.sw, carry_vec0.sw); ++ carry_vec0.sw ++ = vec_addec_u128 (product2.sw, overlap_addend1.sw, carry_vec0.sw); ++#endif ++ vec_store_elements_reversed_idx (rp, idx, IDX_OFFSET - LOOP_ADVANCE, ++ sum1); ++ vec_store_elements_reversed_idx (rp, idx, 16 + IDX_OFFSET - LOOP_ADVANCE, ++ sum4); ++ ++ carry_limb = p3_high; ++ } ++ ++#ifdef ADD ++ carry_vec0.dw += carry_vec1.dw; ++ carry_limb += carry_vec0.dw[1]; ++#else ++ carry_limb += carry_vec0.dw[1]; ++#endif ++ ++ return carry_limb; ++} ++ ++#undef OPERATION_addmul_1 ++#undef OPERATION_mul_1 ++#undef FUNCNAME ++#undef ADD +diff --git a/mpn/s390_64/z13/common-vec.h b/mpn/s390_64/z13/common-vec.h +new file mode 100644 +index 000000000..a59e6eefe +--- /dev/null ++++ b/mpn/s390_64/z13/common-vec.h +@@ -0,0 +1,175 @@ ++/* Common vector helpers and macros for IBM z13 and later ++ 
++Copyright 2021 Free Software Foundation, Inc. ++ ++This file is part of the GNU MP Library. ++ ++The GNU MP Library is free software; you can redistribute it and/or modify ++it under the terms of either: ++ ++ * the GNU Lesser General Public License as published by the Free ++ Software Foundation; either version 3 of the License, or (at your ++ option) any later version. ++ ++or ++ ++ * the GNU General Public License as published by the Free Software ++ Foundation; either version 2 of the License, or (at your option) any ++ later version. ++ ++or both in parallel, as here. ++ ++The GNU MP Library is distributed in the hope that it will be useful, but ++WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY ++or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License ++for more details. ++ ++You should have received copies of the GNU General Public License and the ++GNU Lesser General Public License along with the GNU MP Library. If not, ++see https://www.gnu.org/licenses/. */ ++ ++#ifndef __S390_64_Z13_COMMON_VEC_H ++#define __S390_64_Z13_COMMON_VEC_H ++ ++#include ++#include ++ ++/* ++ * Vector intrinsics use vector element types that kind-of make sense for the ++ * specific operation (e.g., vec_permi permutes doublewords). To use VRs ++ * interchangeably with different intrinsics, typedef the two variants and wrap ++ * them in a union. ++ */ ++#define VLEN_BYTES 16 ++typedef unsigned long long v2di __attribute__ ((vector_size (VLEN_BYTES))); ++typedef unsigned char v16qi __attribute__ ((vector_size (VLEN_BYTES))); ++ ++/* ++ * The Z vector intrinsics use vectors with different element types (e.g., ++ * v16qi for the 128-bit adds and v2di for vec_permi). ++ */ ++union vec ++{ ++ v2di dw; ++ v16qi sw; ++}; ++ ++typedef union vec vec_t; ++ ++/* ++ * single-instruction combine of two GPRs into a VR ++ */ ++static inline v2di ++vec_load_2di_as_pair (unsigned long a, unsigned long b) ++{ ++ v2di res; ++ __asm__("vlvgp\t%0,%1,%2" : "=v"(res) : "r"(a), "r"(b)); ++ return res; ++} ++ ++/* ++ * 64x64 mult where caller needs to care about proper register allocation: ++ * multiply xl with m1, treating both as unsigned, and place the result in ++ * xh:xl. ++ * mlgr operates on register pairs, so xh must be an even gpr followed by xl ++ */ ++#define s390_umul_ppmm(xh, xl, m1) \ ++ do \ ++ { \ ++ asm("mlgr\t%0,%3" : "=r"(xh), "=r"(xl) : "%1"(xl), "r"(m1)); \ ++ } \ ++ while (0); ++ ++/* ++ * two 64x64 multiplications, scheduled so that they will dispatch and issue to ++ * different sides: each mlgr is dispatched alone in an instruction group and ++ * subsequent groups will issue on different execution sides. ++ * there is a variant where both products use the same multiplicand and one ++ * that uses two different multiplicands. constraints from s390_umul_ppmm apply ++ * here. 
++ */ ++#define s390_double_umul_ppmm(X0H, X0L, X1H, X1L, MX) \ ++ do \ ++ { \ ++ asm("mlgr\t%[x0h],%[mx]\n\t" \ ++ "mlgr\t%[x1h],%[mx]" \ ++ : [x0h] "=&r"(X0H), [x0l] "=&r"(X0L), [x1h] "=r"(X1H), \ ++ [x1l] "=r"(X1L) \ ++ : "[x0l]"(X0L), "[x1l]"(X1L), [mx] "r"(MX)); \ ++ } \ ++ while (0); ++ ++#define s390_double_umul_ppmm_distinct(X0H, X0L, X1H, X1L, MX0, MX1) \ ++ do \ ++ { \ ++ asm("mlgr\t%[x0h],%[mx0]\n\t" \ ++ "mlgr\t%[x1h],%[mx1]" \ ++ : [x0h] "=&r"(X0H), [x0l] "=&r"(X0L), [x1h] "=r"(X1H), \ ++ [x1l] "=r"(X1L) \ ++ : "[x0l]"(X0L), "[x1l]"(X1L), [mx0] "r"(MX0), [mx1] "r"(MX1)); \ ++ } \ ++ while (0); ++ ++#define ASM_LOADGPR_BASE(DST, BASE, OFFSET) \ ++ asm volatile("lg\t%[r],%[off](%[b])" \ ++ : [r] "=r"(DST) \ ++ : [b] "a"(BASE), [off] "L"(OFFSET) \ ++ : "memory"); ++ ++#define ASM_LOADGPR(DST, BASE, INDEX, OFFSET) \ ++ asm volatile("lg\t%[r],%[off](%[b],%[x])" \ ++ : [r] "=r"(DST) \ ++ : [b] "a"(BASE), [x] "a"(INDEX), [off] "L"(OFFSET) \ ++ : "memory"); ++ ++/* ++ * Load a vector register from memory and swap the two 64-bit doubleword ++ * elements. ++ */ ++static inline vec_t ++vec_load_elements_reversed_idx (mp_limb_t const *base, ssize_t const index, ++ ssize_t const offset) ++{ ++ vec_t res; ++ char *ptr = (char *)base; ++ ++ res.sw = *(v16qi *)(ptr + index + offset); ++ res.dw = vec_permi (res.dw, res.dw, 2); ++ ++ return res; ++} ++ ++static inline vec_t ++vec_load_elements_reversed (mp_limb_t const *base, ssize_t const offset) ++{ ++ return vec_load_elements_reversed_idx (base, 0, offset); ++} ++ ++/* ++ * Store a vector register to memory and swap the two 64-bit doubleword ++ * elements. ++ */ ++static inline void ++vec_store_elements_reversed_idx (mp_limb_t *base, ssize_t const index, ++ ssize_t const offset, vec_t vec) ++{ ++ char *ptr = (char *)base; ++ ++ vec.dw = vec_permi (vec.dw, vec.dw, 2); ++ *(v16qi *)(ptr + index + offset) = vec.sw; ++} ++ ++static inline void ++vec_store_elements_reversed (mp_limb_t *base, ssize_t const offset, vec_t vec) ++{ ++ vec_store_elements_reversed_idx (base, 0, offset, vec); ++} ++ ++#define ASM_VZERO(VEC) \ ++ do \ ++ { \ ++ asm("vzero\t%[vec]" : [vec] "=v"(VEC)); \ ++ } \ ++ while (0) ++ ++#endif +diff --git a/mpn/s390_64/z13/mul_1.c b/mpn/s390_64/z13/mul_1.c +new file mode 100644 +index 000000000..7584dc8c7 +--- /dev/null ++++ b/mpn/s390_64/z13/mul_1.c +@@ -0,0 +1,31 @@ ++/* mul_1 for IBM z13 or later ++ ++Copyright 2021 Free Software Foundation, Inc. ++ ++This file is part of the GNU MP Library. ++ ++The GNU MP Library is free software; you can redistribute it and/or modify ++it under the terms of either: ++ ++ * the GNU Lesser General Public License as published by the Free ++ Software Foundation; either version 3 of the License, or (at your ++ option) any later version. ++ ++or ++ ++ * the GNU General Public License as published by the Free Software ++ Foundation; either version 2 of the License, or (at your option) any ++ later version. ++ ++or both in parallel, as here. ++ ++The GNU MP Library is distributed in the hope that it will be useful, but ++WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY ++or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License ++for more details. ++ ++You should have received copies of the GNU General Public License and the ++GNU Lesser General Public License along with the GNU MP Library. If not, ++see https://www.gnu.org/licenses/. 
*/ ++ ++#include "s390_64/z13/addmul_1.c" +-- +2.40.1 + diff --git a/SOURCES/ibm_z13_simd_part2.patch b/SOURCES/ibm_z13_simd_part2.patch new file mode 100644 index 0000000..3d216d9 --- /dev/null +++ b/SOURCES/ibm_z13_simd_part2.patch @@ -0,0 +1,535 @@ +Co-authored-by: Stefan Liebler +--- + mpn/s390_64/z13/aormul_2.c | 476 +++++++++++++++++++++++++++++++++++ + mpn/s390_64/z13/gmp-mparam.h | 37 +++ + 2 files changed, 513 insertions(+) + create mode 100644 mpn/s390_64/z13/aormul_2.c + create mode 100644 mpn/s390_64/z13/gmp-mparam.h + +diff --git a/mpn/s390_64/z13/aormul_2.c b/mpn/s390_64/z13/aormul_2.c +new file mode 100644 +index 000000000..9a69fc38e +--- /dev/null ++++ b/mpn/s390_64/z13/aormul_2.c +@@ -0,0 +1,476 @@ ++/* Addmul_2 / mul_2 for IBM z13 or later ++ ++Copyright 2021 Free Software Foundation, Inc. ++ ++This file is part of the GNU MP Library. ++ ++The GNU MP Library is free software; you can redistribute it and/or modify ++it under the terms of either: ++ ++ * the GNU Lesser General Public License as published by the Free ++ Software Foundation; either version 3 of the License, or (at your ++ option) any later version. ++ ++or ++ ++ * the GNU General Public License as published by the Free Software ++ Foundation; either version 2 of the License, or (at your option) any ++ later version. ++ ++or both in parallel, as here. ++ ++The GNU MP Library is distributed in the hope that it will be useful, but ++WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY ++or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License ++for more details. ++ ++You should have received copies of the GNU General Public License and the ++GNU Lesser General Public License along with the GNU MP Library. If not, ++see https://www.gnu.org/licenses/. */ ++ ++#include "gmp-impl.h" ++ ++#include "s390_64/z13/common-vec.h" ++ ++#undef FUNCNAME ++ ++#ifdef DO_INLINE ++# ifdef OPERATION_addmul_2 ++# define ADD ++# define FUNCNAME inline_addmul_2 ++# elif defined(OPERATION_mul_2) ++# define FUNCNAME inline_mul_2 ++# else ++# error Missing define for operation to perform ++# endif ++#else ++# ifdef OPERATION_addmul_2 ++# define ADD ++# define FUNCNAME mpn_addmul_2 ++# elif defined(OPERATION_mul_2) ++# define FUNCNAME mpn_mul_2 ++# else ++# error Missing define for operation to perform ++# endif ++#endif ++ ++#ifdef DO_INLINE ++static inline mp_limb_t ++FUNCNAME (mp_limb_t *rp, const mp_limb_t *up, mp_size_t n, const mp_limb_t *vp) ++ __attribute__ ((always_inline)); ++ ++static inline ++#endif ++mp_limb_t ++FUNCNAME (mp_limb_t *rp, const mp_limb_t *up, mp_size_t n, ++ const mp_limb_t *vp) ++{ ++ ++ /* Combine 64x64 multiplication into GPR pairs (MLGR) with 128-bit adds in ++ VRs (using each VR as a single 128-bit accumulator). ++ The inner loop is unrolled to four limbs, with two blocks of four ++ multiplications each. Since the MLGR operation operates on even/odd GPR ++ pairs, pin the products appropriately. 
*/ ++ ++ register mp_limb_t p0_high asm("r0"); ++ register mp_limb_t p0_low asm("r1"); ++ ++ register mp_limb_t p1_high asm("r8"); ++ register mp_limb_t p1_low asm("r9"); ++ ++ register mp_limb_t p2_high asm("r6"); ++ register mp_limb_t p2_low asm("r7"); ++ ++ register mp_limb_t p3_high asm("r10"); ++ register mp_limb_t p3_low asm("r11"); ++ ++ vec_t carry_prod = { .dw = vec_splat_u64 (0) }; ++ vec_t zero = { .dw = vec_splat_u64 (0) }; ++ ++ /* two carry-bits for the 128-bit VR adds - stored in VRs */ ++#ifdef ADD ++ vec_t carry_vec0 = { .dw = vec_splat_u64 (0) }; ++#endif ++ vec_t carry_vec1 = { .dw = vec_splat_u64 (0) }; ++ ++ vec_t tmp; ++ ++ vec_t sum0, sum1; ++ ++ /* products transferred into VRs for accumulating there */ ++ vec_t pv0, pv3; ++ vec_t pv1_low, pv1_high, pv2_low, pv2_high; ++ vec_t low, middle, high; ++#ifdef ADD ++ vec_t rp0, rp1; ++#endif ++ ++ register mp_limb_t v0 asm("r12"); ++ register mp_limb_t v1 asm("r5"); ++ v0 = vp[0]; ++ v1 = vp[1]; ++ ++ /* The scalar multiplications compete with pointer and index increments for ++ * issue ports. Thus, increment the loop index in the middle of the loop so ++ * that the operations for the next iteration's multiplications can be ++ * loaded in time (looks horrible, yet helps performance) and make sure we ++ * use addressing with base reg + index reg + immediate displacement ++ * (so that only the single index needs incrementing, instead of multiple ++ * pointers). */ ++#undef LOOP_ADVANCE ++#define LOOP_ADVANCE (4 * sizeof (mp_limb_t)) ++#define IDX_OFFSET (LOOP_ADVANCE) ++ ++ register ssize_t idx = 0 - IDX_OFFSET; ++#ifdef BRCTG ++ ssize_t iterations = (size_t)n / 4; ++#else ++ ssize_t const idx_bound = n * sizeof (mp_limb_t) - IDX_OFFSET; ++#endif ++ ++ /* ++ * To minimize latency in the carry chain, accumulate in VRs with 128-bit ++ * adds with carry in and out. As a downside, these require two insns for ++ * each add - one to calculate the sum, one to deliver the carry out. ++ * To reduce the overall number of insns to execute, combine adding up ++ * product limbs such that there cannot be a carry out and one (for mul) or ++ * two (for addmul) adds with carry chains. ++ * ++ * Since (2^64-1) * (2^64-1) = (2^128-1) - 2 * (2^64-1), we can add two ++ * limbs into each 128-bit product without causing carry out. ++ * ++ * For each block of 2 limbs * 2 limbs ++ * ++ * | u[i] * v[0] (p2) | ++ * | u[i] * v[1] (p0) | ++ * | u[i+1] * v[0](p1) | ++ * | u[i+1] * v[1](p3) | ++ * < 128 bits > < 128 bits > ++ * ++ * we can begin accumulating with "simple" carry-oblivious 128-bit adds: ++ * - p0 + low limb of p1 ++ * + high limb of p2 ++ * and combine resulting low limb with p2's low limb ++ * - p3 + high limb of p1 ++ * + high limb of sum above ++ * ... which will will result in two 128-bit limbs to be fed into the carry ++ * chain(s). ++ * Overall, that scheme saves instructions and improves performance, despite ++ * slightly increasing latency between multiplications and carry chain (yet ++ * not in the carry chain). 
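++   *
++   * [annotation, not part of the upstream patch: the claim that two extra
++   *  limbs cannot cause a carry out can be checked directly, since
++   *  (2^64-1) * (2^64-1) = 2^128 - 2*2^64 + 1 = (2^128-1) - 2*(2^64-1),
++   *  so even the largest possible product plus two limbs of at most 2^64-1
++   *  each sums to at most 2^128-1 and still fits the 128-bit accumulator.]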
++ */ ++ ++#define LOAD_LOW_LIMB(VEC, LIMB) \ ++ do \ ++ { \ ++ asm("vzero\t%[vec]\n\t" \ ++ "vlvgg\t%[vec],%[limb],1" \ ++ : [vec] "=v"(VEC) \ ++ : [limb] "r"(LIMB)); \ ++ } \ ++ while (0) ++ ++ /* for the 128-bit adds in the carry chain, to calculate a + b + carry-in we ++ * need paired vec_adde_u128 (delivers sum) and vec_addec_u128 (delivers new ++ * carry) */ ++#define ADD_UP2_CARRY_INOUT(SUMIDX, CARRYIDX, ADDEND1, ADDEND2) \ ++ do \ ++ { \ ++ sum##SUMIDX.sw \ ++ = vec_adde_u128 (ADDEND1.sw, ADDEND2.sw, carry_vec##CARRYIDX.sw); \ ++ carry_vec##CARRYIDX.sw \ ++ = vec_addec_u128 (ADDEND1.sw, ADDEND2.sw, carry_vec##CARRYIDX.sw); \ ++ } \ ++ while (0) ++ ++#define ADD_UP_CARRY_INOUT(SUMIDX, ADDEND1, ADDEND2) \ ++ ADD_UP2_CARRY_INOUT (SUMIDX, SUMIDX, ADDEND1, ADDEND2) ++ ++ /* variant without carry-in for prologue */ ++#define ADD_UP2_CARRY_OUT(SUMIDX, CARRYIDX, ADDEND1, ADDEND2) \ ++ do \ ++ { \ ++ sum##SUMIDX.sw = vec_add_u128 (ADDEND1.sw, ADDEND2.sw); \ ++ carry_vec##CARRYIDX.sw = vec_addc_u128 (ADDEND1.sw, ADDEND2.sw); \ ++ } \ ++ while (0) ++ ++#define ADD_UP_CARRY_OUT(SUMIDX, ADDEND1, ADDEND2) \ ++ ADD_UP2_CARRY_OUT (SUMIDX, SUMIDX, ADDEND1, ADDEND2) ++ ++ /* prologue for 4x-unrolled main loop */ ++ switch ((size_t)n % 4) ++ { ++ case 1: ++ ASM_LOADGPR_BASE (p0_low, up, 0); ++ ASM_LOADGPR_BASE (p1_low, up, 0); ++ s390_double_umul_ppmm_distinct (p0_high, p0_low, p1_high, p1_low, v0, v1); ++ carry_prod.dw = vec_load_2di_as_pair (p1_high, p1_low); ++ ++/* gcc tries to be too clever and vlr from a reg that is already zero. vzero is ++ * cheaper. */ ++# define NEW_CARRY(VEC, LIMB) \ ++ do \ ++ { \ ++ asm("vzero\t%[vec]\n\t" \ ++ "vlvgg\t%[vec],%[limb],1" \ ++ : [vec] "=v"(VEC) \ ++ : [limb] "r"(LIMB)); \ ++ } \ ++ while (0) ++ ++ NEW_CARRY (tmp, p0_high); ++ ++ carry_prod.sw = vec_add_u128 (carry_prod.sw, tmp.sw); ++#ifdef ADD ++ carry_vec1.dw[1] = __builtin_add_overflow (rp[0], p0_low, rp); ++#else ++ rp[0] = p0_low; ++#endif ++ idx += sizeof (mp_limb_t); ++ break; ++ ++ case 2: ++ ASM_LOADGPR_BASE (p0_low, up, 0); ++ ASM_LOADGPR_BASE (p1_low, up, 8); ++ ASM_LOADGPR_BASE (p2_low, up, 0); ++ ASM_LOADGPR_BASE (p3_low, up, 8); ++ ++ asm("" ++ : "=r"(p0_low), "=r"(p2_low) ++ : "r"(p3_low), "0"(p0_low), "r"(p1_low), "1"(p2_low)); ++ s390_double_umul_ppmm_distinct (p0_high, p0_low, p1_high, p1_low, v1, v0); ++ s390_double_umul_ppmm_distinct (p2_high, p2_low, p3_high, p3_low, v0, v1); ++ ++ pv0.dw = vec_load_2di_as_pair (p0_high, p0_low); ++ LOAD_LOW_LIMB (pv1_low, p1_low); ++ LOAD_LOW_LIMB (pv1_high, p1_high); ++ pv0.sw = vec_add_u128 (pv0.sw, pv1_low.sw); ++ LOAD_LOW_LIMB (pv2_high, p2_high); ++ pv3.dw = vec_load_2di_as_pair (p3_high, p3_low); ++ LOAD_LOW_LIMB (pv2_low, p2_low); ++ pv3.sw = vec_add_u128 (pv3.sw, pv1_high.sw); ++ middle.sw = vec_add_u128 (pv0.sw, pv2_high.sw); ++ low.dw = vec_permi (middle.dw, pv2_low.dw, 3); ++ middle.dw = vec_permi (zero.dw, middle.dw, 0); ++ high.sw = vec_add_u128 (middle.sw, pv3.sw); ++#ifdef ADD ++ rp0 = vec_load_elements_reversed (rp, 0); ++ ADD_UP_CARRY_OUT (0, rp0, carry_prod); ++#else ++ sum0 = carry_prod; ++#endif ++ ADD_UP_CARRY_OUT (1, sum0, low); ++ vec_store_elements_reversed (rp, 0, sum1); ++ carry_prod = high; ++ ++ idx += 2 * sizeof (mp_limb_t); ++ break; ++ ++ case 3: ++ ASM_LOADGPR_BASE (p0_low, up, 0); ++ ASM_LOADGPR_BASE (p1_low, up, 0); ++ s390_double_umul_ppmm_distinct (p0_high, p0_low, p1_high, p1_low, v0, v1); ++ carry_prod.dw = vec_load_2di_as_pair (p1_high, p1_low); ++ NEW_CARRY (tmp, p0_high); ++ carry_prod.sw = vec_add_u128 
(carry_prod.sw, tmp.sw); ++ ++#ifdef ADD ++ carry_vec1.dw[1] = __builtin_add_overflow (rp[0], p0_low, rp); ++#else ++ rp[0] = p0_low; ++#endif ++ ++ ASM_LOADGPR_BASE (p0_low, up, 8); ++ ASM_LOADGPR_BASE (p1_low, up, 16); ++ ASM_LOADGPR_BASE (p2_low, up, 8); ++ ASM_LOADGPR_BASE (p3_low, up, 16); ++ ++ asm("" ++ : "=r"(p0_low), "=r"(p2_low) ++ : "r"(p3_low), "0"(p0_low), "r"(p1_low), "1"(p2_low)); ++ s390_double_umul_ppmm_distinct (p0_high, p0_low, p1_high, p1_low, v1, v0); ++ s390_double_umul_ppmm_distinct (p2_high, p2_low, p3_high, p3_low, v0, v1); ++ ++ pv0.dw = vec_load_2di_as_pair (p0_high, p0_low); ++ ++ LOAD_LOW_LIMB (pv1_low, p1_low); ++ LOAD_LOW_LIMB (pv1_high, p1_high); ++ ++ pv0.sw = vec_add_u128 (pv0.sw, pv1_low.sw); ++ LOAD_LOW_LIMB (pv2_high, p2_high); ++ pv3.dw = vec_load_2di_as_pair (p3_high, p3_low); ++ ++ LOAD_LOW_LIMB (pv2_low, p2_low); ++ ++ pv3.sw = vec_add_u128 (pv3.sw, pv1_high.sw); ++ middle.sw = vec_add_u128 (pv0.sw, pv2_high.sw); ++ ++ low.dw = vec_permi (middle.dw, pv2_low.dw, 3); ++ middle.dw = vec_permi (zero.dw, middle.dw, 0); ++ high.sw = vec_add_u128 (middle.sw, pv3.sw); ++ ++#ifdef ADD ++ vec_t rp0 = vec_load_elements_reversed (rp, 8); ++ ADD_UP_CARRY_OUT (0, rp0, carry_prod); ++#else ++ sum0 = carry_prod; ++#endif ++ ADD_UP_CARRY_INOUT (1, sum0, low); ++ ++ vec_store_elements_reversed (rp, 8, sum1); ++ ++ carry_prod = high; ++ ++ idx += 3 * sizeof (mp_limb_t); ++ break; ++ } ++ ++ /* ++ * branch-on-count implicitly hint to the branch prediction as taken, while ++ * compare-and-branch hints as not taken. currently, using branch-on-count ++ * has a performance advantage, but it is not clear that it is generally ++ * the better choice (e.g., branch-on-count requires decrementing the ++ * separate counter). so, allow switching the loop condition to enable ++ * either category of branch instructions: ++ * - idx is less than an upper bound, for compare-and-branch ++ * - iteration counter greater than zero, for branch-on-count ++ */ ++#ifdef BRCTG ++ for (; iterations > 0; iterations--) ++ { ++#else ++ while (idx < idx_bound) ++ { ++#endif ++ /* The 64x64->128 MLGR multiplies two factors in GPRs and stores the ++ * result in a GPR pair. One of the factors is taken from the GPR pair ++ * and overwritten. ++ * To reuse factors, it turned out cheaper to load limbs multiple times ++ * than copying GPR contents. Enforce that and the use of addressing by ++ * base + index gpr + immediate displacement via inline asm. ++ */ ++ ASM_LOADGPR (p0_low, up, idx, 0 + IDX_OFFSET); ++ ASM_LOADGPR (p1_low, up, idx, 8 + IDX_OFFSET); ++ ASM_LOADGPR (p2_low, up, idx, 0 + IDX_OFFSET); ++ ASM_LOADGPR (p3_low, up, idx, 8 + IDX_OFFSET); ++ ++ s390_double_umul_ppmm_distinct (p0_high, p0_low, p1_high, p1_low, v1, v0); ++ ++ pv0.dw = vec_load_2di_as_pair (p0_high, p0_low); ++ ++ LOAD_LOW_LIMB (pv1_low, p1_low); ++ LOAD_LOW_LIMB (pv1_high, p1_high); ++ ++ s390_double_umul_ppmm_distinct (p2_high, p2_low, p3_high, p3_low, v0, v1); ++ ++ pv0.sw = vec_add_u128 (pv0.sw, pv1_low.sw); ++ LOAD_LOW_LIMB (pv2_high, p2_high); ++ pv3.dw = vec_load_2di_as_pair (p3_high, p3_low); ++ ++ LOAD_LOW_LIMB (pv2_low, p2_low); ++ ++ ASM_LOADGPR (p0_low, up, idx, 16 + IDX_OFFSET); ++ ASM_LOADGPR (p1_low, up, idx, 24 + IDX_OFFSET); ++ ASM_LOADGPR (p2_low, up, idx, 16 + IDX_OFFSET); ++ ASM_LOADGPR (p3_low, up, idx, 24 + IDX_OFFSET); ++ ++ idx += LOOP_ADVANCE; ++ ++ /* ++ * "barrier" to enforce scheduling the index increment before the second ++ * block of multiplications. not required for clang. 
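++       * [annotation, not part of the upstream patch: this "barrier" is an
++       *  empty asm statement whose outputs are tied to its inputs; it emits
++       *  no machine code, but makes the compiler treat the listed values as
++       *  produced at this point, which constrains instruction scheduling.]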
++ */ ++#ifndef __clang__ ++ asm("" ++ : "=r"(idx), "=r"(p0_high), "=r"(p2_high) ++ : "0"(idx), "1"(p0_high), "2"(p2_high)); ++#endif ++ ++ s390_double_umul_ppmm_distinct (p0_high, p0_low, p1_high, p1_low, v1, v0); ++ s390_double_umul_ppmm_distinct (p2_high, p2_low, p3_high, p3_low, v0, v1); ++ ++ /* ++ * "barrier" to enforce scheduling all MLGRs first, before any adding ++ * up. note that clang produces better code without. ++ */ ++#ifndef __clang__ ++ asm("" ++ : "=v"(pv0.sw), "=v"(pv3.sw) ++ : "1"(pv3.sw), "0"(pv0.sw), "r"(p0_high), "r"(p2_high)); ++#endif ++ ++ pv3.sw = vec_add_u128 (pv3.sw, pv1_high.sw); ++ middle.sw = vec_add_u128 (pv0.sw, pv2_high.sw); ++ ++ low.dw = vec_permi (middle.dw, pv2_low.dw, ++ 3); /* least-significant doubleword from both vectors */ ++ middle.dw = vec_permi (zero.dw, middle.dw, 0); ++ high.sw = vec_add_u128 (middle.sw, pv3.sw); ++ ++#ifdef ADD ++ rp0 = vec_load_elements_reversed_idx (rp, idx, ++ 0 + IDX_OFFSET - LOOP_ADVANCE); ++ ADD_UP_CARRY_INOUT (0, rp0, carry_prod); ++#else ++ sum0 = carry_prod; ++#endif ++ ADD_UP_CARRY_INOUT (1, sum0, low); ++ ++ vec_store_elements_reversed_idx (rp, idx, 0 + IDX_OFFSET - LOOP_ADVANCE, ++ sum1); ++ ++ carry_prod = high; ++ ++ vec_t pv0_2, pv3_2; ++ vec_t pv1_low_2, pv1_high_2, pv2_low_2, pv2_high_2; ++ vec_t low_2, middle_2, high_2; ++ vec_t sum2, sum3; ++ ++ pv0_2.dw = vec_load_2di_as_pair (p0_high, p0_low); ++ LOAD_LOW_LIMB (pv1_low_2, p1_low); ++ LOAD_LOW_LIMB (pv1_high_2, p1_high); ++ ++ pv0_2.sw = vec_add_u128 (pv0_2.sw, pv1_low_2.sw); ++ LOAD_LOW_LIMB (pv2_high_2, p2_high); ++ pv3_2.dw = vec_load_2di_as_pair (p3_high, p3_low); ++ pv3_2.sw = vec_add_u128 (pv3_2.sw, pv1_high_2.sw); ++ middle_2.sw = vec_add_u128 (pv0_2.sw, pv2_high_2.sw); ++ ++ LOAD_LOW_LIMB (pv2_low_2, p2_low); ++ low_2.dw ++ = vec_permi (middle_2.dw, pv2_low_2.dw, ++ 3); /* least-significant doubleword from both vectors */ ++ middle_2.dw = vec_permi (zero.dw, middle_2.dw, 0); ++ high_2.sw = vec_add_u128 (middle_2.sw, pv3_2.sw); ++ ++ /* ++ * another "barrier" to influence scheduling. (also helps in clang) ++ */ ++ asm("" : : "v"(pv0_2.sw), "r"(p2_high), "r"(p3_high), "v"(pv3_2.sw)); ++ ++#ifdef ADD ++ rp1 = vec_load_elements_reversed_idx (rp, idx, ++ 16 + IDX_OFFSET - LOOP_ADVANCE); ++ ADD_UP2_CARRY_INOUT (2, 0, rp1, carry_prod); ++#else ++ sum2 = carry_prod; ++#endif ++ ADD_UP2_CARRY_INOUT (3, 1, sum2, low_2); ++ ++ vec_store_elements_reversed_idx (rp, idx, 16 + IDX_OFFSET - LOOP_ADVANCE, ++ sum3); ++ ++ carry_prod = high_2; ++ } ++ ++#ifdef ADD ++ sum0.sw = vec_adde_u128 (carry_prod.sw, carry_vec0.sw, carry_vec1.sw); ++#else ++ sum0.sw = vec_add_u128 (carry_prod.sw, carry_vec1.sw); ++#endif ++ ++ *(mp_ptr) (((char *)rp) + idx + 0 + IDX_OFFSET) = (mp_limb_t)sum0.dw[1]; ++ ++ return (mp_limb_t)sum0.dw[0]; ++} +diff --git a/mpn/s390_64/z13/gmp-mparam.h b/mpn/s390_64/z13/gmp-mparam.h +new file mode 100644 +index 000000000..a17503fd0 +--- /dev/null ++++ b/mpn/s390_64/z13/gmp-mparam.h +@@ -0,0 +1,37 @@ ++/* S/390-64 for IBM z13 gmp-mparam.h -- Compiler/machine parameter header file. ++ ++Copyright 1991, 1993, 1994, 2000-2011 Free Software Foundation, Inc. ++ ++This file is part of the GNU MP Library. ++ ++The GNU MP Library is free software; you can redistribute it and/or modify ++it under the terms of either: ++ ++ * the GNU Lesser General Public License as published by the Free ++ Software Foundation; either version 3 of the License, or (at your ++ option) any later version. 
++ ++or ++ ++ * the GNU General Public License as published by the Free Software ++ Foundation; either version 2 of the License, or (at your option) any ++ later version. ++ ++or both in parallel, as here. ++ ++The GNU MP Library is distributed in the hope that it will be useful, but ++WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY ++or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License ++for more details. ++ ++You should have received copies of the GNU General Public License and the ++GNU Lesser General Public License along with the GNU MP Library. If not, ++see https://www.gnu.org/licenses/. */ ++ ++#define GMP_LIMB_BITS 64 ++#define GMP_LIMB_BYTES 8 ++ ++#define HAVE_NATIVE_mpn_addmul_2 1 ++#define HAVE_NATIVE_mpn_mul_2 1 ++ ++#include "mpn/s390_64/gmp-mparam.h" +-- +2.40.1 diff --git a/SOURCES/ibm_z13_simd_part3.patch b/SOURCES/ibm_z13_simd_part3.patch new file mode 100644 index 0000000..8301e57 --- /dev/null +++ b/SOURCES/ibm_z13_simd_part3.patch @@ -0,0 +1,138 @@ +Co-authored-by: Stefan Liebler +--- + mpn/s390_64/z13/mul_basecase.c | 124 +++++++++++++++++++++++++++++++++ + 1 file changed, 124 insertions(+) + create mode 100644 mpn/s390_64/z13/mul_basecase.c + +diff --git a/mpn/s390_64/z13/mul_basecase.c b/mpn/s390_64/z13/mul_basecase.c +new file mode 100644 +index 000000000..f1b7160b3 +--- /dev/null ++++ b/mpn/s390_64/z13/mul_basecase.c +@@ -0,0 +1,124 @@ ++/* mpn_mul_basecase for IBM z13 and later -- Internal routine to multiply two ++ natural numbers of length m and n. ++ ++ THIS IS AN INTERNAL FUNCTION WITH A MUTABLE INTERFACE. IT IS ONLY ++ SAFE TO REACH THIS FUNCTION THROUGH DOCUMENTED INTERFACES. ++ ++Copyright 2021 Free Software Foundation, Inc. ++ ++This file is part of the GNU MP Library. ++ ++The GNU MP Library is free software; you can redistribute it and/or modify ++it under the terms of either: ++ ++ * the GNU Lesser General Public License as published by the Free ++ Software Foundation; either version 3 of the License, or (at your ++ option) any later version. ++ ++or ++ ++ * the GNU General Public License as published by the Free Software ++ Foundation; either version 2 of the License, or (at your option) any ++ later version. ++ ++or both in parallel, as here. ++ ++The GNU MP Library is distributed in the hope that it will be useful, but ++WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY ++or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License ++for more details. ++ ++You should have received copies of the GNU General Public License and the ++GNU Lesser General Public License along with the GNU MP Library. If not, ++see https://www.gnu.org/licenses/. */ ++ ++#include ++ ++#include "gmp-impl.h" ++ ++/* Note: we explicitly inline all mul and addmul routines here to reduce the ++ * number of branches in prologues of unrolled functions. That comes at the ++ cost of duplicating common loop bodies in object code. */ ++#define DO_INLINE ++ ++/* ++ * tweak loop conditions in addmul subroutines to enable use of ++ * branch-relative-on-count (BRCTG) instructions, which currently results in ++ * better performance. 
++ */ ++#define BRCTG ++ ++#include "s390_64/z13/common-vec.h" ++ ++#define OPERATION_mul_1 ++#include "s390_64/z13/addmul_1.c" ++#undef OPERATION_mul_1 ++ ++#define OPERATION_addmul_1 ++#include "s390_64/z13/addmul_1.c" ++#undef OPERATION_addmul_1 ++ ++#define OPERATION_mul_2 ++#include "s390_64/z13/aormul_2.c" ++#undef OPERATION_mul_2 ++ ++#define OPERATION_addmul_2 ++#include "s390_64/z13/aormul_2.c" ++#undef OPERATION_addmul_2 ++ ++void ++mpn_mul_basecase (mp_ptr rp, mp_srcptr up, mp_size_t un, mp_srcptr vp, ++ mp_size_t vn) ++{ ++ ASSERT (un >= vn); ++ ASSERT (vn >= 1); ++ ASSERT (!MPN_OVERLAP_P (rp, un + vn, up, un)); ++ ASSERT (!MPN_OVERLAP_P (rp, un + vn, vp, vn)); ++ ++ /* The implementations of (add)mul_1/2 are 4x-unrolled. Pull out the branch ++ * for un%4 and inline specific variants. */ ++ ++#define BRANCH_FOR_MOD(N) \ ++ do \ ++ { \ ++ if (vn >= 2) \ ++ { \ ++ rp[un + 1] = inline_mul_2 (rp, up, un, vp); \ ++ rp += 2, vp += 2, vn -= 2; \ ++ } \ ++ else \ ++ { \ ++ rp[un] = inline_mul_1 (rp, up, un, vp[0]); \ ++ return; \ ++ } \ ++ \ ++ while (vn >= 2) \ ++ { \ ++ rp[un + 2 - 1] = inline_addmul_2 (rp, up, un, vp); \ ++ rp += 2, vp += 2, vn -= 2; \ ++ } \ ++ \ ++ while (vn >= 1) \ ++ { \ ++ rp[un] = inline_addmul_1 (rp, up, un, vp[0]); \ ++ rp += 1, vp += 1, vn -= 1; \ ++ } \ ++ } \ ++ while (0); ++ ++ switch (((size_t)un) % 4) ++ { ++ case 0: ++ BRANCH_FOR_MOD (0); ++ break; ++ case 1: ++ BRANCH_FOR_MOD (1); ++ break; ++ case 2: ++ BRANCH_FOR_MOD (2); ++ break; ++ case 3: ++ BRANCH_FOR_MOD (3); ++ break; ++ } ++} +-- +2.40.1 diff --git a/SOURCES/ibm_z13_simd_part4.patch b/SOURCES/ibm_z13_simd_part4.patch new file mode 100644 index 0000000..c87c17c --- /dev/null +++ b/SOURCES/ibm_z13_simd_part4.patch @@ -0,0 +1,151 @@ +From: Marius Hillenbrand + +--- + mpn/s390_64/z13/gmp-mparam.h | 129 ++++++++++++++++++++++++++++++++++- + 1 file changed, 127 insertions(+), 2 deletions(-) + +diff --git a/mpn/s390_64/z13/gmp-mparam.h b/mpn/s390_64/z13/gmp-mparam.h +index a17503fd0..50e7f39d1 100644 +--- a/mpn/s390_64/z13/gmp-mparam.h ++++ b/mpn/s390_64/z13/gmp-mparam.h +@@ -1,6 +1,6 @@ + /* S/390-64 for IBM z13 gmp-mparam.h -- Compiler/machine parameter header file. + +-Copyright 1991, 1993, 1994, 2000-2011 Free Software Foundation, Inc. ++Copyright 2021 Free Software Foundation, Inc. + + This file is part of the GNU MP Library. + +@@ -34,4 +34,129 @@ see https://www.gnu.org/licenses/. 
*/ + #define HAVE_NATIVE_mpn_addmul_2 1 + #define HAVE_NATIVE_mpn_mul_2 1 + +-#include "mpn/s390_64/gmp-mparam.h" ++/* Generated by tuneup.c, 2021-07-30, gcc 10.2 */ ++ ++#define DIVREM_1_NORM_THRESHOLD MP_SIZE_T_MAX /* never */ ++#define DIVREM_1_UNNORM_THRESHOLD MP_SIZE_T_MAX /* never */ ++#define MOD_1_1P_METHOD 2 ++#define MOD_1_NORM_THRESHOLD MP_SIZE_T_MAX /* never */ ++#define MOD_1_UNNORM_THRESHOLD MP_SIZE_T_MAX /* never */ ++#define MOD_1N_TO_MOD_1_1_THRESHOLD 17 ++#define MOD_1U_TO_MOD_1_1_THRESHOLD 15 ++#define MOD_1_1_TO_MOD_1_2_THRESHOLD 0 /* never mpn_mod_1_1p */ ++#define MOD_1_2_TO_MOD_1_4_THRESHOLD 0 /* never mpn_mod_1s_2p */ ++#define PREINV_MOD_1_TO_MOD_1_THRESHOLD 5 ++#define USE_PREINV_DIVREM_1 1 ++#define DIV_QR_1N_PI1_METHOD 3 ++#define DIV_QR_1_NORM_THRESHOLD MP_SIZE_T_MAX /* never */ ++#define DIV_QR_1_UNNORM_THRESHOLD MP_SIZE_T_MAX /* never */ ++#define DIV_QR_2_PI2_THRESHOLD 996 ++#define DIVEXACT_1_THRESHOLD 4 ++#define BMOD_1_TO_MOD_1_THRESHOLD 0 /* always */ ++ ++#define DIV_1_VS_MUL_1_PERCENT 404 ++ ++#define MUL_TOOM22_THRESHOLD 23 ++#define MUL_TOOM33_THRESHOLD 94 ++#define MUL_TOOM44_THRESHOLD 166 ++#define MUL_TOOM6H_THRESHOLD 286 ++#define MUL_TOOM8H_THRESHOLD 626 ++ ++#define MUL_TOOM32_TO_TOOM43_THRESHOLD 113 ++#define MUL_TOOM32_TO_TOOM53_THRESHOLD 138 ++#define MUL_TOOM42_TO_TOOM53_THRESHOLD 143 ++#define MUL_TOOM42_TO_TOOM63_THRESHOLD 145 ++#define MUL_TOOM43_TO_TOOM54_THRESHOLD 130 ++ ++#define SQR_BASECASE_THRESHOLD 0 /* always (native) */ ++#define SQR_TOOM2_THRESHOLD 12 ++#define SQR_TOOM3_THRESHOLD 84 ++#define SQR_TOOM4_THRESHOLD 234 ++#define SQR_TOOM6_THRESHOLD 318 ++#define SQR_TOOM8_THRESHOLD 478 ++ ++#define MULMID_TOOM42_THRESHOLD 42 ++ ++#define MULMOD_BNM1_THRESHOLD 13 ++#define SQRMOD_BNM1_THRESHOLD 7 ++ ++#define MUL_FFT_MODF_THRESHOLD 332 /* k = 5 */ ++#define MUL_FFT_TABLE3 \ ++ { { 332, 5}, { 19, 6}, { 10, 5}, { 21, 6}, \ ++ { 21, 7}, { 21, 8}, { 11, 7}, { 24, 8}, \ ++ { 13, 7}, { 27, 8}, { 15, 7}, { 31, 8}, \ ++ { 17, 7}, { 35, 8}, { 19, 7}, { 39, 8}, \ ++ { 21, 9}, { 11, 8}, { 27, 9}, { 15, 8}, \ ++ { 35, 9}, { 19, 8}, { 41, 9}, { 23, 8}, \ ++ { 47, 9}, { 27,10}, { 15, 9}, { 39,10}, \ ++ { 23, 9}, { 47,11}, { 15,10}, { 31, 9}, \ ++ { 67,10}, { 47,11}, { 2048,12}, { 4096,13}, \ ++ { 8192,14}, { 16384,15}, { 32768,16}, { 65536,17}, \ ++ { 131072,18}, { 262144,19}, { 524288,20}, {1048576,21}, \ ++ {2097152,22}, {4194304,23}, {8388608,24} } ++#define MUL_FFT_TABLE3_SIZE 47 ++#define MUL_FFT_THRESHOLD 2752 ++ ++#define SQR_FFT_MODF_THRESHOLD 240 /* k = 5 */ ++#define SQR_FFT_TABLE3 \ ++ { { 240, 5}, { 8, 4}, { 17, 5}, { 13, 6}, \ ++ { 7, 5}, { 15, 6}, { 8, 5}, { 17, 6}, \ ++ { 9, 5}, { 19, 6}, { 15, 7}, { 8, 6}, \ ++ { 17, 7}, { 9, 6}, { 19, 7}, { 10, 6}, \ ++ { 21, 7}, { 17, 8}, { 9, 7}, { 20, 8}, \ ++ { 11, 7}, { 23, 8}, { 13, 9}, { 7, 8}, \ ++ { 21, 9}, { 11, 8}, { 23, 9}, { 15, 8}, \ ++ { 31, 9}, { 19, 8}, { 39, 9}, { 23,10}, \ ++ { 15, 9}, { 39,10}, { 23,11}, { 15,10}, \ ++ { 31, 9}, { 63,10}, { 47,11}, { 2048,12}, \ ++ { 4096,13}, { 8192,14}, { 16384,15}, { 32768,16}, \ ++ { 65536,17}, { 131072,18}, { 262144,19}, { 524288,20}, \ ++ {1048576,21}, {2097152,22}, {4194304,23}, {8388608,24} } ++#define SQR_FFT_TABLE3_SIZE 52 ++#define SQR_FFT_THRESHOLD 1856 ++ ++#define MULLO_BASECASE_THRESHOLD 0 /* always */ ++#define MULLO_DC_THRESHOLD 25 ++#define MULLO_MUL_N_THRESHOLD 5397 ++#define SQRLO_BASECASE_THRESHOLD 0 /* always */ ++#define SQRLO_DC_THRESHOLD 396 ++#define SQRLO_SQR_THRESHOLD 3704 ++ ++#define DC_DIV_QR_THRESHOLD 15 
++#define DC_DIVAPPR_Q_THRESHOLD 50 ++#define DC_BDIV_QR_THRESHOLD 66 ++#define DC_BDIV_Q_THRESHOLD 202 ++ ++#define INV_MULMOD_BNM1_THRESHOLD 46 ++#define INV_NEWTON_THRESHOLD 29 ++#define INV_APPR_THRESHOLD 13 ++ ++#define BINV_NEWTON_THRESHOLD 312 ++#define REDC_1_TO_REDC_2_THRESHOLD 79 ++#define REDC_2_TO_REDC_N_THRESHOLD 0 /* always */ ++ ++#define MU_DIV_QR_THRESHOLD 979 ++#define MU_DIVAPPR_Q_THRESHOLD 979 ++#define MUPI_DIV_QR_THRESHOLD 13 ++#define MU_BDIV_QR_THRESHOLD 942 ++#define MU_BDIV_Q_THRESHOLD 1367 ++ ++#define POWM_SEC_TABLE 3,19,215,1730 ++ ++#define GET_STR_DC_THRESHOLD 10 ++#define GET_STR_PRECOMPUTE_THRESHOLD 15 ++#define SET_STR_DC_THRESHOLD 882 ++#define SET_STR_PRECOMPUTE_THRESHOLD 2520 ++ ++#define FAC_DSC_THRESHOLD 228 ++#define FAC_ODD_THRESHOLD 24 ++ ++#define MATRIX22_STRASSEN_THRESHOLD 19 ++#define HGCD2_DIV1_METHOD 1 ++#define HGCD_THRESHOLD 61 ++#define HGCD_APPR_THRESHOLD 51 ++#define HGCD_REDUCE_THRESHOLD 1962 ++#define GCD_DC_THRESHOLD 217 ++#define GCDEXT_DC_THRESHOLD 263 ++#define JACOBI_BASE_METHOD 4 ++ +-- +2.40.1 diff --git a/SPECS/gmp.spec b/SPECS/gmp.spec new file mode 100644 index 0000000..069f8d7 --- /dev/null +++ b/SPECS/gmp.spec @@ -0,0 +1,739 @@ +# +# Important for %%{ix86}: +# This rpm has to be build on a CPU with sse2 support like Pentium 4 ! +# + +Summary: A GNU arbitrary precision library +Name: gmp +Version: 6.2.0 +Release: 13%{?dist} +Epoch: 1 +URL: http://gmplib.org/ +Source0: ftp://ftp.gmplib.org/pub/gmp-%{version}/gmp-%{version}.tar.bz2 +# or ftp://ftp.gnu.org/pub/gnu/gmp/gmp-%{version}.tar.xz +Source2: gmp.h +Source3: gmp-mparam.h +Patch2: gmp-6.0.0-debuginfo.patch +Patch3: gmp-intel-cet.patch +Patch4: cve-2021-43618.patch +Patch5: ibm_z13_simd_part1.patch +Patch6: ibm_z13_simd_part2.patch +Patch7: ibm_z13_simd_part3.patch +Patch8: ibm_z13_simd_part4.patch +License: LGPLv3+ or GPLv2+ +BuildRequires: autoconf automake libtool +BuildRequires: gcc +BuildRequires: gcc-c++ +BuildRequires: git +#autoreconf on arm needs: +BuildRequires: perl-Carp +# Generate the .hmac checksum unless --without fips is used +%bcond_without fips +%if %{with fips} +BuildRequires: fipscheck +%endif +BuildRequires: make + +%description +The gmp package contains GNU MP, a library for arbitrary precision +arithmetic, signed integers operations, rational numbers and floating +point numbers. GNU MP is designed for speed, for both small and very +large operands. GNU MP is fast because it uses fullwords as the basic +arithmetic type, it uses fast algorithms, it carefully optimizes +assembly code for many CPUs' most common inner loops, and it generally +emphasizes speed over simplicity/elegance in its operations. + +Install the gmp package if you need a fast arbitrary precision +library. + +%package c++ +Summary: C++ bindings for the GNU MP arbitrary precision library +Requires: %{name}%{?_isa} = %{epoch}:%{version}-%{release} + +%description c++ +Bindings for using the GNU MP arbitrary precision library in C++ applications. + +%package devel +Summary: Development tools for the GNU MP arbitrary precision library +Requires: %{name}%{?_isa} = %{epoch}:%{version}-%{release} +Requires: %{name}-c++%{?_isa} = %{epoch}:%{version}-%{release} + +%description devel +The libraries, header files and documentation for using the GNU MP +arbitrary precision library in applications. + +If you want to develop applications which will use the GNU MP library, +you'll need to install the gmp-devel package. You'll also need to +install the gmp package. 
+ +%package static +Summary: Development tools for the GNU MP arbitrary precision library +Requires: %{name}-devel = %{epoch}:%{version}-%{release} + +%description static +The static libraries for using the GNU MP arbitrary precision library +in applications. + +%prep +%autosetup -S git + +# switch the defaults to new cpus on s390x +%ifarch s390x +( cd mpn/s390_64; ln -s z13 s390x ) +%endif + +%build +autoreconf -ifv +if as --help | grep -q execstack; then + # the object files do not require an executable stack + export CCAS="gcc -c -Wa,--noexecstack" +fi + +%ifarch %{ix86} + export CFLAGS=$(echo %{optflags} | sed -e "s/-mtune=[^ ]*//g" | sed -e "s/-march=[^ ]*/-march=i686/g") + export CXXFLAGS=$(echo %{optflags} | sed -e "s/-mtune=[^ ]*//g" | sed -e "s/-march=[^ ]*/-march=i686/g") +%endif + +%configure --enable-cxx --enable-fat + +sed -e 's|^hardcode_libdir_flag_spec=.*|hardcode_libdir_flag_spec=""|g' \ + -e 's|^runpath_var=LD_RUN_PATH|runpath_var=DIE_RPATH_DIE|g' \ + -e 's|-lstdc++ -lm|-lstdc++|' \ + -i libtool +export LD_LIBRARY_PATH=`pwd`/.libs +%make_build + +%if %{with fips} +%define __spec_install_post \ + %{?__debug_package:%{__debug_install_post}} \ + %{__arch_install_post} \ + %{__os_install_post} \ + fipshmac -d $RPM_BUILD_ROOT%{_libdir} $RPM_BUILD_ROOT%{_libdir}/libgmp.so.10.* \ + file=`basename $RPM_BUILD_ROOT%{_libdir}/libgmp.so.10.*.hmac` && \ + mkdir -p $RPM_BUILD_ROOT%{_libdir}/fipscheck && \ + mv $RPM_BUILD_ROOT%{_libdir}/$file $RPM_BUILD_ROOT%{_libdir}/fipscheck/$file && \ + ln -s $file $RPM_BUILD_ROOT%{_libdir}/fipscheck/libgmp.so.10.hmac && \ + cp $RPM_BUILD_ROOT%{_libdir}/fipscheck/$file $RPM_BUILD_ROOT%{_libdir}/.$file && \ + ln -s .$file $RPM_BUILD_ROOT%{_libdir}/.libgmp.so.10.hmac +%{nil} +%endif + +%install +export LD_LIBRARY_PATH=`pwd`/.libs +%make_install +install -m 644 gmp-mparam.h ${RPM_BUILD_ROOT}%{_includedir} +rm -f $RPM_BUILD_ROOT%{_libdir}/lib{gmp,mp,gmpxx}.la +rm -f $RPM_BUILD_ROOT%{_infodir}/dir +/sbin/ldconfig -n $RPM_BUILD_ROOT%{_libdir} +ln -sf libgmpxx.so.4 $RPM_BUILD_ROOT%{_libdir}/libgmpxx.so + +# Rename gmp.h to gmp-.h and gmp-mparam.h to gmp-mparam-.h to +# avoid file conflicts on multilib systems and install wrapper include files +# gmp.h and gmp-mparam-.h +basearch=%{_arch} +# always use i386 for iX86 +%ifarch %{ix86} +basearch=i386 +%endif +# always use arm for arm* +%ifarch %{arm} +basearch=arm +%endif +# superH architecture support +%ifarch sh3 sh4 +basearch=sh +%endif +# Rename files and install wrappers + +mv %{buildroot}/%{_includedir}/gmp.h %{buildroot}/%{_includedir}/gmp-${basearch}.h +install -m644 %{SOURCE2} %{buildroot}/%{_includedir}/gmp.h +mv %{buildroot}/%{_includedir}/gmp-mparam.h %{buildroot}/%{_includedir}/gmp-mparam-${basearch}.h +install -m644 %{SOURCE3} %{buildroot}/%{_includedir}/gmp-mparam.h + + +%check +%ifnarch ppc +export LD_LIBRARY_PATH=`pwd`/.libs +%make_build check +%endif + +%ldconfig_scriptlets + +%ldconfig_scriptlets c++ + +%files +%{!?_licensedir:%global license %%doc} +%license COPYING COPYING.LESSERv3 COPYINGv2 COPYINGv3 +%doc NEWS README +%{_libdir}/libgmp.so.* +%if %{with fips} +%{_libdir}/.libgmp.so.*.hmac +%{_libdir}/fipscheck/libgmp.so.*.hmac +%endif + +%files c++ +%{_libdir}/libgmpxx.so.* + +%files devel +%{_libdir}/libgmp.so +%{_libdir}/libgmpxx.so +%{_libdir}/pkgconfig/gmp.pc +%{_libdir}/pkgconfig/gmpxx.pc +%{_includedir}/*.h +%{_infodir}/gmp.info* + +%files static +%{_libdir}/libgmp.a +%{_libdir}/libgmpxx.a + +%changelog +* Tue Aug 03 2023 Jakub Martisko - 1:6.2.0-13 +- Fix: previous commit removed 
one function from the library and thus broke the ABI +- function gmpn_preinv_divrem_1 should now not be removed +Related: rhbz#2044216 + +* Tue Jul 18 2023 Jakub Martisko - 1:6.2.0-12 +- Add SIMD optimization patches for s390x (provided by the IBM) +Resolves: rhbz#2044216 + +* Tue Jun 06 2023 Jakub Martisko - 1:6.2.0-11 +Fix: Integer overflow and resultant buffer overflow via crafted input +Resolves: CVE-2021-43618 + +* Fri Aug 27 2021 Jakub Martisko - 1:6.2.0-10 +- Add the support for intel CET +Resolves: rhbz#1977890 + +* Wed Aug 18 2021 Jakub Martisko - 1:6.2.0-9 +- Move the .hmac files to the fipscheck subfolder +- Make symlinks from their original location (Fedora contains the .hmac files there) pointing to their new location +Resolves: rhbz#1980758 + +* Mon Aug 09 2021 Mohan Boddu - 1:6.2.0-8 +- Rebuilt for IMA sigs, glibc 2.34, aarch64 flags + Related: rhbz#1991688 + +* Thu Apr 15 2021 Mohan Boddu - 1:6.2.0-7 +- Rebuilt for RHEL 9 BETA on Apr 15th 2021. Related: rhbz#1947937 + +* Tue Jan 26 2021 Fedora Release Engineering - 1:6.2.0-6 +- Rebuilt for https://fedoraproject.org/wiki/Fedora_34_Mass_Rebuild + +* Tue Sep 15 2020 Kalev Lember - 1:6.2.0-5 +- Move gmpxx.pc to -devel subpackage as well + +* Fri Aug 07 2020 Peter Robinson - 1:6.2.0-4 +- The pkgcfg file should be in devel + +* Tue Jul 28 2020 Jakub Martisko - 1:6.2.0-3 +- Use make macros + +* Mon Jul 27 2020 Fedora Release Engineering - 1:6.2.0-2 +- Rebuilt for https://fedoraproject.org/wiki/Fedora_33_Mass_Rebuild + +* Mon Feb 17 2020 Jakub Martisko - 1:6.2.0-1 +- Rebase to 6.2.0 + +* Tue Jan 28 2020 Fedora Release Engineering - 1:6.1.2-13 +- Rebuilt for https://fedoraproject.org/wiki/Fedora_32_Mass_Rebuild + +* Tue Dec 03 2019 Jakub Martisko - 1:6.1.2-12 +- Reenable the fat binaries build option +Resolves: #1779060 + +* Thu Jul 25 2019 Fedora Release Engineering - 1:6.1.2-11 +- Rebuilt for https://fedoraproject.org/wiki/Fedora_31_Mass_Rebuild + +* Fri Feb 15 2019 Anderson Toshiyuki Sasaki - 1:6.1.2-10 +- Create HMAC checksum for FIPS integrity self tests + +* Thu Jan 31 2019 Fedora Release Engineering - 1:6.1.2-9 +- Rebuilt for https://fedoraproject.org/wiki/Fedora_30_Mass_Rebuild + +* Fri Jul 13 2018 Fedora Release Engineering - 1:6.1.2-8 +- Rebuilt for https://fedoraproject.org/wiki/Fedora_29_Mass_Rebuild + +* Wed Feb 07 2018 Fedora Release Engineering - 1:6.1.2-7 +- Rebuilt for https://fedoraproject.org/wiki/Fedora_28_Mass_Rebuild + +* Wed Aug 02 2017 Fedora Release Engineering - 1:6.1.2-6 +- Rebuilt for https://fedoraproject.org/wiki/Fedora_27_Binutils_Mass_Rebuild + +* Wed Jul 26 2017 Fedora Release Engineering - 1:6.1.2-5 +- Rebuilt for https://fedoraproject.org/wiki/Fedora_27_Mass_Rebuild + +* Mon Mar 13 2017 David Kaspar [Dee'Kej] - 1:6.1.2-4 +- Fix the build process for ix89 family + +* Fri Feb 17 2017 David Kaspar [Dee'Kej] - 1:6.1.2-3 +- Build process updated to correctly build .debug_info for i386 + and to correctly use hardening flags + +* Fri Feb 10 2017 Fedora Release Engineering - 1:6.1.2-2 +- Rebuilt for https://fedoraproject.org/wiki/Fedora_26_Mass_Rebuild + +* Tue Dec 20 2016 Frantisek Kluknavsky - 1:6.1.2-1 +- rebase + +* Wed Jun 22 2016 Frantisek Kluknavsky - 1:6.1.1-1 +- rebase + +* Fri Apr 08 2016 Yaakov Selkowitz - 1:6.1.0-3 +- Split c++ subpackage (#1325439) + +* Wed Feb 03 2016 Fedora Release Engineering - 1:6.1.0-2 +- Rebuilt for https://fedoraproject.org/wiki/Fedora_24_Mass_Rebuild + +* Wed Nov 25 2015 Frantisek Kluknavsky - 1:6.1.0-1 +- rebase to 6.1.0 +- gmp-6.0.0-ppc64.patch already upstream, 
dropped + +* Mon Sep 14 2015 Frantisek Kluknavsky - 1:6.0.0-13 +- do not package sse2 variant, use --enable-fat instead (a bit dangerous, some low level routines will be skipped in `make check`) + +* Fri Sep 04 2015 Michal Toman - 1:6.0.0-12 +- Add support for MIPS architecture to gmp.h and gmp-mparam.h + +* Wed Jun 17 2015 Fedora Release Engineering - 1:6.0.0-11 +- Rebuilt for https://fedoraproject.org/wiki/Fedora_23_Mass_Rebuild + +* Sat May 02 2015 Kalev Lember - 1:6.0.0-10 +- Rebuilt for GCC 5 C++11 ABI change + +* Thu Apr 02 2015 Frantisek Kluknavsky - 1:6.0.0-9 +- bug965318 - improve debuginfo of assembler sources + +* Thu Sep 04 2014 Dan Horák - 1:6.0.0-8 +- drop s390x patch, support is already in upstream + +* Sat Aug 16 2014 Fedora Release Engineering - 1:6.0.0-7 +- Rebuilt for https://fedoraproject.org/wiki/Fedora_21_22_Mass_Rebuild + +* Sat Jul 12 2014 Tom Callaway - 1:6.0.0-6 +- fix license handling + +* Thu Jul 10 2014 Brent Baude - 1:6.0.0-5 +- Fix gmp headers for ppc64le (#1083429) + +* Sat Jun 07 2014 Fedora Release Engineering - 1:6.0.0-4 +- Rebuilt for https://fedoraproject.org/wiki/Fedora_21_Mass_Rebuild + +* Thu Apr 24 2014 Karsten Hopp 6.0.0-3 +- set default for BMOD_1_TO_MOD_1_THRESHOLD on ppc64, patch by + Torbjorn Granlund: + https://gmplib.org/repo/gmp/rev/4a6d258b467f + +* Mon Apr 14 2014 Frantisek Kluknavsky - 1:6.0.0-2 +- rebase + +* Wed Nov 06 2013 Frantisek Kluknavsky - 1:5.1.3-2 +- support for aarch64 + +* Wed Nov 06 2013 Frantisek Kluknavsky - 1:5.1.3-1 +- rebase to 5.1.3 + +* Sat Aug 03 2013 Fedora Release Engineering - 1:5.1.2-2 +- Rebuilt for https://fedoraproject.org/wiki/Fedora_20_Mass_Rebuild + +* Thu May 30 2013 Frantisek Kluknavsky - 1:5.1.2-1 +- rebase to 5.1.2 + +* Thu Mar 28 2013 Frantisek Kluknavsky - 1:5.1.1-3 +- added build dependency needed to autoreconf on arm + +* Thu Feb 14 2013 Frantisek Kluknavsky - 1:5.1.1-2 +- rebase to 5.1.1 +- deleted unapplicable part of gmp-4.0.1-s390.patch + +* Fri Jan 25 2013 Frantisek Kluknavsky - 1:5.1.0-1 +- rebase to 5.1.0, de-ansi patch no longer applicable +- upstream dropped libmp.so (bsdmp-like interface) +- silenced bogus date in changelog + +* Tue Jan 22 2013 Peter Robinson 1:5.0.5-6 +- Rebuild against new binutils to fix FTBFS on ARM + +* Fri Nov 23 2012 Frantisek Kluknavsky - 1:5.0.5-5 +- minor spec cleanup + +* Fri Jul 20 2012 Peter Schiffer 1:5.0.5-3 +- fixed FTBFS + +* Thu Jul 19 2012 Fedora Release Engineering - 1:5.0.5-2 +- Rebuilt for https://fedoraproject.org/wiki/Fedora_18_Mass_Rebuild + +* Mon Jun 25 2012 Peter Schiffer 1:5.0.5-1 +- resolves: #820897 + update to 5.0.5 + +* Thu Apr 19 2012 Peter Schiffer 1:5.0.4-1 +- resolves: #785116 + update to 5.0.4 + +* Tue Feb 28 2012 Fedora Release Engineering - 1:5.0.2-6 +- Rebuilt for c++ ABI breakage + +* Thu Jan 19 2012 Peter Schiffer 1:5.0.2-5 +- fixed FTBFS with gcc 4.7 on 32bit arch + +* Fri Jan 13 2012 Fedora Release Engineering - 1:5.0.2-4 +- Rebuilt for https://fedoraproject.org/wiki/Fedora_17_Mass_Rebuild + +* Fri Oct 14 2011 Peter Schiffer 1:5.0.2-3 +- removed old compatibility library + +* Mon Sep 26 2011 Peter Schiffer 1:5.0.2-2 +- temporary build wild old compatibility library version + +* Tue Sep 20 2011 Peter Schiffer 1:5.0.2-1 +- resolves: #702919 + update to 5.0.2 +- resolves: #738091 + removed unused direct shlib dependency on libm + updated license in gmp.h and gmp-mparam.h files + +* Mon Jun 13 2011 Ivana Hutarova Varekova 1:4.3.2-4 +- Resolves: #706374 + fix sse2/libgmp.so.3.5.2 debuginfo data + +* Tue Feb 08 2011 Fedora Release 
Engineering - 1:4.3.2-3 +- Rebuilt for https://fedoraproject.org/wiki/Fedora_15_Mass_Rebuild + +* Wed Nov 24 2010 Ivana Hutarova Varekova 1:4.3.2-2 +- fix Requires tag + +* Wed Nov 24 2010 Ivana Hutarova Varekova 1:4.3.2-1 +- downgrade from 5.0.1 to 4.3.2 + +* Mon May 24 2010 Ivana Hutarova Varekova 5.0.1-1 +- update to 5.0.1 + +* Tue Mar 2 2010 Ivana Hutarova Varekova 4.3.1-7 +- fix the license tag + +* Fri Nov 27 2009 Ivana Hutarova Varekova 4.3.1-6 +- remove unnecessary dependences + remove duplicated documentation + +* Mon Aug 10 2009 Ivana Varekova 4.3.1-5 +- fix installation with --excludedocs option (#515947) + +* Fri Jul 24 2009 Fedora Release Engineering - 4.3.1-4 +- Rebuilt for https://fedoraproject.org/wiki/Fedora_12_Mass_Rebuild + +* Wed Jun 17 2009 Ivana Varekova 4.3.1-3 +- rebuild + +* Mon Jun 15 2009 Ivana Varekova 4.3.1-2 +- Resolves: #505592 + add RPM_OPT_FLAGS + +* Thu May 28 2009 Ivana Varekova 4.3.1-1 +- update to 4.3.1 +- remove configure macro (built problem) + +* Thu Apr 09 2009 Dennis Gilmore - 4.2.4-6 +- no check that --host and --target are the same when building i586 or sparcv9 they are not + +* Tue Feb 24 2009 Fedora Release Engineering - 4.2.4-5 +- Rebuilt for https://fedoraproject.org/wiki/Fedora_11_Mass_Rebuild + +* Tue Dec 23 2008 Ivana Varekova 4.2.4-4 +- fix spec file + +* Mon Dec 8 2008 Ivana Varekova 4.2.4-3 +- remove useless option (#475073) + +* Wed Dec 3 2008 Stepan Kasal 4.2.4-2 +- Run full autoreconf, add automake to BuildRequires. + +* Mon Nov 10 2008 Ivana Varekova 4.2.4-1 +- update to 4.2.4 + +* Fri Nov 7 2008 Ivana Varekova 4.2.2-9 +- remove useless patch (#470200) + +* Thu Apr 24 2008 Tom "spot" Callaway 4.2.2-8 +- add sparc/sparc64 support + +* Wed Mar 19 2008 Ivana Varekova 4.2.2-7 +- add superH support (#437688) + +* Wed Feb 13 2008 Ivana varekova 4.2.2-6 +- fix gcc-4.3 problem - add (#432336) + +* Fri Feb 8 2008 Ivana Varekova 4.2.2-5 +- split the devel subpackage to devel and static parts + +* Thu Feb 7 2008 Ivana Varekova 4.2.2-4 +- change license tag + +* Mon Sep 24 2007 Ivana Varekova 4.2.2-3 +- fix libgmpxx.so link + +* Thu Sep 20 2007 Ivana Varekova 4.2.2-2 +- fix check tag + +* Wed Sep 19 2007 Ivana Varekova 4.2.2-1 +- update to 4.2.2 + +* Mon Aug 20 2007 Ivana Varekova 4.2.1-3 +- spec file cleanup (#253439) + +* Tue Aug 7 2007 Ivana Varekova 4.2.1-2 +- add arm support (#245456) + thanks to Lennert Buytenhek + +* Mon Aug 6 2007 Ivana Varekova 4.2.1-1 +- update to 4.2.1 +- do some spec cleanups +- fix 238794 - gmp-devel depends on {version} but not on + {version}-{release} +- remove mpfr (moved to separate package) + +* Thu Jul 05 2007 Florian La Roche 4.1.4-13 +- don't fail scripts to e.g. allow excludedocs installs + +* Tue Apr 24 2007 Karsten Hopp 4.1.4-12.3 +- fix library permissions + +* Wed Mar 14 2007 Karsten Hopp 4.1.4-12.2 +- fix typo + +* Wed Mar 14 2007 Thomas Woerner 4.1.4-12.1 +- added alpha support for gmp.h and gmp-mparam.h wrappers + +* Fri Feb 23 2007 Karsten Hopp 4.1.4-12 +- remove trailing dot from summary +- fix buildroot +- fix post/postun/... requirements +- use make install DESTDIR=... 
+- replace tabs with spaces +- convert changelog to utf-8 + +* Wed Jan 17 2007 Jakub Jelinek 4.1.4-11 +- make sure libmpfr.a doesn't contain SSE2 instructions on i?86 (#222371) +- rebase to mpfr 2.2.1 from 2.2.0 + cumulative fixes + +* Thu Nov 2 2006 Thomas Woerner 4.1.4-10 +- fixed arch order in gmp.h and gmp-mparam.h wrapper for all architectures + +* Thu Nov 2 2006 Joe Orton 4.1.4-10 +- include ppc64 header on ppc64 not ppc header + +* Fri Oct 27 2006 Thomas Woerner - 4.1.4-9 +- fixed multilib devel conflicts for gmp (#212286) + +* Thu Oct 26 2006 Jakub Jelinek - 4.1.4-8 +- upgrade mpfr to 2.2.0 (#211971) +- apply mpfr 2.2.0 cumulative patch + +* Fri Jul 14 2006 Thomas Woerner - 4.1.4-7 +- release bump + +* Fri Feb 10 2006 Jesse Keating - 4.1.4-6.2.1 +- bump again for double-long bug on ppc(64) + +* Tue Feb 07 2006 Jesse Keating - 4.1.4-6.2 +- rebuilt for new gcc4.1 snapshot and glibc changes + +* Fri Dec 09 2005 Jesse Keating +- rebuilt + +* Mon Apr 18 2005 Thomas Woerner 4.1.4-6 +- fixed __setfpucw call in mpfr-test.h + +* Wed Mar 02 2005 Karsten Hopp 4.1.4-5 +- build with gcc-4 + +* Wed Feb 09 2005 Karsten Hopp 4.1.4-4 +- rebuilt + +* Sun Sep 26 2004 Florian La Roche +- 4.1.4 +- disable ppc64 patch, now fixed upstream + +* Tue Jun 15 2004 Elliot Lee +- rebuilt + +* Mon May 24 2004 Thomas Woerner 4.1.3-1 +- new version 4.1.3 + +* Wed Mar 31 2004 Thomas Woerner 4.1.2-14 +- dropped RPATH (#118506) + +* Sat Mar 06 2004 Florian La Roche +- also build SSE2 DSOs, patch from Ulrich Drepper + +* Tue Mar 02 2004 Elliot Lee +- rebuilt + +* Fri Feb 13 2004 Elliot Lee +- rebuilt + +* Thu Jan 29 2004 Thomas Woerner 4.1.2-11 +- BuildRequires for automake16 + +* Mon Dec 01 2003 Florian La Roche +- fix symlink to libgmpxx.so.3 #111135 +- add patch to factorize.c from gmp homepage + +* Thu Oct 23 2003 Joe Orton 4.1.2-9 +- build with -Wa,--noexecstack + +* Thu Oct 23 2003 Joe Orton 4.1.2-8 +- build assembly code with -Wa,--execstack +- use parallel make +- run tests, and fix C++ therein + +* Thu Oct 02 2003 Florian La Roche +- enable mpfr #104395 +- enable cxx #80195 +- add COPYING.LIB +- add fixes from gmp web-site +- remove some cruft patches for older libtool releases + +* Wed Jun 04 2003 Elliot Lee +- rebuilt + +* Tue Jun 03 2003 Florian La Roche +- make configure.in work with newer autoconf + +* Sun Jun 01 2003 Florian La Roche +- do not set extra_functions for s390x #92001 + +* Thu Feb 13 2003 Elliot Lee 4.1.2-3 +- Add ppc64 patch, accompanied by running auto* + +* Wed Jan 22 2003 Tim Powers +- rebuilt + +* Wed Jan 01 2003 Florian La Roche +- update to 4.1.2 + +* Tue Dec 03 2002 Florian La Roche +- update to 4.1.1 +- remove un-necessary patches +- adjust s390/x86_64 patch + +* Sun Oct 06 2002 Florian La Roche +- add s390x patch +- disable current x86-64 support in longlong.h + +* Mon Jul 8 2002 Trond Eivind Glomsrød 4.1-4 +- Add 4 patches, among them one for #67918 +- Update URL +- s/Copyright/License/ + +* Mon Jul 8 2002 Trond Eivind Glomsrød 4.1-3 +- Redefine the configure macro, the included configure + script isn't happy about the rpm default one (#68190). Also, make + sure the included libtool isn't replaced, + +* Fri Jun 21 2002 Tim Powers +- automated rebuild + +* Sat May 25 2002 Florian La Roche +- update to version 4.1 +- patch s390 gmp-mparam.h to match other archs. 
+ +* Thu May 23 2002 Tim Powers +- automated rebuild + +* Mon Mar 11 2002 Trond Eivind Glomsrød 4.0.1-3 +- Use standard %%configure macro and edit %%{_tmppath} + +* Tue Feb 26 2002 Trond Eivind Glomsrød 4.0.1-2 +- Rebuild + +* Tue Jan 22 2002 Florian La Roche +- update to 4.0.1 +- bzip2 src + +* Wed Jan 09 2002 Tim Powers +- automated rebuild + +* Sun Jun 24 2001 Elliot Lee +- Bump release + rebuild. + +* Mon Feb 05 2001 Philipp Knirsch +- Fixed bugzilla bug #25515 where GMP wouldn't work on IA64 as IA64 is not +correctly identified as a 64 bit platform. + +* Mon Dec 18 2000 Preston Brown +- include bsd mp library + +* Tue Oct 17 2000 Florian La Roche +- update to 3.1.1 + +* Sun Sep 3 2000 Florian La Roche +- update to 3.1 + +* Sat Aug 19 2000 Preston Brown +- devel subpackage depends on main package so that .so symlink is OK. + +* Thu Jul 13 2000 Prospector +- automatic rebuild + +* Sat Jun 3 2000 Nalin Dahyabhai +- switch to the configure and makeinstall macros +- FHS-compliance fixing +- move docs to non-devel package + +* Fri Apr 28 2000 Bill Nottingham +- libtoolize for ia64 + +* Fri Apr 28 2000 Florian La Roche +- update to 3.0.1 + +* Thu Apr 27 2000 Jakub Jelinek +- sparc64 fixes for 3.0 + +* Wed Apr 26 2000 Florian La Roche +- update to 3.0 + +* Mon Feb 14 2000 Matt Wilson +- #include in files that use string functions + +* Wed Feb 02 2000 Cristian Gafton +- fix description and summary + +* Mon Dec 06 1999 Michael K. Johnson +- s/GPL/LGPL/ +- build as non-root (#7604) + +* Mon Sep 06 1999 Jakub Jelinek +- merge in some debian gmp fixes +- Ulrich Drepper's __gmp_scale2 fix +- my mpf_set_q fix +- sparc64 fixes + +* Wed Apr 28 1999 Cristian Gafton +- add sparc patch for PIC handling + +* Sun Mar 21 1999 Cristian Gafton +- auto rebuild in the new build environment (release 8) + +* Thu Feb 11 1999 Michael Johnson +- include the private header file gmp-mparam.h because several + apps seem to assume that they are building against the gmp + source tree and require it. Sigh. + +* Tue Jan 12 1999 Michael K. Johnson +- libtoolize to work on arm + +* Thu Sep 10 1998 Cristian Gafton +- yet another touch of the spec file + +* Wed Sep 2 1998 Michael Fulbright +- looked over before inclusion in RH 5.2 + +* Sun May 24 1998 Dick Porter +- Patch Makefile.in, not Makefile +- Don't specify i586, let configure decide the arch + +* Sat Jan 24 1998 Marc Ewing +- started with package from Toshio Kuratomi +- cleaned up file list +- fixed up install-info support +