From 168a13dcac503b8a673f16922b1dcbda79a4f0a9 Mon Sep 17 00:00:00 2001 From: MSVSphere Packaging Team Date: Fri, 29 Mar 2024 15:40:45 +0300 Subject: [PATCH] import gmp-6.1.2-12.el8 --- .gitignore | 1 + .gmp.metadata | 1 + SOURCES/cve-2021-43618.patch | 25 + SOURCES/gmp-6.0.0-debuginfo.patch | 21 + SOURCES/gmp-fcf-protection.patch | 1985 +++++++++++++++++++++++++++++ SOURCES/gmp-mparam.h | 88 ++ SOURCES/gmp.h | 88 ++ SOURCES/ibm_z13_simd_part1.patch | 596 +++++++++ SOURCES/ibm_z13_simd_part2.patch | 536 ++++++++ SOURCES/ibm_z13_simd_part3.patch | 139 ++ SOURCES/ibm_z13_simd_part4.patch | 151 +++ SPECS/gmp.spec | 712 +++++++++++ 12 files changed, 4343 insertions(+) create mode 100644 .gitignore create mode 100644 .gmp.metadata create mode 100644 SOURCES/cve-2021-43618.patch create mode 100644 SOURCES/gmp-6.0.0-debuginfo.patch create mode 100644 SOURCES/gmp-fcf-protection.patch create mode 100644 SOURCES/gmp-mparam.h create mode 100644 SOURCES/gmp.h create mode 100644 SOURCES/ibm_z13_simd_part1.patch create mode 100644 SOURCES/ibm_z13_simd_part2.patch create mode 100644 SOURCES/ibm_z13_simd_part3.patch create mode 100644 SOURCES/ibm_z13_simd_part4.patch create mode 100644 SPECS/gmp.spec diff --git a/.gitignore b/.gitignore new file mode 100644 index 0000000..30416dc --- /dev/null +++ b/.gitignore @@ -0,0 +1 @@ +SOURCES/gmp-6.1.2.tar.bz2 diff --git a/.gmp.metadata b/.gmp.metadata new file mode 100644 index 0000000..7f96990 --- /dev/null +++ b/.gmp.metadata @@ -0,0 +1 @@ +366ded6a44cd108ba6b3f5b9a252eab3f3a95cdf SOURCES/gmp-6.1.2.tar.bz2 diff --git a/SOURCES/cve-2021-43618.patch b/SOURCES/cve-2021-43618.patch new file mode 100644 index 0000000..f741972 --- /dev/null +++ b/SOURCES/cve-2021-43618.patch @@ -0,0 +1,25 @@ + +# HG changeset patch +# User Marco Bodrato +# Date 1634836009 -7200 +# Node ID 561a9c25298e17bb01896801ff353546c6923dbd +# Parent e1fd9db13b475209a864577237ea4b9105b3e96e +mpz/inp_raw.c: Avoid bit size overflows + +diff -r e1fd9db13b47 -r 561a9c25298e mpz/inp_raw.c +--- a/mpz/inp_raw.c Tue Dec 22 23:49:51 2020 +0100 ++++ b/mpz/inp_raw.c Thu Oct 21 19:06:49 2021 +0200 +@@ -88,8 +88,11 @@ + + abs_csize = ABS (csize); + ++ if (UNLIKELY (abs_csize > ~(mp_bitcnt_t) 0 / 8)) ++ return 0; /* Bit size overflows */ ++ + /* round up to a multiple of limbs */ +- abs_xsize = BITS_TO_LIMBS (abs_csize*8); ++ abs_xsize = BITS_TO_LIMBS ((mp_bitcnt_t) abs_csize * 8); + + if (abs_xsize != 0) + { + diff --git a/SOURCES/gmp-6.0.0-debuginfo.patch b/SOURCES/gmp-6.0.0-debuginfo.patch new file mode 100644 index 0000000..bb72839 --- /dev/null +++ b/SOURCES/gmp-6.0.0-debuginfo.patch @@ -0,0 +1,21 @@ +diff -up wrk/mpn/m4-ccas.wrk wrk/mpn/m4-ccas +--- wrk/mpn/m4-ccas.wrk 2015-04-02 16:44:03.645305407 +0200 ++++ wrk/mpn/m4-ccas 2015-04-02 16:21:57.893870969 +0200 +@@ -104,4 +104,4 @@ echo "$CC" + $CC || exit + + # Comment this out to preserve .s intermediates +-rm -f $TMP ++#rm -f $TMP +diff -up wrk/mpn/Makeasm.am.wrk wrk/mpn/Makeasm.am +--- wrk/mpn/Makeasm.am.wrk 2015-04-02 16:42:41.692278742 +0200 ++++ wrk/mpn/Makeasm.am 2015-04-02 16:21:57.891870945 +0200 +@@ -66,7 +66,7 @@ SUFFIXES = .s .S .asm + + + # can be overridden during development, eg. "make RM_TMP=: mul_1.lo" +-RM_TMP = rm -f ++RM_TMP = true + + + # .S assembler, preprocessed with cpp. 
diff --git a/SOURCES/gmp-fcf-protection.patch b/SOURCES/gmp-fcf-protection.patch new file mode 100644 index 0000000..731cc8f --- /dev/null +++ b/SOURCES/gmp-fcf-protection.patch @@ -0,0 +1,1985 @@ +diff --git a/mpn/x86_64/addaddmul_1msb0.asm b/mpn/x86_64/addaddmul_1msb0.asm +index 87c21b4..2bfa122 100644 +--- a/mpn/x86_64/addaddmul_1msb0.asm ++++ b/mpn/x86_64/addaddmul_1msb0.asm +@@ -168,3 +168,4 @@ L(end): cmp $1, R32(n) + pop %r12 + ret + EPILOGUE() ++CF_PROT +diff --git a/mpn/x86_64/addmul_2.asm b/mpn/x86_64/addmul_2.asm +index 18307d7..2999ce5 100644 +--- a/mpn/x86_64/addmul_2.asm ++++ b/mpn/x86_64/addmul_2.asm +@@ -182,3 +182,4 @@ L(end): xor R32(w1), R32(w1) + FUNC_EXIT() + ret + EPILOGUE() ++CF_PROT +diff --git a/mpn/x86_64/aorrlsh1_n.asm b/mpn/x86_64/aorrlsh1_n.asm +index 6ee0872..9ebd7dc 100644 +--- a/mpn/x86_64/aorrlsh1_n.asm ++++ b/mpn/x86_64/aorrlsh1_n.asm +@@ -168,3 +168,4 @@ ifdef(`OPERATION_rsblsh1_n',` + FUNC_EXIT() + ret + EPILOGUE() ++CF_PROT +diff --git a/mpn/x86_64/aorrlshC_n.asm b/mpn/x86_64/aorrlshC_n.asm +index 5a9fd4d..c3d55a6 100644 +--- a/mpn/x86_64/aorrlshC_n.asm ++++ b/mpn/x86_64/aorrlshC_n.asm +@@ -158,3 +158,4 @@ ifelse(ADDSUB,add,` + FUNC_EXIT() + ret + EPILOGUE() ++CF_PROT +diff --git a/mpn/x86_64/aorrlsh_n.asm b/mpn/x86_64/aorrlsh_n.asm +index 5ca128f..7dd0bcf 100644 +--- a/mpn/x86_64/aorrlsh_n.asm ++++ b/mpn/x86_64/aorrlsh_n.asm +@@ -174,3 +174,4 @@ L(end): add R32(%rbx), R32(%rbx) + FUNC_EXIT() + ret + EPILOGUE() ++CF_PROT +diff --git a/mpn/x86_64/aors_err1_n.asm b/mpn/x86_64/aors_err1_n.asm +index 54d0b3f..13a6af2 100644 +--- a/mpn/x86_64/aors_err1_n.asm ++++ b/mpn/x86_64/aors_err1_n.asm +@@ -223,3 +223,4 @@ L(end): + pop %rbx + ret + EPILOGUE() ++CF_PROT +diff --git a/mpn/x86_64/aors_err2_n.asm b/mpn/x86_64/aors_err2_n.asm +index ce5c2a4..0466f06 100644 +--- a/mpn/x86_64/aors_err2_n.asm ++++ b/mpn/x86_64/aors_err2_n.asm +@@ -170,3 +170,4 @@ L(end): + pop %rbx + ret + EPILOGUE() ++CF_PROT +diff --git a/mpn/x86_64/aors_err3_n.asm b/mpn/x86_64/aors_err3_n.asm +index bb6d0c5..cc5461f 100644 +--- a/mpn/x86_64/aors_err3_n.asm ++++ b/mpn/x86_64/aors_err3_n.asm +@@ -154,3 +154,4 @@ L(end): + pop %rbx + ret + EPILOGUE() ++CF_PROT +diff --git a/mpn/x86_64/aors_n.asm b/mpn/x86_64/aors_n.asm +index 8941f7a..361e04d 100644 +--- a/mpn/x86_64/aors_n.asm ++++ b/mpn/x86_64/aors_n.asm +@@ -167,3 +167,4 @@ L(end): lea 32(up), up + FUNC_EXIT() + ret + EPILOGUE() ++CF_PROT +diff --git a/mpn/x86_64/aorsmul_1.asm b/mpn/x86_64/aorsmul_1.asm +index e3fc005..25d0c13 100644 +--- a/mpn/x86_64/aorsmul_1.asm ++++ b/mpn/x86_64/aorsmul_1.asm +@@ -178,3 +178,4 @@ IFDOS(``pop %rdi '') + IFDOS(``pop %rsi '') + ret + EPILOGUE() ++CF_PROT +diff --git a/mpn/x86_64/atom/addmul_2.asm b/mpn/x86_64/atom/addmul_2.asm +index c1dcdc4..07ae7b8 100644 +--- a/mpn/x86_64/atom/addmul_2.asm ++++ b/mpn/x86_64/atom/addmul_2.asm +@@ -184,3 +184,4 @@ L(end): mul v1 + FUNC_EXIT() + ret + EPILOGUE() ++CF_PROT +diff --git a/mpn/x86_64/atom/aorrlsh1_n.asm b/mpn/x86_64/atom/aorrlsh1_n.asm +index f44de19..f9d7bac 100644 +--- a/mpn/x86_64/atom/aorrlsh1_n.asm ++++ b/mpn/x86_64/atom/aorrlsh1_n.asm +@@ -236,3 +236,4 @@ IFDOS(` mov 56(%rsp), %r8 ') + sbb R32(%rbp), R32(%rbp) C save acy + jmp L(ent) + EPILOGUE() ++CF_PROT +diff --git a/mpn/x86_64/atom/aorrlsh2_n.asm b/mpn/x86_64/atom/aorrlsh2_n.asm +index 02fb29d..5ea55b4 100644 +--- a/mpn/x86_64/atom/aorrlsh2_n.asm ++++ b/mpn/x86_64/atom/aorrlsh2_n.asm +@@ -189,3 +189,4 @@ ifdef(`OPERATION_rsblsh2_n',` + FUNC_EXIT() + ret + EPILOGUE() ++CF_PROT +diff --git 
a/mpn/x86_64/atom/aorsmul_1.asm b/mpn/x86_64/atom/aorsmul_1.asm +index e953153..6a12f96 100644 +--- a/mpn/x86_64/atom/aorsmul_1.asm ++++ b/mpn/x86_64/atom/aorsmul_1.asm +@@ -188,3 +188,4 @@ L(cj1): ADDSUB %rax, (rp,n,8) + ret + EPILOGUE() + ASM_END() ++CF_PROT +diff --git a/mpn/x86_64/atom/lshift.asm b/mpn/x86_64/atom/lshift.asm +index 1b37d5d..15786cb 100644 +--- a/mpn/x86_64/atom/lshift.asm ++++ b/mpn/x86_64/atom/lshift.asm +@@ -121,3 +121,4 @@ L(end): shl R8(%rcx), %r10 + FUNC_EXIT() + ret + EPILOGUE() ++CF_PROT +diff --git a/mpn/x86_64/atom/lshiftc.asm b/mpn/x86_64/atom/lshiftc.asm +index 7385f8f..3171d3c 100644 +--- a/mpn/x86_64/atom/lshiftc.asm ++++ b/mpn/x86_64/atom/lshiftc.asm +@@ -125,3 +125,4 @@ L(end): shl R8(%rcx), %r10 + FUNC_EXIT() + ret + EPILOGUE() ++CF_PROT +diff --git a/mpn/x86_64/atom/mul_1.asm b/mpn/x86_64/atom/mul_1.asm +index d76a3d3..304c208 100644 +--- a/mpn/x86_64/atom/mul_1.asm ++++ b/mpn/x86_64/atom/mul_1.asm +@@ -141,3 +141,4 @@ IFDOS(` mov 56(%rsp), %r8 ') + jmp L(com) + EPILOGUE() + ASM_END() ++CF_PROT +diff --git a/mpn/x86_64/atom/mul_2.asm b/mpn/x86_64/atom/mul_2.asm +index f3fc3af..c7b78a7 100644 +--- a/mpn/x86_64/atom/mul_2.asm ++++ b/mpn/x86_64/atom/mul_2.asm +@@ -184,3 +184,4 @@ L(end): mul v1 + FUNC_EXIT() + ret + EPILOGUE() ++CF_PROT +diff --git a/mpn/x86_64/atom/redc_1.asm b/mpn/x86_64/atom/redc_1.asm +index 62b9a84..eeb09d3 100644 +--- a/mpn/x86_64/atom/redc_1.asm ++++ b/mpn/x86_64/atom/redc_1.asm +@@ -577,3 +577,4 @@ L(n4): mov -32(mp), %rax + jmp L(cj) + EPILOGUE() + ASM_END() ++CF_PROT +diff --git a/mpn/x86_64/atom/rsh1aors_n.asm b/mpn/x86_64/atom/rsh1aors_n.asm +index 6f5f638..a589b89 100644 +--- a/mpn/x86_64/atom/rsh1aors_n.asm ++++ b/mpn/x86_64/atom/rsh1aors_n.asm +@@ -285,3 +285,4 @@ L(cj1): pop %r15 + FUNC_EXIT() + ret + EPILOGUE() ++CF_PROT +diff --git a/mpn/x86_64/atom/rshift.asm b/mpn/x86_64/atom/rshift.asm +index 29c027d..c8b78bf 100644 +--- a/mpn/x86_64/atom/rshift.asm ++++ b/mpn/x86_64/atom/rshift.asm +@@ -119,3 +119,4 @@ L(end): shr R8(cnt), %r10 + FUNC_EXIT() + ret + EPILOGUE() ++CF_PROT +diff --git a/mpn/x86_64/atom/sublsh1_n.asm b/mpn/x86_64/atom/sublsh1_n.asm +index 1306acd..574b25b 100644 +--- a/mpn/x86_64/atom/sublsh1_n.asm ++++ b/mpn/x86_64/atom/sublsh1_n.asm +@@ -240,3 +240,4 @@ IFDOS(` mov 56(%rsp), %r8 ') + sbb R32(%rbp), R32(%rbp) C save acy + jmp L(ent) + EPILOGUE() ++CF_PROT +diff --git a/mpn/x86_64/bd1/aorsmul_1.asm b/mpn/x86_64/bd1/aorsmul_1.asm +index 96fec9f..ce76154 100644 +--- a/mpn/x86_64/bd1/aorsmul_1.asm ++++ b/mpn/x86_64/bd1/aorsmul_1.asm +@@ -179,3 +179,4 @@ IFDOS(``pop %rsi '') + ret + EPILOGUE() + ASM_END() ++CF_PROT +diff --git a/mpn/x86_64/bd1/mul_1.asm b/mpn/x86_64/bd1/mul_1.asm +index e59667c..308f336 100644 +--- a/mpn/x86_64/bd1/mul_1.asm ++++ b/mpn/x86_64/bd1/mul_1.asm +@@ -182,3 +182,4 @@ IFDOS(``pop %rsi '') + ret + EPILOGUE() + ASM_END() ++CF_PROT +diff --git a/mpn/x86_64/bd1/mul_2.asm b/mpn/x86_64/bd1/mul_2.asm +index 4ed5f30..f40cf47 100644 +--- a/mpn/x86_64/bd1/mul_2.asm ++++ b/mpn/x86_64/bd1/mul_2.asm +@@ -190,3 +190,4 @@ L(end): mov -8(up,n,8), %rax + FUNC_EXIT() + ret + EPILOGUE() ++CF_PROT +diff --git a/mpn/x86_64/bd1/mul_basecase.asm b/mpn/x86_64/bd1/mul_basecase.asm +index e47ba58..6d61cbc 100644 +--- a/mpn/x86_64/bd1/mul_basecase.asm ++++ b/mpn/x86_64/bd1/mul_basecase.asm +@@ -414,3 +414,4 @@ L(ret2):pop %rbp + FUNC_EXIT() + ret + EPILOGUE() ++CF_PROT +diff --git a/mpn/x86_64/bdiv_dbm1c.asm b/mpn/x86_64/bdiv_dbm1c.asm +index a53bd52..f9c4aa0 100644 +--- a/mpn/x86_64/bdiv_dbm1c.asm ++++ 
b/mpn/x86_64/bdiv_dbm1c.asm +@@ -104,3 +104,4 @@ L(lo1): sub %rax, %r8 + FUNC_EXIT() + ret + EPILOGUE() ++CF_PROT +diff --git a/mpn/x86_64/bdiv_q_1.asm b/mpn/x86_64/bdiv_q_1.asm +index 02eacbe..7bfa66d 100644 +--- a/mpn/x86_64/bdiv_q_1.asm ++++ b/mpn/x86_64/bdiv_q_1.asm +@@ -165,3 +165,4 @@ L(one): shr R8(%rcx), %rax + FUNC_EXIT() + ret + EPILOGUE() ++CF_PROT +diff --git a/mpn/x86_64/bobcat/aors_n.asm b/mpn/x86_64/bobcat/aors_n.asm +index 22287b8..1df1a08 100644 +--- a/mpn/x86_64/bobcat/aors_n.asm ++++ b/mpn/x86_64/bobcat/aors_n.asm +@@ -148,3 +148,4 @@ PROLOGUE(func_nc) + IFDOS(` mov 56(%rsp), %r8 ') + jmp L(ent) + EPILOGUE() ++CF_PROT +diff --git a/mpn/x86_64/bobcat/aorsmul_1.asm b/mpn/x86_64/bobcat/aorsmul_1.asm +index 415a17c..79d81f4 100644 +--- a/mpn/x86_64/bobcat/aorsmul_1.asm ++++ b/mpn/x86_64/bobcat/aorsmul_1.asm +@@ -181,3 +181,4 @@ IFDOS(` pop %rdi ') + IFDOS(` pop %rsi ') + ret + EPILOGUE() ++CF_PROT +diff --git a/mpn/x86_64/bobcat/copyd.asm b/mpn/x86_64/bobcat/copyd.asm +index 877714e..2f781a3 100644 +--- a/mpn/x86_64/bobcat/copyd.asm ++++ b/mpn/x86_64/bobcat/copyd.asm +@@ -89,3 +89,4 @@ L(end): cmp $-4, R32(n) + L(ret): FUNC_EXIT() + ret + EPILOGUE() ++CF_PROT +diff --git a/mpn/x86_64/bobcat/copyi.asm b/mpn/x86_64/bobcat/copyi.asm +index ee0f578..ff249bc 100644 +--- a/mpn/x86_64/bobcat/copyi.asm ++++ b/mpn/x86_64/bobcat/copyi.asm +@@ -92,3 +92,4 @@ L(end): cmp $4, R32(n) + L(ret): FUNC_EXIT() + ret + EPILOGUE() ++CF_PROT +diff --git a/mpn/x86_64/bobcat/mul_1.asm b/mpn/x86_64/bobcat/mul_1.asm +index ab428a8..b4f401b 100644 +--- a/mpn/x86_64/bobcat/mul_1.asm ++++ b/mpn/x86_64/bobcat/mul_1.asm +@@ -185,3 +185,4 @@ IFDOS(` pop %rdi ') + IFDOS(` pop %rsi ') + ret + EPILOGUE() ++CF_PROT +diff --git a/mpn/x86_64/bobcat/mul_basecase.asm b/mpn/x86_64/bobcat/mul_basecase.asm +index e7d46bf..14c7b13 100644 +--- a/mpn/x86_64/bobcat/mul_basecase.asm ++++ b/mpn/x86_64/bobcat/mul_basecase.asm +@@ -484,3 +484,4 @@ L(ret): pop %r13 + FUNC_EXIT() + ret + EPILOGUE() ++CF_PROT +diff --git a/mpn/x86_64/bobcat/redc_1.asm b/mpn/x86_64/bobcat/redc_1.asm +index d55b1e5..d686cfb 100644 +--- a/mpn/x86_64/bobcat/redc_1.asm ++++ b/mpn/x86_64/bobcat/redc_1.asm +@@ -505,3 +505,4 @@ L(n3): mov -24(mp), %rax + jmp L(ret) + EPILOGUE() + ASM_END() ++CF_PROT +diff --git a/mpn/x86_64/bobcat/sqr_basecase.asm b/mpn/x86_64/bobcat/sqr_basecase.asm +index 0e417a1..5693c46 100644 +--- a/mpn/x86_64/bobcat/sqr_basecase.asm ++++ b/mpn/x86_64/bobcat/sqr_basecase.asm +@@ -563,3 +563,4 @@ L(esd): add %rbx, w0 + FUNC_EXIT() + ret + EPILOGUE() ++CF_PROT +diff --git a/mpn/x86_64/cnd_aors_n.asm b/mpn/x86_64/cnd_aors_n.asm +index 13a2ab3..35f30e7 100644 +--- a/mpn/x86_64/cnd_aors_n.asm ++++ b/mpn/x86_64/cnd_aors_n.asm +@@ -181,3 +181,4 @@ L(end): neg R32(%rax) + FUNC_EXIT() + ret + EPILOGUE() ++CF_PROT +diff --git a/mpn/x86_64/com.asm b/mpn/x86_64/com.asm +index 006acaf..56b0747 100644 +--- a/mpn/x86_64/com.asm ++++ b/mpn/x86_64/com.asm +@@ -93,3 +93,4 @@ L(e10): movq 24(up,n,8), %r9 + L(ret): FUNC_EXIT() + ret + EPILOGUE() ++CF_PROT +diff --git a/mpn/x86_64/copyd.asm b/mpn/x86_64/copyd.asm +index a5e6e59..020e287 100644 +--- a/mpn/x86_64/copyd.asm ++++ b/mpn/x86_64/copyd.asm +@@ -91,3 +91,4 @@ L(end): shr R32(n) + mov %r9, -16(rp) + 1: ret + EPILOGUE() ++CF_PROT +diff --git a/mpn/x86_64/copyi.asm b/mpn/x86_64/copyi.asm +index bafce7a..1a4fb6d 100644 +--- a/mpn/x86_64/copyi.asm ++++ b/mpn/x86_64/copyi.asm +@@ -90,3 +90,4 @@ L(end): shr R32(n) + mov %r9, 16(rp) + 1: ret + EPILOGUE() ++CF_PROT +diff --git 
a/mpn/x86_64/core2/aors_err1_n.asm b/mpn/x86_64/core2/aors_err1_n.asm +index 3f875ae..5162272 100644 +--- a/mpn/x86_64/core2/aors_err1_n.asm ++++ b/mpn/x86_64/core2/aors_err1_n.asm +@@ -223,3 +223,4 @@ L(end): + pop %rbx + ret + EPILOGUE() ++CF_PROT +diff --git a/mpn/x86_64/core2/aors_n.asm b/mpn/x86_64/core2/aors_n.asm +index 74a1bce..19078d8 100644 +--- a/mpn/x86_64/core2/aors_n.asm ++++ b/mpn/x86_64/core2/aors_n.asm +@@ -139,3 +139,4 @@ IFDOS(` mov 56(%rsp), %r8 ') + jmp L(start) + EPILOGUE() + ++CF_PROT +diff --git a/mpn/x86_64/core2/aorsmul_1.asm b/mpn/x86_64/core2/aorsmul_1.asm +index 6b313dd..392f4de 100644 +--- a/mpn/x86_64/core2/aorsmul_1.asm ++++ b/mpn/x86_64/core2/aorsmul_1.asm +@@ -176,3 +176,4 @@ L(n1): mov 8(rp), %r10 + FUNC_EXIT() + ret + EPILOGUE() ++CF_PROT +diff --git a/mpn/x86_64/core2/divrem_1.asm b/mpn/x86_64/core2/divrem_1.asm +index 1b3f139..0a67dc3 100644 +--- a/mpn/x86_64/core2/divrem_1.asm ++++ b/mpn/x86_64/core2/divrem_1.asm +@@ -241,3 +241,4 @@ L(ret): pop %rbx + FUNC_EXIT() + ret + EPILOGUE() ++CF_PROT +diff --git a/mpn/x86_64/core2/gcd_1.asm b/mpn/x86_64/core2/gcd_1.asm +index bdb940c..452b763 100644 +--- a/mpn/x86_64/core2/gcd_1.asm ++++ b/mpn/x86_64/core2/gcd_1.asm +@@ -144,3 +144,4 @@ L(end): pop %rcx + FUNC_EXIT() + ret + EPILOGUE() ++CF_PROT +diff --git a/mpn/x86_64/core2/lshift.asm b/mpn/x86_64/core2/lshift.asm +index 8ccafec..00b39b8 100644 +--- a/mpn/x86_64/core2/lshift.asm ++++ b/mpn/x86_64/core2/lshift.asm +@@ -147,3 +147,4 @@ L(end): shld R8(cnt), %r8, %r11 + FUNC_EXIT() + ret + EPILOGUE() ++CF_PROT +diff --git a/mpn/x86_64/core2/lshiftc.asm b/mpn/x86_64/core2/lshiftc.asm +index 65c7b2f..4d3acfe 100644 +--- a/mpn/x86_64/core2/lshiftc.asm ++++ b/mpn/x86_64/core2/lshiftc.asm +@@ -157,3 +157,4 @@ L(end): shld R8(cnt), %r8, %r11 + FUNC_EXIT() + ret + EPILOGUE() ++CF_PROT +diff --git a/mpn/x86_64/core2/mul_basecase.asm b/mpn/x86_64/core2/mul_basecase.asm +index d16be85..04cd4c2 100644 +--- a/mpn/x86_64/core2/mul_basecase.asm ++++ b/mpn/x86_64/core2/mul_basecase.asm +@@ -973,3 +973,4 @@ L(lo3): mul v0 + FUNC_EXIT() + ret + EPILOGUE() ++CF_PROT +diff --git a/mpn/x86_64/core2/mullo_basecase.asm b/mpn/x86_64/core2/mullo_basecase.asm +index 0f03d86..efed03d 100644 +--- a/mpn/x86_64/core2/mullo_basecase.asm ++++ b/mpn/x86_64/core2/mullo_basecase.asm +@@ -425,3 +425,4 @@ L(n3): mov (vp_param), %r9 + FUNC_EXIT() + ret + EPILOGUE() ++CF_PROT +diff --git a/mpn/x86_64/core2/redc_1.asm b/mpn/x86_64/core2/redc_1.asm +index 8c296fd..d98f56f 100644 +--- a/mpn/x86_64/core2/redc_1.asm ++++ b/mpn/x86_64/core2/redc_1.asm +@@ -428,3 +428,4 @@ L(n4): mov -32(mp), %rax + jmp L(add_n) + EPILOGUE() + ASM_END() ++CF_PROT +diff --git a/mpn/x86_64/core2/rsh1aors_n.asm b/mpn/x86_64/core2/rsh1aors_n.asm +index 27eed37..579fec6 100644 +--- a/mpn/x86_64/core2/rsh1aors_n.asm ++++ b/mpn/x86_64/core2/rsh1aors_n.asm +@@ -167,3 +167,4 @@ L(end): shrd $1, %rbx, %rbp + FUNC_EXIT() + ret + EPILOGUE() ++CF_PROT +diff --git a/mpn/x86_64/core2/rshift.asm b/mpn/x86_64/core2/rshift.asm +index ab32ec8..97f4429 100644 +--- a/mpn/x86_64/core2/rshift.asm ++++ b/mpn/x86_64/core2/rshift.asm +@@ -145,3 +145,4 @@ L(end): shrd R8(cnt), %r8, %r11 + FUNC_EXIT() + ret + EPILOGUE() ++CF_PROT +diff --git a/mpn/x86_64/core2/sqr_basecase.asm b/mpn/x86_64/core2/sqr_basecase.asm +index a112c1b..0ee6ca3 100644 +--- a/mpn/x86_64/core2/sqr_basecase.asm ++++ b/mpn/x86_64/core2/sqr_basecase.asm +@@ -982,3 +982,4 @@ L(n3): mov %rax, %r10 + FUNC_EXIT() + ret + EPILOGUE() ++CF_PROT +diff --git 
a/mpn/x86_64/core2/sublshC_n.asm b/mpn/x86_64/core2/sublshC_n.asm +index 5acc46b..7a48dfb 100644 +--- a/mpn/x86_64/core2/sublshC_n.asm ++++ b/mpn/x86_64/core2/sublshC_n.asm +@@ -156,3 +156,4 @@ L(end): shr $RSH, %r11 + FUNC_EXIT() + ret + EPILOGUE() ++CF_PROT +diff --git a/mpn/x86_64/coreibwl/addmul_1.asm b/mpn/x86_64/coreibwl/addmul_1.asm +index aaa58e7..4fb79f9 100644 +--- a/mpn/x86_64/coreibwl/addmul_1.asm ++++ b/mpn/x86_64/coreibwl/addmul_1.asm +@@ -107,33 +107,39 @@ L(tab): JMPENT( L(f0), L(tab)) + JMPENT( L(f7), L(tab)) + TEXT + +-L(f0): mulx( (up), %r10, %r8) ++L(f0): CFPROT_ENDBR ++ mulx( (up), %r10, %r8) + lea -8(up), up + lea -8(rp), rp + lea -1(n), n + jmp L(b0) + +-L(f3): mulx( (up), %r9, %rax) ++L(f3): CFPROT_ENDBR ++ mulx( (up), %r9, %rax) + lea 16(up), up + lea -48(rp), rp + jmp L(b3) + +-L(f4): mulx( (up), %r10, %r8) ++L(f4): CFPROT_ENDBR ++ mulx( (up), %r10, %r8) + lea 24(up), up + lea -40(rp), rp + jmp L(b4) + +-L(f5): mulx( (up), %r9, %rax) ++L(f5): CFPROT_ENDBR ++ mulx( (up), %r9, %rax) + lea 32(up), up + lea -32(rp), rp + jmp L(b5) + +-L(f6): mulx( (up), %r10, %r8) ++L(f6): CFPROT_ENDBR ++ mulx( (up), %r10, %r8) + lea 40(up), up + lea -24(rp), rp + jmp L(b6) + +-L(f1): mulx( (up), %r9, %rax) ++L(f1): CFPROT_ENDBR ++ mulx( (up), %r9, %rax) + jrcxz L(1) + jmp L(b1) + L(1): add (rp), %r9 +@@ -151,7 +157,8 @@ ifdef(`PIC', + ` nop;nop;nop;nop', + ` nop;nop;nop;nop;nop;nop;nop;nop;nop;nop;nop') + +-L(f2): mulx( (up), %r10, %r8) ++L(f2): CFPROT_ENDBR ++ mulx( (up), %r10, %r8) + lea 8(up), up + lea 8(rp), rp + mulx( (up), %r9, %rax) +@@ -195,9 +202,11 @@ L(b3): adox( 48,(rp), %r9) + mulx( (up), %r9, %rax) + jmp L(top) + +-L(f7): mulx( (up), %r9, %rax) ++L(f7): CFPROT_ENDBR ++ mulx( (up), %r9, %rax) + lea -16(up), up + lea -16(rp), rp + jmp L(b7) + EPILOGUE() + ASM_END() ++CF_PROT +diff --git a/mpn/x86_64/coreibwl/mul_1.asm b/mpn/x86_64/coreibwl/mul_1.asm +index a271e6c..4fe4822 100644 +--- a/mpn/x86_64/coreibwl/mul_1.asm ++++ b/mpn/x86_64/coreibwl/mul_1.asm +@@ -106,48 +106,56 @@ L(tab): JMPENT( L(f0), L(tab)) + JMPENT( L(f7), L(tab)) + TEXT + +-L(f0): mulx( (up), %r10, %r8) ++L(f0): CFPROT_ENDBR ++ mulx( (up), %r10, %r8) + lea 56(up), up + lea -8(rp), rp + jmp L(b0) + +-L(f3): mulx( (up), %r9, %rax) ++L(f3): CFPROT_ENDBR ++ mulx( (up), %r9, %rax) + lea 16(up), up + lea 16(rp), rp + inc n + jmp L(b3) + +-L(f4): mulx( (up), %r10, %r8) ++L(f4): CFPROT_ENDBR ++ mulx( (up), %r10, %r8) + lea 24(up), up + lea 24(rp), rp + inc n + jmp L(b4) + +-L(f5): mulx( (up), %r9, %rax) ++L(f5): CFPROT_ENDBR ++ mulx( (up), %r9, %rax) + lea 32(up), up + lea 32(rp), rp + inc n + jmp L(b5) + +-L(f6): mulx( (up), %r10, %r8) ++L(f6): CFPROT_ENDBR ++ mulx( (up), %r10, %r8) + lea 40(up), up + lea 40(rp), rp + inc n + jmp L(b6) + +-L(f7): mulx( (up), %r9, %rax) ++L(f7): CFPROT_ENDBR ++ mulx( (up), %r9, %rax) + lea 48(up), up + lea 48(rp), rp + inc n + jmp L(b7) + +-L(f1): mulx( (up), %r9, %rax) ++L(f1): CFPROT_ENDBR ++ mulx( (up), %r9, %rax) + test n, n + jnz L(b1) + L(1): mov %r9, (rp) + ret + +-L(f2): mulx( (up), %r10, %r8) ++L(f2): CFPROT_ENDBR ++ mulx( (up), %r10, %r8) + lea 8(up), up + lea 8(rp), rp + mulx( (up), %r9, %rax) +@@ -191,3 +199,4 @@ L(end): mov %r10, -8(rp) + ret + EPILOGUE() + ASM_END() ++CF_PROT +diff --git a/mpn/x86_64/coreibwl/mul_basecase.asm b/mpn/x86_64/coreibwl/mul_basecase.asm +index 50f3ce5..74cd67c 100644 +--- a/mpn/x86_64/coreibwl/mul_basecase.asm ++++ b/mpn/x86_64/coreibwl/mul_basecase.asm +@@ -155,45 +155,53 @@ ifdef(`PIC', + jmp *(%r10,%rax,8) + ') + +-L(mf0): mulx( (up), 
w2, w3) ++L(mf0): CFPROT_ENDBR ++ mulx( (up), w2, w3) + lea 56(up), up + lea -8(rp), rp + jmp L(mb0) + +-L(mf3): mulx( (up), w0, w1) ++L(mf3): CFPROT_ENDBR ++ mulx( (up), w0, w1) + lea 16(up), up + lea 16(rp), rp + inc n + jmp L(mb3) + +-L(mf4): mulx( (up), w2, w3) ++L(mf4): CFPROT_ENDBR ++ mulx( (up), w2, w3) + lea 24(up), up + lea 24(rp), rp + inc n + jmp L(mb4) + +-L(mf5): mulx( (up), w0, w1) ++L(mf5): CFPROT_ENDBR ++ mulx( (up), w0, w1) + lea 32(up), up + lea 32(rp), rp + inc n + jmp L(mb5) + +-L(mf6): mulx( (up), w2, w3) ++L(mf6): CFPROT_ENDBR ++ mulx( (up), w2, w3) + lea 40(up), up + lea 40(rp), rp + inc n + jmp L(mb6) + +-L(mf7): mulx( (up), w0, w1) ++L(mf7): CFPROT_ENDBR ++ mulx( (up), w0, w1) + lea 48(up), up + lea 48(rp), rp + inc n + jmp L(mb7) + +-L(mf1): mulx( (up), w0, w1) ++L(mf1): CFPROT_ENDBR ++ mulx( (up), w0, w1) + jmp L(mb1) + +-L(mf2): mulx( (up), w2, w3) ++L(mf2): CFPROT_ENDBR ++ mulx( (up), w2, w3) + lea 8(up), up + lea 8(rp), rp + mulx( (up), w0, w1) +@@ -254,32 +262,39 @@ L(outer): + lea 8(vp), vp + jmp *jaddr + +-L(f0): mulx( 8,(up), w2, w3) ++L(f0): CFPROT_ENDBR ++ mulx( 8,(up), w2, w3) + lea 8(rp,unneg,8), rp + lea -1(n), n + jmp L(b0) + +-L(f3): mulx( -16,(up), w0, w1) ++L(f3): CFPROT_ENDBR ++ mulx( -16,(up), w0, w1) + lea -56(rp,unneg,8), rp + jmp L(b3) + +-L(f4): mulx( -24,(up), w2, w3) ++L(f4): CFPROT_ENDBR ++ mulx( -24,(up), w2, w3) + lea -56(rp,unneg,8), rp + jmp L(b4) + +-L(f5): mulx( -32,(up), w0, w1) ++L(f5): CFPROT_ENDBR ++ mulx( -32,(up), w0, w1) + lea -56(rp,unneg,8), rp + jmp L(b5) + +-L(f6): mulx( -40,(up), w2, w3) ++L(f6): CFPROT_ENDBR ++ mulx( -40,(up), w2, w3) + lea -56(rp,unneg,8), rp + jmp L(b6) + +-L(f7): mulx( 16,(up), w0, w1) ++L(f7): CFPROT_ENDBR ++ mulx( 16,(up), w0, w1) + lea 8(rp,unneg,8), rp + jmp L(b7) + +-L(f1): mulx( (up), w0, w1) ++L(f1): CFPROT_ENDBR ++ mulx( (up), w0, w1) + lea 8(rp,unneg,8), rp + jmp L(b1) + +@@ -300,7 +315,7 @@ L(done): + FUNC_EXIT() + ret + +-L(f2): ++L(f2): CFPROT_ENDBR + mulx( -8,(up), w2, w3) + lea 8(rp,unneg,8), rp + mulx( (up), w0, w1) +@@ -365,3 +380,4 @@ L(atab):JMPENT( L(f0), L(atab)) + JMPENT( L(f7), L(atab)) + TEXT + EPILOGUE() ++CF_PROT +diff --git a/mpn/x86_64/coreibwl/sqr_basecase.asm b/mpn/x86_64/coreibwl/sqr_basecase.asm +index 447ba00..64e8298 100644 +--- a/mpn/x86_64/coreibwl/sqr_basecase.asm ++++ b/mpn/x86_64/coreibwl/sqr_basecase.asm +@@ -184,42 +184,50 @@ ifdef(`PIC', + jmp *(%r10,%rax,8) + ') + +-L(mf0): mulx( 8,(up), w2, w3) ++L(mf0): CFPROT_ENDBR ++ mulx( 8,(up), w2, w3) + lea 64(up), up + C lea (rp), rp + jmp L(mb0) + +-L(mf3): mulx( 8,(up), w0, w1) ++L(mf3): CFPROT_ENDBR ++ mulx( 8,(up), w0, w1) + lea 24(up), up + lea 24(rp), rp + jmp L(mb3) + +-L(mf4): mulx( 8,(up), w2, w3) ++L(mf4): CFPROT_ENDBR ++ mulx( 8,(up), w2, w3) + lea 32(up), up + lea 32(rp), rp + jmp L(mb4) + +-L(mf5): mulx( 8,(up), w0, w1) ++L(mf5): CFPROT_ENDBR ++ mulx( 8,(up), w0, w1) + lea 40(up), up + lea 40(rp), rp + jmp L(mb5) + +-L(mf6): mulx( 8,(up), w2, w3) ++L(mf6): CFPROT_ENDBR ++ mulx( 8,(up), w2, w3) + lea 48(up), up + lea 48(rp), rp + jmp L(mb6) + +-L(mf7): mulx( 8,(up), w0, w1) ++L(mf7): CFPROT_ENDBR ++ mulx( 8,(up), w0, w1) + lea 56(up), up + lea 56(rp), rp + jmp L(mb7) + +-L(mf1): mulx( 8,(up), w0, w1) ++L(mf1): CFPROT_ENDBR ++ mulx( 8,(up), w0, w1) + lea 8(up), up + lea 8(rp), rp + jmp L(mb1) + +-L(mf2): mulx( 8,(up), w2, w3) ++L(mf2): CFPROT_ENDBR ++ mulx( 8,(up), w2, w3) + lea 16(up), up + lea 16(rp), rp + dec R32(n) +@@ -275,7 +283,8 @@ L(ed0): adox( (rp), w0) + mov w0, (rp) + adc %rcx, w1 C relies on 
rcx = 0 + mov w1, 8(rp) +-L(f7): lea -64(up,un_save,8), up ++L(f7): CFPROT_ENDBR ++ lea -64(up,un_save,8), up + or R32(un_save), R32(n) + mov 8(up), u0 + mulx( 16,(up), w0, w1) +@@ -326,7 +335,8 @@ L(ed1): adox( (rp), w0) + mov w0, (rp) + adc %rcx, w1 C relies on rcx = 0 + mov w1, 8(rp) +-L(f0): lea -64(up,un_save,8), up ++L(f0): CFPROT_ENDBR ++ lea -64(up,un_save,8), up + or R32(un_save), R32(n) + mov (up), u0 + mulx( 8,(up), w2, w3) +@@ -377,7 +387,8 @@ L(ed2): adox( (rp), w0) + mov w0, (rp) + adc %rcx, w1 C relies on rcx = 0 + mov w1, 8(rp) +-L(f1): lea (up,un_save,8), up ++L(f1): CFPROT_ENDBR ++ lea (up,un_save,8), up + or R32(un_save), R32(n) + lea 8(un_save), un_save + mov -8(up), u0 +@@ -429,7 +440,8 @@ L(ed3): adox( (rp), w0) + mov w0, (rp) + adc %rcx, w1 C relies on rcx = 0 + mov w1, 8(rp) +-L(f2): lea (up,un_save,8), up ++L(f2): CFPROT_ENDBR ++ lea (up,un_save,8), up + or R32(un_save), R32(n) + jz L(corner2) + mov -16(up), u0 +@@ -482,7 +494,8 @@ L(ed4): adox( (rp), w0) + mov w0, (rp) + adc %rcx, w1 C relies on rcx = 0 + mov w1, 8(rp) +-L(f3): lea (up,un_save,8), up ++L(f3): CFPROT_ENDBR ++ lea (up,un_save,8), up + or R32(un_save), R32(n) + jz L(corner3) + mov -24(up), u0 +@@ -534,7 +547,8 @@ L(ed5): adox( (rp), w0) + mov w0, (rp) + adc %rcx, w1 C relies on rcx = 0 + mov w1, 8(rp) +-L(f4): lea (up,un_save,8), up ++L(f4): CFPROT_ENDBR ++ lea (up,un_save,8), up + or R32(un_save), R32(n) + mov -32(up), u0 + mulx( -24,(up), w2, w3) +@@ -585,7 +599,8 @@ L(ed6): adox( (rp), w0) + mov w0, (rp) + adc %rcx, w1 C relies on rcx = 0 + mov w1, 8(rp) +-L(f5): lea (up,un_save,8), up ++L(f5): CFPROT_ENDBR ++ lea (up,un_save,8), up + or R32(un_save), R32(n) + mov -40(up), u0 + mulx( -32,(up), w0, w1) +@@ -636,7 +651,8 @@ L(ed7): adox( (rp), w0) + mov w0, (rp) + adc %rcx, w1 C relies on rcx = 0 + mov w1, 8(rp) +-L(f6): lea (up,un_save,8), up ++L(f6): CFPROT_ENDBR ++ lea (up,un_save,8), up + or R32(un_save), R32(n) + mov -48(up), u0 + mulx( -40,(up), w2, w3) +@@ -838,3 +854,4 @@ L(atab):JMPENT( L(f6), L(atab)) + JMPENT( L(f5), L(atab)) + TEXT + EPILOGUE() ++CF_PROT +diff --git a/mpn/x86_64/coreihwl/addmul_2.asm b/mpn/x86_64/coreihwl/addmul_2.asm +index 54aebc8..2a5f996 100644 +--- a/mpn/x86_64/coreihwl/addmul_2.asm ++++ b/mpn/x86_64/coreihwl/addmul_2.asm +@@ -236,3 +236,4 @@ L(end): mulx( v0, %rax, w3) + FUNC_EXIT() + ret + EPILOGUE() ++CF_PROT +diff --git a/mpn/x86_64/coreihwl/aorsmul_1.asm b/mpn/x86_64/coreihwl/aorsmul_1.asm +index fd5a26d..8c03b17 100644 +--- a/mpn/x86_64/coreihwl/aorsmul_1.asm ++++ b/mpn/x86_64/coreihwl/aorsmul_1.asm +@@ -196,3 +196,4 @@ L(ret): pop %r13 + FUNC_EXIT() + ret + EPILOGUE() ++CF_PROT +diff --git a/mpn/x86_64/coreihwl/mul_1.asm b/mpn/x86_64/coreihwl/mul_1.asm +index 1e3c338..b6463f9 100644 +--- a/mpn/x86_64/coreihwl/mul_1.asm ++++ b/mpn/x86_64/coreihwl/mul_1.asm +@@ -153,3 +153,4 @@ L(cj1): mov %rbx, 24(rp) + ret + EPILOGUE() + ASM_END() ++CF_PROT +diff --git a/mpn/x86_64/coreihwl/mul_2.asm b/mpn/x86_64/coreihwl/mul_2.asm +index 5bdb1aa..21defe9 100644 +--- a/mpn/x86_64/coreihwl/mul_2.asm ++++ b/mpn/x86_64/coreihwl/mul_2.asm +@@ -171,3 +171,4 @@ L(end): mulx( v1, %rdx, %rax) + FUNC_EXIT() + ret + EPILOGUE() ++CF_PROT +diff --git a/mpn/x86_64/coreihwl/mul_basecase.asm b/mpn/x86_64/coreihwl/mul_basecase.asm +index b2656c8..e4a8381 100644 +--- a/mpn/x86_64/coreihwl/mul_basecase.asm ++++ b/mpn/x86_64/coreihwl/mul_basecase.asm +@@ -439,3 +439,4 @@ L(ret2):pop %rbp + FUNC_EXIT() + ret + EPILOGUE() ++CF_PROT +diff --git a/mpn/x86_64/coreihwl/mullo_basecase.asm 
b/mpn/x86_64/coreihwl/mullo_basecase.asm +index 9986e8b..6756802 100644 +--- a/mpn/x86_64/coreihwl/mullo_basecase.asm ++++ b/mpn/x86_64/coreihwl/mullo_basecase.asm +@@ -424,3 +424,4 @@ L(n3): mov (vp), %r9 + FUNC_EXIT() + ret + EPILOGUE() ++CF_PROT +diff --git a/mpn/x86_64/coreihwl/redc_1.asm b/mpn/x86_64/coreihwl/redc_1.asm +index b1d6c0a..b8b4a9e 100644 +--- a/mpn/x86_64/coreihwl/redc_1.asm ++++ b/mpn/x86_64/coreihwl/redc_1.asm +@@ -435,3 +435,4 @@ L(ret): pop %r15 + FUNC_EXIT() + ret + EPILOGUE() ++CF_PROT +diff --git a/mpn/x86_64/coreihwl/sqr_basecase.asm b/mpn/x86_64/coreihwl/sqr_basecase.asm +index 641cdf3..8e83470 100644 +--- a/mpn/x86_64/coreihwl/sqr_basecase.asm ++++ b/mpn/x86_64/coreihwl/sqr_basecase.asm +@@ -504,3 +504,4 @@ L(dend):adc %rbx, %rdx + FUNC_EXIT() + ret + EPILOGUE() ++CF_PROT +diff --git a/mpn/x86_64/coreinhm/aorrlsh_n.asm b/mpn/x86_64/coreinhm/aorrlsh_n.asm +index eed64e7..b1a4610 100644 +--- a/mpn/x86_64/coreinhm/aorrlsh_n.asm ++++ b/mpn/x86_64/coreinhm/aorrlsh_n.asm +@@ -198,3 +198,4 @@ IFDOS(` mov 64(%rsp), %r9 ') C cy + sbb R32(%rbx), R32(%rbx) C initialise CF save register + jmp L(ent) + EPILOGUE() ++CF_PROT +diff --git a/mpn/x86_64/coreinhm/aorsmul_1.asm b/mpn/x86_64/coreinhm/aorsmul_1.asm +index b768905..e2d96a8 100644 +--- a/mpn/x86_64/coreinhm/aorsmul_1.asm ++++ b/mpn/x86_64/coreinhm/aorsmul_1.asm +@@ -185,3 +185,4 @@ L(end): mul v0 + ret + EPILOGUE() + ASM_END() ++CF_PROT +diff --git a/mpn/x86_64/coreinhm/redc_1.asm b/mpn/x86_64/coreinhm/redc_1.asm +index fc71c1b..782da6b 100644 +--- a/mpn/x86_64/coreinhm/redc_1.asm ++++ b/mpn/x86_64/coreinhm/redc_1.asm +@@ -547,3 +547,4 @@ L(n3): mov -24(mp), %rax + jmp L(ret) + EPILOGUE() + ASM_END() ++CF_PROT +diff --git a/mpn/x86_64/coreisbr/addmul_2.asm b/mpn/x86_64/coreisbr/addmul_2.asm +index 21f0bf4..e6ffe3e 100644 +--- a/mpn/x86_64/coreisbr/addmul_2.asm ++++ b/mpn/x86_64/coreisbr/addmul_2.asm +@@ -222,3 +222,4 @@ L(end): mul v1 + FUNC_EXIT() + ret + EPILOGUE() ++CF_PROT +diff --git a/mpn/x86_64/coreisbr/aorrlshC_n.asm b/mpn/x86_64/coreisbr/aorrlshC_n.asm +index 23ace41..75a9b8c 100644 +--- a/mpn/x86_64/coreisbr/aorrlshC_n.asm ++++ b/mpn/x86_64/coreisbr/aorrlshC_n.asm +@@ -171,3 +171,4 @@ L(end): shr $RSH, %rbp + FUNC_EXIT() + ret + EPILOGUE() ++CF_PROT +diff --git a/mpn/x86_64/coreisbr/aorrlsh_n.asm b/mpn/x86_64/coreisbr/aorrlsh_n.asm +index db8ee68..611dcb2 100644 +--- a/mpn/x86_64/coreisbr/aorrlsh_n.asm ++++ b/mpn/x86_64/coreisbr/aorrlsh_n.asm +@@ -213,3 +213,4 @@ IFDOS(` mov 64(%rsp), %r9 ') C cy + sbb R32(%rbx), R32(%rbx) C initialise CF save register + jmp L(ent) + EPILOGUE() ++CF_PROT +diff --git a/mpn/x86_64/coreisbr/aors_n.asm b/mpn/x86_64/coreisbr/aors_n.asm +index 01abf78..07fef16 100644 +--- a/mpn/x86_64/coreisbr/aors_n.asm ++++ b/mpn/x86_64/coreisbr/aors_n.asm +@@ -196,3 +196,4 @@ PROLOGUE(func_nc) + IFDOS(` mov 56(%rsp), %r8 ') + jmp L(ent) + EPILOGUE() ++CF_PROT +diff --git a/mpn/x86_64/coreisbr/aorsmul_1.asm b/mpn/x86_64/coreisbr/aorsmul_1.asm +index 9f01d9c..41b8016 100644 +--- a/mpn/x86_64/coreisbr/aorsmul_1.asm ++++ b/mpn/x86_64/coreisbr/aorsmul_1.asm +@@ -207,3 +207,4 @@ IFDOS(``pop %rsi '') + ret + EPILOGUE() + ASM_END() ++CF_PROT +diff --git a/mpn/x86_64/coreisbr/mul_1.asm b/mpn/x86_64/coreisbr/mul_1.asm +index ded7d89..a30f00b 100644 +--- a/mpn/x86_64/coreisbr/mul_1.asm ++++ b/mpn/x86_64/coreisbr/mul_1.asm +@@ -159,3 +159,4 @@ IFDOS(``pop %rdi '') + IFDOS(``pop %rsi '') + ret + EPILOGUE() ++CF_PROT +diff --git a/mpn/x86_64/coreisbr/mul_2.asm b/mpn/x86_64/coreisbr/mul_2.asm +index 
ffee78a..991820b 100644 +--- a/mpn/x86_64/coreisbr/mul_2.asm ++++ b/mpn/x86_64/coreisbr/mul_2.asm +@@ -161,3 +161,4 @@ L(end): mul v0 + FUNC_EXIT() + ret + EPILOGUE() ++CF_PROT +diff --git a/mpn/x86_64/coreisbr/mul_basecase.asm b/mpn/x86_64/coreisbr/mul_basecase.asm +index 35fd1cc..063664b 100644 +--- a/mpn/x86_64/coreisbr/mul_basecase.asm ++++ b/mpn/x86_64/coreisbr/mul_basecase.asm +@@ -405,3 +405,4 @@ L(ret2):pop %rbp + FUNC_EXIT() + ret + EPILOGUE() ++CF_PROT +diff --git a/mpn/x86_64/coreisbr/mullo_basecase.asm b/mpn/x86_64/coreisbr/mullo_basecase.asm +index a41a8ac..1b75c78 100644 +--- a/mpn/x86_64/coreisbr/mullo_basecase.asm ++++ b/mpn/x86_64/coreisbr/mullo_basecase.asm +@@ -382,3 +382,4 @@ L(n3): mov (vp_param), %r9 + FUNC_EXIT() + ret + EPILOGUE() ++CF_PROT +diff --git a/mpn/x86_64/coreisbr/popcount.asm b/mpn/x86_64/coreisbr/popcount.asm +index a5be33e..426d3a6 100644 +--- a/mpn/x86_64/coreisbr/popcount.asm ++++ b/mpn/x86_64/coreisbr/popcount.asm +@@ -116,3 +116,4 @@ L(cj1): add %r11, %rax + FUNC_EXIT() + ret + EPILOGUE() ++CF_PROT +diff --git a/mpn/x86_64/coreisbr/redc_1.asm b/mpn/x86_64/coreisbr/redc_1.asm +index f0dbe07..710e60e 100644 +--- a/mpn/x86_64/coreisbr/redc_1.asm ++++ b/mpn/x86_64/coreisbr/redc_1.asm +@@ -544,3 +544,4 @@ L(n3): mov -32(mp), %rax + jmp L(cj) + EPILOGUE() + ASM_END() ++CF_PROT +diff --git a/mpn/x86_64/coreisbr/rsh1aors_n.asm b/mpn/x86_64/coreisbr/rsh1aors_n.asm +index fd2eaea..d390ff3 100644 +--- a/mpn/x86_64/coreisbr/rsh1aors_n.asm ++++ b/mpn/x86_64/coreisbr/rsh1aors_n.asm +@@ -191,3 +191,4 @@ L(end): shrd $1, %rbx, %rbp + FUNC_EXIT() + ret + EPILOGUE() ++CF_PROT +diff --git a/mpn/x86_64/coreisbr/sqr_basecase.asm b/mpn/x86_64/coreisbr/sqr_basecase.asm +index 46a3612..4d4e545 100644 +--- a/mpn/x86_64/coreisbr/sqr_basecase.asm ++++ b/mpn/x86_64/coreisbr/sqr_basecase.asm +@@ -482,3 +482,4 @@ L(dend):add %r8, %r10 + FUNC_EXIT() + ret + EPILOGUE() ++CF_PROT +diff --git a/mpn/x86_64/div_qr_1n_pi1.asm b/mpn/x86_64/div_qr_1n_pi1.asm +index cb072e9..5a4f195 100644 +--- a/mpn/x86_64/div_qr_1n_pi1.asm ++++ b/mpn/x86_64/div_qr_1n_pi1.asm +@@ -245,3 +245,4 @@ L(q_incr_loop): + lea 8(U1), U1 + jmp L(q_incr_loop) + EPILOGUE() ++CF_PROT +diff --git a/mpn/x86_64/div_qr_2n_pi1.asm b/mpn/x86_64/div_qr_2n_pi1.asm +index 5e59a0a..252781c 100644 +--- a/mpn/x86_64/div_qr_2n_pi1.asm ++++ b/mpn/x86_64/div_qr_2n_pi1.asm +@@ -156,3 +156,4 @@ L(fix): C Unlikely update. u2 >= d1 + sbb d1, u2 + jmp L(bck) + EPILOGUE() ++CF_PROT +diff --git a/mpn/x86_64/div_qr_2u_pi1.asm b/mpn/x86_64/div_qr_2u_pi1.asm +index 85af96f..b47209e 100644 +--- a/mpn/x86_64/div_qr_2u_pi1.asm ++++ b/mpn/x86_64/div_qr_2u_pi1.asm +@@ -198,3 +198,4 @@ L(fix_qh): C Unlikely update. 
u2 >= d1 + sbb d1, u2 + jmp L(bck_qh) + EPILOGUE() ++CF_PROT +diff --git a/mpn/x86_64/dive_1.asm b/mpn/x86_64/dive_1.asm +index 988bdab..b401112 100644 +--- a/mpn/x86_64/dive_1.asm ++++ b/mpn/x86_64/dive_1.asm +@@ -156,3 +156,4 @@ L(one): shr R8(%rcx), %rax + ret + + EPILOGUE() ++CF_PROT +diff --git a/mpn/x86_64/divrem_1.asm b/mpn/x86_64/divrem_1.asm +index d4d61ad..0417756 100644 +--- a/mpn/x86_64/divrem_1.asm ++++ b/mpn/x86_64/divrem_1.asm +@@ -312,3 +312,4 @@ L(ret): pop %rbx + FUNC_EXIT() + ret + EPILOGUE() ++CF_PROT +diff --git a/mpn/x86_64/divrem_2.asm b/mpn/x86_64/divrem_2.asm +index 296c9b6..73aa740 100644 +--- a/mpn/x86_64/divrem_2.asm ++++ b/mpn/x86_64/divrem_2.asm +@@ -188,3 +188,4 @@ L(fix): seta %dl + sbb %r11, %rbx + jmp L(bck) + EPILOGUE() ++CF_PROT +diff --git a/mpn/x86_64/fastavx/copyd.asm b/mpn/x86_64/fastavx/copyd.asm +index 56d472f..8d4f651 100644 +--- a/mpn/x86_64/fastavx/copyd.asm ++++ b/mpn/x86_64/fastavx/copyd.asm +@@ -170,3 +170,4 @@ L(bc): test $4, R8(n) + FUNC_EXIT() + ret + EPILOGUE() ++CF_PROT +diff --git a/mpn/x86_64/fastavx/copyi.asm b/mpn/x86_64/fastavx/copyi.asm +index 7607747..3364aa9 100644 +--- a/mpn/x86_64/fastavx/copyi.asm ++++ b/mpn/x86_64/fastavx/copyi.asm +@@ -167,3 +167,4 @@ L(bc): test $4, R8(n) + FUNC_EXIT() + ret + EPILOGUE() ++CF_PROT +diff --git a/mpn/x86_64/fastsse/com-palignr.asm b/mpn/x86_64/fastsse/com-palignr.asm +index c7155d1..191e5d9 100644 +--- a/mpn/x86_64/fastsse/com-palignr.asm ++++ b/mpn/x86_64/fastsse/com-palignr.asm +@@ -308,3 +308,4 @@ L(end): test $1, R8(n) + 1: FUNC_EXIT() + ret + EPILOGUE() ++CF_PROT +diff --git a/mpn/x86_64/fastsse/com.asm b/mpn/x86_64/fastsse/com.asm +index 307fb75..5dfc8e4 100644 +--- a/mpn/x86_64/fastsse/com.asm ++++ b/mpn/x86_64/fastsse/com.asm +@@ -165,3 +165,4 @@ L(sma): add $14, n + L(don): FUNC_EXIT() + ret + EPILOGUE() ++CF_PROT +diff --git a/mpn/x86_64/fastsse/copyd-palignr.asm b/mpn/x86_64/fastsse/copyd-palignr.asm +index fac6f8a..a69812c 100644 +--- a/mpn/x86_64/fastsse/copyd-palignr.asm ++++ b/mpn/x86_64/fastsse/copyd-palignr.asm +@@ -252,3 +252,4 @@ L(end): test $1, R8(n) + 1: FUNC_EXIT() + ret + EPILOGUE() ++CF_PROT +diff --git a/mpn/x86_64/fastsse/copyd.asm b/mpn/x86_64/fastsse/copyd.asm +index 5b8b8bf..f03affa 100644 +--- a/mpn/x86_64/fastsse/copyd.asm ++++ b/mpn/x86_64/fastsse/copyd.asm +@@ -156,3 +156,4 @@ L(sma): test $8, R8(n) + L(don): FUNC_EXIT() + ret + EPILOGUE() ++CF_PROT +diff --git a/mpn/x86_64/fastsse/copyi-palignr.asm b/mpn/x86_64/fastsse/copyi-palignr.asm +index 22f13f1..e50f604 100644 +--- a/mpn/x86_64/fastsse/copyi-palignr.asm ++++ b/mpn/x86_64/fastsse/copyi-palignr.asm +@@ -296,3 +296,4 @@ L(end): test $1, R8(n) + 1: FUNC_EXIT() + ret + EPILOGUE() ++CF_PROT +diff --git a/mpn/x86_64/fastsse/copyi.asm b/mpn/x86_64/fastsse/copyi.asm +index b2f3b9d..a506942 100644 +--- a/mpn/x86_64/fastsse/copyi.asm ++++ b/mpn/x86_64/fastsse/copyi.asm +@@ -175,3 +175,4 @@ dnl jnc 1b + L(ret): FUNC_EXIT() + ret + EPILOGUE() ++CF_PROT +diff --git a/mpn/x86_64/fastsse/lshift-movdqu2.asm b/mpn/x86_64/fastsse/lshift-movdqu2.asm +index a05e850..df8ee6d 100644 +--- a/mpn/x86_64/fastsse/lshift-movdqu2.asm ++++ b/mpn/x86_64/fastsse/lshift-movdqu2.asm +@@ -180,3 +180,4 @@ L(end8):movq (ap), %xmm0 + FUNC_EXIT() + ret + EPILOGUE() ++CF_PROT +diff --git a/mpn/x86_64/fastsse/lshift.asm b/mpn/x86_64/fastsse/lshift.asm +index f76972a..7d0f0fc 100644 +--- a/mpn/x86_64/fastsse/lshift.asm ++++ b/mpn/x86_64/fastsse/lshift.asm +@@ -167,3 +167,4 @@ L(end8):movq (ap), %xmm0 + movq %xmm0, (rp) + ret + EPILOGUE() 
++CF_PROT +diff --git a/mpn/x86_64/fastsse/lshiftc-movdqu2.asm b/mpn/x86_64/fastsse/lshiftc-movdqu2.asm +index 8250910..4878dad 100644 +--- a/mpn/x86_64/fastsse/lshiftc-movdqu2.asm ++++ b/mpn/x86_64/fastsse/lshiftc-movdqu2.asm +@@ -191,3 +191,4 @@ L(end8):movq (ap), %xmm0 + FUNC_EXIT() + ret + EPILOGUE() ++CF_PROT +diff --git a/mpn/x86_64/fastsse/lshiftc.asm b/mpn/x86_64/fastsse/lshiftc.asm +index d252069..f042ec0 100644 +--- a/mpn/x86_64/fastsse/lshiftc.asm ++++ b/mpn/x86_64/fastsse/lshiftc.asm +@@ -177,3 +177,4 @@ L(end8):movq (ap), %xmm0 + movq %xmm0, (rp) + ret + EPILOGUE() ++CF_PROT +diff --git a/mpn/x86_64/fastsse/rshift-movdqu2.asm b/mpn/x86_64/fastsse/rshift-movdqu2.asm +index 1e270b1..8149717 100644 +--- a/mpn/x86_64/fastsse/rshift-movdqu2.asm ++++ b/mpn/x86_64/fastsse/rshift-movdqu2.asm +@@ -199,3 +199,4 @@ L(bc): dec R32(n) + FUNC_EXIT() + ret + EPILOGUE() ++CF_PROT +diff --git a/mpn/x86_64/fastsse/sec_tabselect.asm b/mpn/x86_64/fastsse/sec_tabselect.asm +index e3df110..9975eca 100644 +--- a/mpn/x86_64/fastsse/sec_tabselect.asm ++++ b/mpn/x86_64/fastsse/sec_tabselect.asm +@@ -190,3 +190,4 @@ L(tp1): movdqa %xmm8, %xmm0 + L(b000):FUNC_EXIT() + ret + EPILOGUE() ++CF_PROT +diff --git a/mpn/x86_64/fat/fat_entry.asm b/mpn/x86_64/fat/fat_entry.asm +index 8f7599d..5f78553 100644 +--- a/mpn/x86_64/fat/fat_entry.asm ++++ b/mpn/x86_64/fat/fat_entry.asm +@@ -205,3 +205,4 @@ PROLOGUE(__gmpn_cpuid) + FUNC_EXIT() + ret + EPILOGUE() ++CF_PROT +diff --git a/mpn/x86_64/gcd_1.asm b/mpn/x86_64/gcd_1.asm +index ac4aced..bf32cc0 100644 +--- a/mpn/x86_64/gcd_1.asm ++++ b/mpn/x86_64/gcd_1.asm +@@ -163,3 +163,4 @@ L(shift_alot): + mov %rax, %rcx + jmp L(mid) + EPILOGUE() ++CF_PROT +diff --git a/mpn/x86_64/invert_limb.asm b/mpn/x86_64/invert_limb.asm +index cc79b89..829861f 100644 +--- a/mpn/x86_64/invert_limb.asm ++++ b/mpn/x86_64/invert_limb.asm +@@ -113,3 +113,4 @@ ifdef(`DARWIN',` + ret + EPILOGUE() + ASM_END() ++CF_PROT +diff --git a/mpn/x86_64/invert_limb_table.asm b/mpn/x86_64/invert_limb_table.asm +index 739d59e..16fe314 100644 +--- a/mpn/x86_64/invert_limb_table.asm ++++ b/mpn/x86_64/invert_limb_table.asm +@@ -48,3 +48,4 @@ forloop(i,256,512-1,dnl + ` .value eval(0x7fd00/i) + ')dnl + ASM_END() ++CF_PROT +diff --git a/mpn/x86_64/k10/hamdist.asm b/mpn/x86_64/k10/hamdist.asm +index 44b67b5..83e4e86 100644 +--- a/mpn/x86_64/k10/hamdist.asm ++++ b/mpn/x86_64/k10/hamdist.asm +@@ -101,3 +101,4 @@ L(top): mov (ap,n,8), %r8 + FUNC_EXIT() + ret + EPILOGUE() ++CF_PROT +diff --git a/mpn/x86_64/k10/popcount.asm b/mpn/x86_64/k10/popcount.asm +index 3814aea..17e7a73 100644 +--- a/mpn/x86_64/k10/popcount.asm ++++ b/mpn/x86_64/k10/popcount.asm +@@ -136,3 +136,4 @@ C 1 = n mod 8 + FUNC_EXIT() + ret + EPILOGUE() ++CF_PROT +diff --git a/mpn/x86_64/k8/aorrlsh_n.asm b/mpn/x86_64/k8/aorrlsh_n.asm +index ff3a184..8eff29e 100644 +--- a/mpn/x86_64/k8/aorrlsh_n.asm ++++ b/mpn/x86_64/k8/aorrlsh_n.asm +@@ -215,3 +215,4 @@ L(cj1): mov %r9, 8(rp,n,8) + FUNC_EXIT() + ret + EPILOGUE() ++CF_PROT +diff --git a/mpn/x86_64/k8/div_qr_1n_pi1.asm b/mpn/x86_64/k8/div_qr_1n_pi1.asm +index 861402b..fef3a09 100644 +--- a/mpn/x86_64/k8/div_qr_1n_pi1.asm ++++ b/mpn/x86_64/k8/div_qr_1n_pi1.asm +@@ -247,3 +247,4 @@ L(q_incr_loop): + lea 8(U1), U1 + jmp L(q_incr_loop) + EPILOGUE() ++CF_PROT +diff --git a/mpn/x86_64/k8/mul_basecase.asm b/mpn/x86_64/k8/mul_basecase.asm +index ca2efb9..61b6e0e 100644 +--- a/mpn/x86_64/k8/mul_basecase.asm ++++ b/mpn/x86_64/k8/mul_basecase.asm +@@ -467,3 +467,4 @@ L(ret): pop %r15 + ret + + EPILOGUE() ++CF_PROT 
+diff --git a/mpn/x86_64/k8/mullo_basecase.asm b/mpn/x86_64/k8/mullo_basecase.asm +index fa00f42..b1f5b20 100644 +--- a/mpn/x86_64/k8/mullo_basecase.asm ++++ b/mpn/x86_64/k8/mullo_basecase.asm +@@ -99,12 +99,14 @@ dnl JMPENT( L(2m4), L(tab)) C 10 + dnl JMPENT( L(3m4), L(tab)) C 11 + TEXT + +-L(1): imul %r8, %rax ++L(1): CFPROT_ENDBR ++ imul %r8, %rax + mov %rax, (rp) + FUNC_EXIT() + ret + +-L(2): mov 8(vp_param), %r11 ++L(2): CFPROT_ENDBR ++ mov 8(vp_param), %r11 + imul %rax, %r11 C u0 x v1 + mul %r8 C u0 x v0 + mov %rax, (rp) +@@ -115,7 +117,8 @@ L(2): mov 8(vp_param), %r11 + FUNC_EXIT() + ret + +-L(3): mov 8(vp_param), %r9 C v1 ++L(3): CFPROT_ENDBR ++ mov 8(vp_param), %r9 C v1 + mov 16(vp_param), %r11 + mul %r8 C u0 x v0 -> + mov %rax, (rp) C r0 +@@ -144,7 +147,8 @@ L(0m4): + L(1m4): + L(2m4): + L(3m4): +-L(gen): push %rbx ++L(gen): CFPROT_ENDBR ++ push %rbx + push %rbp + push %r13 + push %r14 +@@ -434,3 +438,4 @@ L(ret): pop %r15 + FUNC_EXIT() + ret + EPILOGUE() ++CF_PROT +diff --git a/mpn/x86_64/k8/mulmid_basecase.asm b/mpn/x86_64/k8/mulmid_basecase.asm +index 86f1414..0ace1ba 100644 +--- a/mpn/x86_64/k8/mulmid_basecase.asm ++++ b/mpn/x86_64/k8/mulmid_basecase.asm +@@ -557,3 +557,4 @@ L(ret): pop %r15 + FUNC_EXIT() + ret + EPILOGUE() ++CF_PROT +diff --git a/mpn/x86_64/k8/redc_1.asm b/mpn/x86_64/k8/redc_1.asm +index 9327b21..b00103f 100644 +--- a/mpn/x86_64/k8/redc_1.asm ++++ b/mpn/x86_64/k8/redc_1.asm +@@ -124,8 +124,9 @@ L(tab): JMPENT( L(0), L(tab)) + JMPENT( L(3m4), L(tab)) + TEXT + ++L(1): CFPROT_ENDBR + ALIGN(16) +-L(1): mov (mp_param), %rax ++ mov (mp_param), %rax + mul q0 + add 8(up), %rax + adc 16(up), %rdx +@@ -135,8 +136,9 @@ L(1): mov (mp_param), %rax + jmp L(ret) + + ++L(2): CFPROT_ENDBR + ALIGN(16) +-L(2): mov (mp_param), %rax ++ mov (mp_param), %rax + mul q0 + xor R32(%r14), R32(%r14) + mov %rax, %r10 +@@ -171,7 +173,8 @@ L(2): mov (mp_param), %rax + jmp L(ret) + + +-L(3): mov (mp_param), %rax ++L(3): CFPROT_ENDBR ++ mov (mp_param), %rax + mul q0 + mov %rax, %rbx + mov %rdx, %r10 +@@ -247,8 +250,8 @@ L(3): mov (mp_param), %rax + jmp L(ret) + + ++L(2m4): CFPROT_ENDBR + ALIGN(16) +-L(2m4): + L(lo2): mov (mp,nneg,8), %rax + mul q0 + xor R32(%r14), R32(%r14) +@@ -323,8 +326,8 @@ L(le2): add %r10, (up) + jmp L(addx) + + ++L(1m4): CFPROT_ENDBR + ALIGN(16) +-L(1m4): + L(lo1): mov (mp,nneg,8), %rax + xor %r9, %r9 + xor R32(%rbx), R32(%rbx) +@@ -396,9 +399,9 @@ L(le1): add %r10, (up) + jmp L(addx) + + +- ALIGN(16) + L(0): +-L(0m4): ++L(0m4): CFPROT_ENDBR ++ ALIGN(16) + L(lo0): mov (mp,nneg,8), %rax + mov nneg, i + mul q0 +@@ -462,8 +465,8 @@ L(le0): add %r10, (up) + jmp L(addy) + + ++L(3m4): CFPROT_ENDBR + ALIGN(16) +-L(3m4): + L(lo3): mov (mp,nneg,8), %rax + mul q0 + mov %rax, %rbx +@@ -589,3 +592,4 @@ L(ret): pop %r15 + FUNC_EXIT() + ret + EPILOGUE() ++CF_PROT +diff --git a/mpn/x86_64/k8/sqr_basecase.asm b/mpn/x86_64/k8/sqr_basecase.asm +index 60cf945..e6a545d 100644 +--- a/mpn/x86_64/k8/sqr_basecase.asm ++++ b/mpn/x86_64/k8/sqr_basecase.asm +@@ -131,7 +131,8 @@ L(tab): JMPENT( L(4), L(tab)) + JMPENT( L(3m4), L(tab)) + TEXT + +-L(1): mov (up), %rax ++L(1): CFPROT_ENDBR ++ mov (up), %rax + mul %rax + add $40, %rsp + mov %rax, (rp) +@@ -139,7 +140,8 @@ L(1): mov (up), %rax + FUNC_EXIT() + ret + +-L(2): mov (up), %rax ++L(2): CFPROT_ENDBR ++ mov (up), %rax + mov %rax, %r8 + mul %rax + mov 8(up), %r11 +@@ -165,7 +167,8 @@ L(2): mov (up), %rax + FUNC_EXIT() + ret + +-L(3): mov (up), %rax ++L(3): CFPROT_ENDBR ++ mov (up), %rax + mov %rax, %r10 + mul %rax + mov 8(up), %r11 +@@ -210,7 
+213,8 @@ L(3): mov (up), %rax + FUNC_EXIT() + ret + +-L(4): mov (up), %rax ++L(4): CFPROT_ENDBR ++ mov (up), %rax + mov %rax, %r11 + mul %rax + mov 8(up), %rbx +@@ -281,7 +285,7 @@ L(4): mov (up), %rax + ret + + +-L(0m4): ++L(0m4): CFPROT_ENDBR + lea -16(rp,n,8), tp C point tp in middle of result operand + mov (up), v0 + mov 8(up), %rax +@@ -339,7 +343,7 @@ L(L3): xor R32(w1), R32(w1) + jmp L(dowhile) + + +-L(1m4): ++L(1m4): CFPROT_ENDBR + lea 8(rp,n,8), tp C point tp in middle of result operand + mov (up), v0 C u0 + mov 8(up), %rax C u1 +@@ -417,7 +421,7 @@ L(m2x): mov (up,j,8), %rax + jmp L(dowhile_end) + + +-L(2m4): ++L(2m4): CFPROT_ENDBR + lea -16(rp,n,8), tp C point tp in middle of result operand + mov (up), v0 + mov 8(up), %rax +@@ -474,7 +478,7 @@ L(L1): xor R32(w0), R32(w0) + jmp L(dowhile_mid) + + +-L(3m4): ++L(3m4): CFPROT_ENDBR + lea 8(rp,n,8), tp C point tp in middle of result operand + mov (up), v0 C u0 + mov 8(up), %rax C u1 +@@ -805,3 +809,4 @@ L(d1): mov %r11, 24(rp,j,8) + FUNC_EXIT() + ret + EPILOGUE() ++CF_PROT +diff --git a/mpn/x86_64/logops_n.asm b/mpn/x86_64/logops_n.asm +index b277f58..b2c640c 100644 +--- a/mpn/x86_64/logops_n.asm ++++ b/mpn/x86_64/logops_n.asm +@@ -134,6 +134,7 @@ L(e10): movq 24(vp,n,8), %r9 + L(ret): FUNC_EXIT() + ret + EPILOGUE() ++CF_PROT + ') + + ifdef(`VARIANT_2',` +@@ -187,6 +188,7 @@ L(e10): movq 24(vp,n,8), %r9 + L(ret): FUNC_EXIT() + ret + EPILOGUE() ++CF_PROT + ') + + ifdef(`VARIANT_3',` +@@ -241,4 +243,5 @@ L(e10): movq 24(vp,n,8), %r9 + L(ret): FUNC_EXIT() + ret + EPILOGUE() ++CF_PROT + ') +diff --git a/mpn/x86_64/lshift.asm b/mpn/x86_64/lshift.asm +index f368944..990b3b8 100644 +--- a/mpn/x86_64/lshift.asm ++++ b/mpn/x86_64/lshift.asm +@@ -245,3 +245,4 @@ L(ast): mov (up), %r10 + FUNC_EXIT() + ret + EPILOGUE() ++CF_PROT +diff --git a/mpn/x86_64/lshiftc.asm b/mpn/x86_64/lshiftc.asm +index c4ba04a..4fd4430 100644 +--- a/mpn/x86_64/lshiftc.asm ++++ b/mpn/x86_64/lshiftc.asm +@@ -180,3 +180,4 @@ L(ast): mov (up), %r10 + FUNC_EXIT() + ret + EPILOGUE() ++CF_PROT +diff --git a/mpn/x86_64/lshsub_n.asm b/mpn/x86_64/lshsub_n.asm +index 4d428c0..d263565 100644 +--- a/mpn/x86_64/lshsub_n.asm ++++ b/mpn/x86_64/lshsub_n.asm +@@ -170,3 +170,4 @@ L(end): + FUNC_EXIT() + ret + EPILOGUE() ++CF_PROT +diff --git a/mpn/x86_64/missing.asm b/mpn/x86_64/missing.asm +index 9b65c89..7914b82 100644 +--- a/mpn/x86_64/missing.asm ++++ b/mpn/x86_64/missing.asm +@@ -128,3 +128,4 @@ PROLOGUE(__gmp_adcx) + ret + EPILOGUE() + PROTECT(__gmp_adcx) ++CF_PROT +diff --git a/mpn/x86_64/mod_1_1.asm b/mpn/x86_64/mod_1_1.asm +index 09b5dd1..287f61d 100644 +--- a/mpn/x86_64/mod_1_1.asm ++++ b/mpn/x86_64/mod_1_1.asm +@@ -234,3 +234,4 @@ L(z): + ret + EPILOGUE() + ASM_END() ++CF_PROT +diff --git a/mpn/x86_64/mod_1_2.asm b/mpn/x86_64/mod_1_2.asm +index 09d856e..1cd6dd1 100644 +--- a/mpn/x86_64/mod_1_2.asm ++++ b/mpn/x86_64/mod_1_2.asm +@@ -237,3 +237,4 @@ ifdef(`SHLD_SLOW',` + FUNC_EXIT() + ret + EPILOGUE() ++CF_PROT +diff --git a/mpn/x86_64/mod_1_4.asm b/mpn/x86_64/mod_1_4.asm +index ae34617..fb685ef 100644 +--- a/mpn/x86_64/mod_1_4.asm ++++ b/mpn/x86_64/mod_1_4.asm +@@ -268,3 +268,4 @@ ifdef(`SHLD_SLOW',` + FUNC_EXIT() + ret + EPILOGUE() ++CF_PROT +diff --git a/mpn/x86_64/mod_34lsub1.asm b/mpn/x86_64/mod_34lsub1.asm +index 62bdcfa..2cf5751 100644 +--- a/mpn/x86_64/mod_34lsub1.asm ++++ b/mpn/x86_64/mod_34lsub1.asm +@@ -135,46 +135,55 @@ L(tab): JMPENT( L(0), L(tab)) + JMPENT( L(8), L(tab)) + TEXT + +-L(6): add (ap), %rax ++L(6): CFPROT_ENDBR ++ add (ap), %rax + adc 8(ap), %rcx + 
adc 16(ap), %rdx + adc $0, %r9 + add $24, ap +-L(3): add (ap), %rax ++L(3): CFPROT_ENDBR ++ add (ap), %rax + adc 8(ap), %rcx + adc 16(ap), %rdx + jmp L(cj1) + +-L(7): add (ap), %rax ++L(7): CFPROT_ENDBR ++ add (ap), %rax + adc 8(ap), %rcx + adc 16(ap), %rdx + adc $0, %r9 + add $24, ap +-L(4): add (ap), %rax ++L(4): CFPROT_ENDBR ++ add (ap), %rax + adc 8(ap), %rcx + adc 16(ap), %rdx + adc $0, %r9 + add $24, ap +-L(1): add (ap), %rax ++L(1): CFPROT_ENDBR ++ add (ap), %rax + adc $0, %rcx + jmp L(cj2) + +-L(8): add (ap), %rax ++L(8): CFPROT_ENDBR ++ add (ap), %rax + adc 8(ap), %rcx + adc 16(ap), %rdx + adc $0, %r9 + add $24, ap +-L(5): add (ap), %rax ++L(5): CFPROT_ENDBR ++ add (ap), %rax + adc 8(ap), %rcx + adc 16(ap), %rdx + adc $0, %r9 + add $24, ap +-L(2): add (ap), %rax ++L(2): CFPROT_ENDBR ++ add (ap), %rax + adc 8(ap), %rcx + + L(cj2): adc $0, %rdx + L(cj1): adc $0, %r9 +-L(0): add %r9, %rax ++L(0): CFPROT_ENDBR ++ add %r9, %rax + adc $0, %rcx + adc $0, %rdx + adc $0, %rax +@@ -203,3 +212,4 @@ L(0): add %r9, %rax + FUNC_EXIT() + ret + EPILOGUE() ++CF_PROT +diff --git a/mpn/x86_64/mode1o.asm b/mpn/x86_64/mode1o.asm +index 2cd2b08..c10a5a6 100644 +--- a/mpn/x86_64/mode1o.asm ++++ b/mpn/x86_64/mode1o.asm +@@ -169,3 +169,4 @@ L(one): + + EPILOGUE(mpn_modexact_1c_odd) + EPILOGUE(mpn_modexact_1_odd) ++CF_PROT +diff --git a/mpn/x86_64/mul_1.asm b/mpn/x86_64/mul_1.asm +index b032afc..6ea9a4a 100644 +--- a/mpn/x86_64/mul_1.asm ++++ b/mpn/x86_64/mul_1.asm +@@ -181,3 +181,4 @@ IFDOS(``pop %rdi '') + IFDOS(``pop %rsi '') + ret + EPILOGUE() ++CF_PROT +diff --git a/mpn/x86_64/mul_2.asm b/mpn/x86_64/mul_2.asm +index f408c52..6b73737 100644 +--- a/mpn/x86_64/mul_2.asm ++++ b/mpn/x86_64/mul_2.asm +@@ -190,3 +190,4 @@ L(m22): mul v1 + FUNC_EXIT() + ret + EPILOGUE() ++CF_PROT +diff --git a/mpn/x86_64/mulx/aorsmul_1.asm b/mpn/x86_64/mulx/aorsmul_1.asm +index 285c073..942cf6a 100644 +--- a/mpn/x86_64/mulx/aorsmul_1.asm ++++ b/mpn/x86_64/mulx/aorsmul_1.asm +@@ -159,3 +159,4 @@ L(wd1): ADCSBB %rbx, 24(rp) + ret + EPILOGUE() + ASM_END() ++CF_PROT +diff --git a/mpn/x86_64/mulx/mul_1.asm b/mpn/x86_64/mulx/mul_1.asm +index 34a044d..4a0e6ef 100644 +--- a/mpn/x86_64/mulx/mul_1.asm ++++ b/mpn/x86_64/mulx/mul_1.asm +@@ -152,3 +152,4 @@ L(wd1): adc %r12, %rbx + ret + EPILOGUE() + ASM_END() ++CF_PROT +diff --git a/mpn/x86_64/nano/dive_1.asm b/mpn/x86_64/nano/dive_1.asm +index e9a0763..d57c444 100644 +--- a/mpn/x86_64/nano/dive_1.asm ++++ b/mpn/x86_64/nano/dive_1.asm +@@ -164,3 +164,4 @@ L(one): shr R8(%rcx), %rax + FUNC_EXIT() + ret + EPILOGUE() ++CF_PROT +diff --git a/mpn/x86_64/pentium4/aors_n.asm b/mpn/x86_64/pentium4/aors_n.asm +index 8e6ee1b..d3daf6f 100644 +--- a/mpn/x86_64/pentium4/aors_n.asm ++++ b/mpn/x86_64/pentium4/aors_n.asm +@@ -194,3 +194,4 @@ L(ret): mov R32(%rbx), R32(%rax) + FUNC_EXIT() + ret + EPILOGUE() ++CF_PROT +diff --git a/mpn/x86_64/pentium4/aorslshC_n.asm b/mpn/x86_64/pentium4/aorslshC_n.asm +index d03c6a3..a4cd689 100644 +--- a/mpn/x86_64/pentium4/aorslshC_n.asm ++++ b/mpn/x86_64/pentium4/aorslshC_n.asm +@@ -201,3 +201,4 @@ L(c3): mov $1, R8(%rax) + jmp L(rc3) + EPILOGUE() + ASM_END() ++CF_PROT +diff --git a/mpn/x86_64/pentium4/lshift.asm b/mpn/x86_64/pentium4/lshift.asm +index d3b5213..baa4820 100644 +--- a/mpn/x86_64/pentium4/lshift.asm ++++ b/mpn/x86_64/pentium4/lshift.asm +@@ -164,3 +164,4 @@ L(ast): movq (up), %mm2 + FUNC_EXIT() + ret + EPILOGUE() ++CF_PROT +diff --git a/mpn/x86_64/pentium4/lshiftc.asm b/mpn/x86_64/pentium4/lshiftc.asm +index fc64676..e7ed07f 100644 +--- 
a/mpn/x86_64/pentium4/lshiftc.asm ++++ b/mpn/x86_64/pentium4/lshiftc.asm +@@ -177,3 +177,4 @@ L(ast): movq (up), %mm2 + FUNC_EXIT() + ret + EPILOGUE() ++CF_PROT +diff --git a/mpn/x86_64/pentium4/mod_34lsub1.asm b/mpn/x86_64/pentium4/mod_34lsub1.asm +index f34b3f0..adb4ae6 100644 +--- a/mpn/x86_64/pentium4/mod_34lsub1.asm ++++ b/mpn/x86_64/pentium4/mod_34lsub1.asm +@@ -165,3 +165,4 @@ L(combine): + FUNC_EXIT() + ret + EPILOGUE() ++CF_PROT +diff --git a/mpn/x86_64/pentium4/rsh1aors_n.asm b/mpn/x86_64/pentium4/rsh1aors_n.asm +index 5528ce4..64a6322 100644 +--- a/mpn/x86_64/pentium4/rsh1aors_n.asm ++++ b/mpn/x86_64/pentium4/rsh1aors_n.asm +@@ -332,3 +332,4 @@ L(cj1): or %r14, %rbx + L(c3): mov $1, R8(%rax) + jmp L(rc3) + EPILOGUE() ++CF_PROT +diff --git a/mpn/x86_64/pentium4/rshift.asm b/mpn/x86_64/pentium4/rshift.asm +index b7c1ee2..758ca64 100644 +--- a/mpn/x86_64/pentium4/rshift.asm ++++ b/mpn/x86_64/pentium4/rshift.asm +@@ -167,3 +167,4 @@ L(ast): movq (up), %mm2 + FUNC_EXIT() + ret + EPILOGUE() ++CF_PROT +diff --git a/mpn/x86_64/popham.asm b/mpn/x86_64/popham.asm +index 9005f81..a52ea0f 100644 +--- a/mpn/x86_64/popham.asm ++++ b/mpn/x86_64/popham.asm +@@ -175,3 +175,4 @@ L(end): + FUNC_EXIT() + ret + EPILOGUE() ++CF_PROT +diff --git a/mpn/x86_64/rsh1aors_n.asm b/mpn/x86_64/rsh1aors_n.asm +index a3e9cc5..d28cc32 100644 +--- a/mpn/x86_64/rsh1aors_n.asm ++++ b/mpn/x86_64/rsh1aors_n.asm +@@ -187,3 +187,4 @@ L(end): mov %rbx, (rp) + FUNC_EXIT() + ret + EPILOGUE() ++CF_PROT +diff --git a/mpn/x86_64/rshift.asm b/mpn/x86_64/rshift.asm +index 3f344f1..2c45172 100644 +--- a/mpn/x86_64/rshift.asm ++++ b/mpn/x86_64/rshift.asm +@@ -174,3 +174,4 @@ L(ast): mov (up), %r10 + FUNC_EXIT() + ret + EPILOGUE() ++CF_PROT +diff --git a/mpn/x86_64/sec_tabselect.asm b/mpn/x86_64/sec_tabselect.asm +index e8aed26..2198b4b 100644 +--- a/mpn/x86_64/sec_tabselect.asm ++++ b/mpn/x86_64/sec_tabselect.asm +@@ -174,3 +174,4 @@ L(b00): pop %r15 + FUNC_EXIT() + ret + EPILOGUE() ++CF_PROT +diff --git a/mpn/x86_64/sqr_diag_addlsh1.asm b/mpn/x86_64/sqr_diag_addlsh1.asm +index 4ad034c..6db16f6 100644 +--- a/mpn/x86_64/sqr_diag_addlsh1.asm ++++ b/mpn/x86_64/sqr_diag_addlsh1.asm +@@ -114,3 +114,4 @@ L(end): add %r10, %r8 + FUNC_EXIT() + ret + EPILOGUE() ++CF_PROT +diff --git a/mpn/x86_64/sublsh1_n.asm b/mpn/x86_64/sublsh1_n.asm +index c6d829f..2f0fe01 100644 +--- a/mpn/x86_64/sublsh1_n.asm ++++ b/mpn/x86_64/sublsh1_n.asm +@@ -158,3 +158,4 @@ L(end): add R32(%rbp), R32(%rax) + FUNC_EXIT() + ret + EPILOGUE() ++CF_PROT +diff --git a/mpn/x86_64/x86_64-defs.m4 b/mpn/x86_64/x86_64-defs.m4 +index a626419..80f549e 100644 +--- a/mpn/x86_64/x86_64-defs.m4 ++++ b/mpn/x86_64/x86_64-defs.m4 +@@ -93,8 +93,38 @@ m4_assert_numargs(1) + ` GLOBL $1 + TYPE($1,`function') + $1: ++ CFPROT_ENDBR + ') + ++dnl Generates the endbr64 instructions ++dnl Using macro, so it can be easily extended to use some arch specific conditional defines ++define(`CFPROT_ENDBR', ++`` ++ endbr64'' ++) ++ ++dnl Append the .gnu-property to the end of files ++dnl This is needed for a -fcf-protection ++dnl Again, using macro for easy arch specific defines ++dnl ++define(`CF_PROT',`` ++ .section .note.gnu.property,"a" ++ .align 8 ++ .long 1f - 0f ++ .long 4f - 1f ++ .long 5 ++0: ++ .string "GNU" ++1: ++ .align 8 ++ .long 0xc0000002 ++ .long 3f - 2f ++2: ++ .long 0x3 ++3: ++ .align 8 ++4: ++'') + + dnl Usage: ASSERT([cond][,instructions]) + dnl diff --git a/SOURCES/gmp-mparam.h b/SOURCES/gmp-mparam.h new file mode 100644 index 0000000..1d4e087 --- /dev/null +++ 
b/SOURCES/gmp-mparam.h @@ -0,0 +1,88 @@ +/* Generic x86 gmp-mparam.h -- Compiler/machine parameter header file. + +Copyright 1991, 1993, 1994, 1995, 1996, 1997, 1999, 2000, 2001, 2002, 2003, +2004, 2005, 2006, 2007, 2008, 2009 Free Software Foundation, Inc. + +This file is part of the GNU MP Library. + +The GNU MP Library is free software; you can redistribute it and/or modify +it under the terms of the GNU Lesser General Public License as published by +the Free Software Foundation; either version 3 of the License, or (at your +option) any later version. + +The GNU MP Library is distributed in the hope that it will be useful, but +WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY +or FITNESS FOR A PARTICULAR PURPOSE. See the GNU Lesser General Public +License for more details. + +You should have received a copy of the GNU Lesser General Public License +along with the GNU MP Library. If not, see http://www.gnu.org/licenses/. */ + +/* + * This gmp-mparam.h is a wrapper include file for the original gmp-mparam.h, + * which has been renamed to gmp-mparam-.h. There are conflicts for the + * original gmp-mparam.h on multilib systems, which result from arch-specific + * configuration options. Please do not use the arch-specific file directly. + * + * Copyright (C) 2006 Red Hat, Inc. + * Thomas Woerner + */ + +#ifdef gmp_mparam_wrapper_h +#error "gmp_mparam_wrapper_h should not be defined!" +#endif +#define gmp_mparam_wrapper_h + +#if defined(__arm__) +#include "gmp-mparam-arm.h" +#elif defined(__i386__) +#include "gmp-mparam-i386.h" +#elif defined(__ia64__) +#include "gmp-mparam-ia64.h" +#elif defined(__powerpc64__) +# if __BYTE_ORDER__ == __ORDER_BIG_ENDIAN__ +#include "gmp-mparam-ppc64.h" +# else +#include "gmp-mparam-ppc64le.h" +# endif +#elif defined(__powerpc__) +# if __BYTE_ORDER__ == __ORDER_BIG_ENDIAN__ +#include "gmp-mparam-ppc.h" +# else +#include "gmp-mparam-ppcle.h" +# endif +#elif defined(__s390x__) +#include "gmp-mparam-s390x.h" +#elif defined(__s390__) +#include "gmp-mparam-s390.h" +#elif defined(__x86_64__) +#include "gmp-mparam-x86_64.h" +#elif defined(__alpha__) +#include "gmp-mparam-alpha.h" +#elif defined(__sh__) +#include "gmp-mparam-sh.h" +#elif defined(__sparc__) && defined (__arch64__) +#include "gmp-mparam-sparc64.h" +#elif defined(__sparc__) +#include "gmp-mparam-sparc.h" +#elif defined(__aarch64__) +#include "gmp-mparam-aarch64.h" +#elif defined(__mips64) && defined(__MIPSEL__) +#include "gmp-mparam-mips64el.h" +#elif defined(__mips64) +#include "gmp-mparam-mips64.h" +#elif defined(__mips) && defined(__MIPSEL__) +#include "gmp-mparam-mipsel.h" +#elif defined(__mips) +#include "gmp-mparam-mips.h" +#elif defined(__riscv) +#if __riscv_xlen == 64 +#include "gmp-mparam-riscv64.h" +#else +#error "No support for riscv32" +#endif +#else +#error "The gmp-devel package is not usable with the architecture." +#endif + +#undef gmp_mparam_wrapper_h diff --git a/SOURCES/gmp.h b/SOURCES/gmp.h new file mode 100644 index 0000000..0a91606 --- /dev/null +++ b/SOURCES/gmp.h @@ -0,0 +1,88 @@ +/* Definitions for GNU multiple precision functions. -*- mode: c -*- + +Copyright 1991, 1993, 1994, 1995, 1996, 1997, 1999, 2000, 2001, 2002, 2003, +2004, 2005, 2006, 2007, 2008, 2009 Free Software Foundation, Inc. + +This file is part of the GNU MP Library. 
+ +The GNU MP Library is free software; you can redistribute it and/or modify +it under the terms of the GNU Lesser General Public License as published by +the Free Software Foundation; either version 3 of the License, or (at your +option) any later version. + +The GNU MP Library is distributed in the hope that it will be useful, but +WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY +or FITNESS FOR A PARTICULAR PURPOSE. See the GNU Lesser General Public +License for more details. + +You should have received a copy of the GNU Lesser General Public License +along with the GNU MP Library. If not, see http://www.gnu.org/licenses/. */ + +/* + * This gmp.h is a wrapper include file for the original gmp.h, which has been + * renamed to gmp-.h. There are conflicts for the original gmp.h on + * multilib systems, which result from arch-specific configuration options. + * Please do not use the arch-specific file directly. + * + * Copyright (C) 2006 Red Hat, Inc. + * Thomas Woerner + */ + +#ifdef gmp_wrapper_h +#error "gmp_wrapper_h should not be defined!" +#endif +#define gmp_wrapper_h + +#if defined(__arm__) +#include "gmp-arm.h" +#elif defined(__i386__) +#include "gmp-i386.h" +#elif defined(__ia64__) +#include "gmp-ia64.h" +#elif defined(__powerpc64__) +# if __BYTE_ORDER__ == __ORDER_BIG_ENDIAN__ +#include "gmp-ppc64.h" +# else +#include "gmp-ppc64le.h" +# endif +#elif defined(__powerpc__) +# if __BYTE_ORDER__ == __ORDER_BIG_ENDIAN__ +#include "gmp-ppc.h" +# else +#include "gmp-ppcle.h" +# endif +#elif defined(__s390x__) +#include "gmp-s390x.h" +#elif defined(__s390__) +#include "gmp-s390.h" +#elif defined(__x86_64__) +#include "gmp-x86_64.h" +#elif defined(__alpha__) +#include "gmp-alpha.h" +#elif defined(__sh__) +#include "gmp-sh.h" +#elif defined(__sparc__) && defined (__arch64__) +#include "gmp-sparc64.h" +#elif defined(__sparc__) +#include "gmp-sparc.h" +#elif defined(__aarch64__) +#include "gmp-aarch64.h" +#elif defined(__mips64) && defined(__MIPSEL__) +#include "gmp-mips64el.h" +#elif defined(__mips64) +#include "gmp-mips64.h" +#elif defined(__mips) && defined(__MIPSEL__) +#include "gmp-mipsel.h" +#elif defined(__mips) +#include "gmp-mips.h" +#elif defined(__riscv) +#if __riscv_xlen == 64 +#include "gmp-riscv64.h" +#else +#error "No support for riscv32" +#endif +#else +#error "The gmp-devel package is not usable with the architecture." +#endif + +#undef gmp_wrapper_h diff --git a/SOURCES/ibm_z13_simd_part1.patch b/SOURCES/ibm_z13_simd_part1.patch new file mode 100644 index 0000000..86bb9c3 --- /dev/null +++ b/SOURCES/ibm_z13_simd_part1.patch @@ -0,0 +1,596 @@ +Co-authored-by: Stefan Liebler +--- + mpn/s390_64/z13/addmul_1.c | 358 +++++++++++++++++++++++++++++++++++ + mpn/s390_64/z13/common-vec.h | 175 +++++++++++++++++ + mpn/s390_64/z13/mul_1.c | 31 +++ + 3 files changed, 564 insertions(+) + create mode 100644 mpn/s390_64/z13/addmul_1.c + create mode 100644 mpn/s390_64/z13/common-vec.h + create mode 100644 mpn/s390_64/z13/mul_1.c + +diff --git a/mpn/s390_64/z13/addmul_1.c b/mpn/s390_64/z13/addmul_1.c +new file mode 100644 +index 000000000..022e5edcc +--- /dev/null ++++ b/mpn/s390_64/z13/addmul_1.c +@@ -0,0 +1,359 @@ ++/* Addmul_1 / mul_1 for IBM z13 and later ++ Contributed by Marius Hillenbrand ++ ++Copyright 2021 Free Software Foundation, Inc. ++ ++This file is part of the GNU MP Library. 
++ ++The GNU MP Library is free software; you can redistribute it and/or modify ++it under the terms of either: ++ ++ * the GNU Lesser General Public License as published by the Free ++ Software Foundation; either version 3 of the License, or (at your ++ option) any later version. ++ ++or ++ ++ * the GNU General Public License as published by the Free Software ++ Foundation; either version 2 of the License, or (at your option) any ++ later version. ++ ++or both in parallel, as here. ++ ++The GNU MP Library is distributed in the hope that it will be useful, but ++WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY ++or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License ++for more details. ++ ++You should have received copies of the GNU General Public License and the ++GNU Lesser General Public License along with the GNU MP Library. If not, ++see https://www.gnu.org/licenses/. */ ++ ++#include "gmp.h" ++#include "gmp-impl.h" ++#include "s390_64/z13/common-vec.h" ++ ++#undef FUNCNAME ++ ++#ifdef DO_INLINE ++# ifdef OPERATION_addmul_1 ++# define ADD ++# define FUNCNAME inline_addmul_1 ++# elif defined(OPERATION_mul_1) ++# define FUNCNAME inline_mul_1 ++# endif ++ ++#else ++# ifdef OPERATION_addmul_1 ++# define ADD ++# define FUNCNAME mpn_addmul_1 ++# elif defined(OPERATION_mul_1) ++# define FUNCNAME mpn_mul_1 ++# endif ++#endif ++ ++#ifdef DO_INLINE ++static inline mp_limb_t ++FUNCNAME (mp_ptr rp, mp_srcptr s1p, mp_size_t n, mp_limb_t s2limb) ++ __attribute__ ((always_inline)); ++ ++static inline ++#endif ++mp_limb_t ++FUNCNAME (mp_ptr rp, mp_srcptr s1p, mp_size_t n, mp_limb_t s2limb) ++{ ++ ASSERT (n >= 1); ++ ASSERT (MPN_SAME_OR_INCR_P(rp, s1p, n)); ++ ++ /* Combine 64x64 multiplication into GPR pairs (MLGR) with 128-bit adds in ++ VRs (using each VR as a single 128-bit accumulator). ++ The inner loop is unrolled to four limbs, with two blocks of four ++ multiplications each. Since the MLGR operation operates on even/odd GPR ++ pairs, pin the products appropriately. */ ++ ++ /* products as GPR pairs */ ++ register mp_limb_t p0_high asm("r0"); ++ register mp_limb_t p0_low asm("r1"); ++ ++ register mp_limb_t p1_high asm("r8"); ++ register mp_limb_t p1_low asm("r9"); ++ ++ register mp_limb_t p2_high asm("r6"); ++ register mp_limb_t p2_low asm("r7"); ++ ++ register mp_limb_t p3_high asm("r10"); ++ register mp_limb_t p3_low asm("r11"); ++ ++ /* carry flag for 128-bit add in VR for first carry chain */ ++ vec_t carry_vec0 = { .dw = vec_splat_u64 (0) }; ++ mp_limb_t carry_limb = 0; ++ ++#ifdef ADD ++ /* 2nd carry flag for 2nd carry chain with addmul */ ++ vec_t carry_vec1 = { .dw = vec_splat_u64 (0) }; ++ vec_t sum0; ++ vec_t rp0_addend, rp1_addend; ++ rp0_addend.dw = vec_splat_u64 (0); ++ rp1_addend.dw = vec_splat_u64 (0); ++#endif ++ vec_t sum1; ++ ++ vec_t carry_prod = { .dw = vec_splat_u64 (0) }; ++ ++ /* The scalar multiplications compete with pointer and index increments for ++ * issue ports. Thus, increment the loop index in the middle of the loop so ++ * that the operations for the next iteration's multiplications can be ++ * loaded in time (looks horrible, yet helps performance) and make sure we ++ * use addressing with base reg + index reg + immediate displacement ++ * (so that only the single index needs incrementing, instead of multiple ++ * pointers). 
*/ ++#undef LOOP_ADVANCE ++#undef IDX_OFFSET ++ ++#define LOOP_ADVANCE 4 * sizeof (mp_limb_t) ++#define IDX_OFFSET (LOOP_ADVANCE) ++ register ssize_t idx = 0 - IDX_OFFSET; ++ ++ /* ++ * branch-on-count implicitly hint to the branch prediction as taken, while ++ * compare-and-branch hints as not taken. currently, using branch-on-count ++ * has a performance advantage, but it is not clear that it is generally the ++ * better choice (e.g., branch-on-count requires decrementing the separate ++ * counter). so, allow switching the loop condition to enable either ++ * category of branch instructions: ++ * - idx is less than an upper bound, for compare-and-branch ++ * - iteration counter greater than zero, for branch-on-count ++ */ ++#define BRCTG ++#ifdef BRCTG ++ ssize_t iterations = (size_t)n / 4; ++#else ++ ssize_t const idx_bound = n * sizeof (mp_limb_t) - IDX_OFFSET; ++#endif ++ ++ /* products will be transferred into VRs before adding up. ++ * see main loop below for comments on accumulation scheme. */ ++ vec_t product0, product1, product2; ++ ++ product0.dw = vec_splat_u64 (0); ++ ++ switch ((size_t)n % 4) ++ { ++ case 0: ++ break; ++ ++ case 1: ++ idx = 1 * sizeof (mp_limb_t) - IDX_OFFSET; ++ ++ p3_low = s1p[0]; ++ s390_umul_ppmm (p3_high, p3_low, s2limb); ++ ++#ifdef ADD ++ rp0_addend.dw[1] = rp[0]; ++ product0.dw[1] = p3_low; ++ ++ sum0.sw = vec_add_u128 (product0.sw, rp0_addend.sw); ++ carry_vec1.dw = vec_permi (sum0.dw, sum0.dw, 0); ++ ++ rp[0] = sum0.dw[1]; ++#else ++ rp[0] = p3_low; ++#endif ++ ++ carry_limb = p3_high; ++ break; ++ ++ case 2: ++ p0_low = s1p[0]; ++ p3_low = s1p[1]; ++ idx = 2 * sizeof (mp_limb_t) - IDX_OFFSET; ++ ++ s390_double_umul_ppmm (p0_high, p0_low, p3_high, p3_low, s2limb); ++ ++ carry_prod.dw[0] = p3_low; ++ ++ product0.dw = vec_load_2di_as_pair (p0_high, p0_low); ++ ++ carry_limb = p3_high; ++ ++#ifdef ADD ++ rp0_addend = vec_load_elements_reversed (rp, 0); ++ sum0.sw = vec_add_u128 (carry_prod.sw, rp0_addend.sw); ++ carry_vec0.sw = vec_addc_u128 (carry_prod.sw, rp0_addend.sw); ++ ++ sum1.sw = vec_add_u128 (sum0.sw, product0.sw); ++ carry_vec1.sw = vec_addc_u128 (sum0.sw, product0.sw); ++#else ++ sum1.sw = vec_add_u128 (carry_prod.sw, product0.sw); ++ carry_vec0.sw = vec_addc_u128 (carry_prod.sw, product0.sw); ++#endif ++ ++ vec_store_elements_reversed (rp, 0, sum1); ++ ++ break; ++ ++ case 3: ++ idx = 3 * sizeof (mp_limb_t) - IDX_OFFSET; ++ ++ p0_low = s1p[0]; ++ s390_umul_ppmm (p0_high, p0_low, s2limb); ++ ++#ifdef ADD ++ rp0_addend.dw[1] = rp[0]; ++ product0.dw[1] = p0_low; ++ ++ sum0.sw = vec_add_u128 (product0.sw, rp0_addend.sw); ++ carry_vec1.dw = vec_permi (sum0.dw, sum0.dw, 0); ++ ++ rp[0] = sum0.dw[1]; ++#else ++ rp[0] = p0_low; ++#endif ++ carry_limb = p0_high; ++ ++ p1_low = s1p[1]; ++ p3_low = s1p[2]; ++ ++ s390_double_umul_ppmm (p1_high, p1_low, p3_high, p3_low, s2limb); ++ ++ carry_prod.dw = vec_load_2di_as_pair (p3_low, carry_limb); ++ product1.dw = vec_load_2di_as_pair (p1_high, p1_low); ++ carry_limb = p3_high; ++ ++#ifdef ADD ++ rp0_addend = vec_load_elements_reversed (rp, 8); ++ sum0.sw = vec_add_u128 (carry_prod.sw, rp0_addend.sw); ++ carry_vec0.sw = vec_addc_u128 (carry_prod.sw, rp0_addend.sw); ++ ++ sum1.sw = vec_adde_u128 (sum0.sw, product1.sw, carry_vec1.sw); ++ carry_vec1.sw = vec_addec_u128 (sum0.sw, product1.sw, carry_vec1.sw); ++#else ++ sum1.sw = vec_adde_u128 (carry_prod.sw, product1.sw, carry_vec0.sw); ++ carry_vec0.sw ++ = vec_addec_u128 (carry_prod.sw, product1.sw, carry_vec0.sw); ++#endif ++ vec_store_elements_reversed (rp, 
8, sum1); ++ break; ++ } ++ ++#ifdef BRCTG ++ for (; iterations > 0; iterations--) ++ { ++#else ++ while (idx < idx_bound) ++ { ++#endif ++ vec_t overlap_addend0; ++ vec_t overlap_addend1; ++ ++ /* The 64x64->128 MLGR multiplies two factors in GPRs and stores the ++ * result in a GPR pair. One of the factors is taken from the GPR pair ++ * and overwritten. ++ * To reuse factors, it turned out cheaper to load limbs multiple times ++ * than copying GPR contents. Enforce that and the use of addressing by ++ * base + index gpr + immediate displacement via inline asm. ++ */ ++ ASM_LOADGPR (p0_low, s1p, idx, 0 + IDX_OFFSET); ++ ASM_LOADGPR (p1_low, s1p, idx, 8 + IDX_OFFSET); ++ ASM_LOADGPR (p2_low, s1p, idx, 16 + IDX_OFFSET); ++ ASM_LOADGPR (p3_low, s1p, idx, 24 + IDX_OFFSET); ++ ++ /* ++ * accumulate products as follows (for addmul): ++ * | rp[i+3] | rp[i+2] | rp[i+1] | rp[i] | ++ * p0_high | p0_low | ++ * p1_high | p1_low | carry-limb in ++ * p2_high | p2_low | ++ * c-limb out <- p3_high | p3_low | ++ * | < 128-bit VR > < 128-bit VR > ++ * ++ * < rp1_addend > < rp0_addend > ++ * carry-chain 0 <- + <- + <- carry_vec0[127] ++ * < product1 > < product0 > ++ * carry-chain 1 <- + <- + <- carry_vec1[127] ++ * < overlap_addend1 > < overlap_addend0 > ++ * ++ * note that a 128-bit add with carry in + out is built from two insns ++ * - vec_adde_u128 (vacq) provides sum ++ * - vec_addec_u128 (vacccq) provides the new carry bit ++ */ ++ ++ s390_double_umul_ppmm (p0_high, p0_low, p1_high, p1_low, s2limb); ++ ++ /* ++ * "barrier" to enforce scheduling loads for all limbs and first round ++ * of MLGR before anything else. ++ */ ++ asm volatile(""); ++ ++ product0.dw = vec_load_2di_as_pair (p0_high, p0_low); ++ ++#ifdef ADD ++ rp0_addend = vec_load_elements_reversed_idx (rp, idx, 0 + IDX_OFFSET); ++ rp1_addend = vec_load_elements_reversed_idx (rp, idx, 16 + IDX_OFFSET); ++#endif ++ /* increment loop index to unblock dependant loads of limbs for the next ++ * iteration (see above at #define LOOP_ADVANCE) */ ++ idx += LOOP_ADVANCE; ++ ++ s390_double_umul_ppmm (p2_high, p2_low, p3_high, p3_low, s2limb); ++ ++ overlap_addend0.dw = vec_load_2di_as_pair (p1_low, carry_limb); ++ asm volatile(""); ++ ++#ifdef ADD ++ sum0.sw = vec_adde_u128 (product0.sw, rp0_addend.sw, carry_vec0.sw); ++ sum1.sw = vec_adde_u128 (sum0.sw, overlap_addend0.sw, carry_vec1.sw); ++ ++ carry_vec0.sw ++ = vec_addec_u128 (product0.sw, rp0_addend.sw, carry_vec0.sw); ++ carry_vec1.sw ++ = vec_addec_u128 (sum0.sw, overlap_addend0.sw, carry_vec1.sw); ++#else ++ sum1.sw = vec_adde_u128 (product0.sw, overlap_addend0.sw, carry_vec0.sw); ++ carry_vec0.sw ++ = vec_addec_u128 (product0.sw, overlap_addend0.sw, carry_vec0.sw); ++#endif ++ ++ asm volatile(""); ++ product2.dw = vec_load_2di_as_pair (p2_high, p2_low); ++ overlap_addend1.dw = vec_load_2di_as_pair (p3_low, p1_high); ++ ++ vec_t sum4; ++ ++#ifdef ADD ++ vec_t sum3; ++ sum3.sw = vec_adde_u128 (product2.sw, rp1_addend.sw, carry_vec0.sw); ++ sum4.sw = vec_adde_u128 (sum3.sw, overlap_addend1.sw, carry_vec1.sw); ++ ++ carry_vec0.sw ++ = vec_addec_u128 (product2.sw, rp1_addend.sw, carry_vec0.sw); ++ carry_vec1.sw ++ = vec_addec_u128 (sum3.sw, overlap_addend1.sw, carry_vec1.sw); ++#else ++ sum4.sw = vec_adde_u128 (product2.sw, overlap_addend1.sw, carry_vec0.sw); ++ carry_vec0.sw ++ = vec_addec_u128 (product2.sw, overlap_addend1.sw, carry_vec0.sw); ++#endif ++ vec_store_elements_reversed_idx (rp, idx, IDX_OFFSET - LOOP_ADVANCE, ++ sum1); ++ vec_store_elements_reversed_idx (rp, idx, 16 + IDX_OFFSET - 
LOOP_ADVANCE, ++ sum4); ++ ++ carry_limb = p3_high; ++ } ++ ++#ifdef ADD ++ carry_vec0.dw += carry_vec1.dw; ++ carry_limb += carry_vec0.dw[1]; ++#else ++ carry_limb += carry_vec0.dw[1]; ++#endif ++ ++ return carry_limb; ++} ++ ++#undef OPERATION_addmul_1 ++#undef OPERATION_mul_1 ++#undef FUNCNAME ++#undef ADD +diff --git a/mpn/s390_64/z13/common-vec.h b/mpn/s390_64/z13/common-vec.h +new file mode 100644 +index 000000000..a59e6eefe +--- /dev/null ++++ b/mpn/s390_64/z13/common-vec.h +@@ -0,0 +1,175 @@ ++/* Common vector helpers and macros for IBM z13 and later ++ ++Copyright 2021 Free Software Foundation, Inc. ++ ++This file is part of the GNU MP Library. ++ ++The GNU MP Library is free software; you can redistribute it and/or modify ++it under the terms of either: ++ ++ * the GNU Lesser General Public License as published by the Free ++ Software Foundation; either version 3 of the License, or (at your ++ option) any later version. ++ ++or ++ ++ * the GNU General Public License as published by the Free Software ++ Foundation; either version 2 of the License, or (at your option) any ++ later version. ++ ++or both in parallel, as here. ++ ++The GNU MP Library is distributed in the hope that it will be useful, but ++WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY ++or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License ++for more details. ++ ++You should have received copies of the GNU General Public License and the ++GNU Lesser General Public License along with the GNU MP Library. If not, ++see https://www.gnu.org/licenses/. */ ++ ++#ifndef __S390_64_Z13_COMMON_VEC_H ++#define __S390_64_Z13_COMMON_VEC_H ++ ++#include ++#include ++ ++/* ++ * Vector intrinsics use vector element types that kind-of make sense for the ++ * specific operation (e.g., vec_permi permutes doublewords). To use VRs ++ * interchangeably with different intrinsics, typedef the two variants and wrap ++ * them in a union. ++ */ ++#define VLEN_BYTES 16 ++typedef unsigned long long v2di __attribute__ ((vector_size (VLEN_BYTES))); ++typedef unsigned char v16qi __attribute__ ((vector_size (VLEN_BYTES))); ++ ++/* ++ * The Z vector intrinsics use vectors with different element types (e.g., ++ * v16qi for the 128-bit adds and v2di for vec_permi). ++ */ ++union vec ++{ ++ v2di dw; ++ v16qi sw; ++}; ++ ++typedef union vec vec_t; ++ ++/* ++ * single-instruction combine of two GPRs into a VR ++ */ ++static inline v2di ++vec_load_2di_as_pair (unsigned long a, unsigned long b) ++{ ++ v2di res; ++ __asm__("vlvgp\t%0,%1,%2" : "=v"(res) : "r"(a), "r"(b)); ++ return res; ++} ++ ++/* ++ * 64x64 mult where caller needs to care about proper register allocation: ++ * multiply xl with m1, treating both as unsigned, and place the result in ++ * xh:xl. ++ * mlgr operates on register pairs, so xh must be an even gpr followed by xl ++ */ ++#define s390_umul_ppmm(xh, xl, m1) \ ++ do \ ++ { \ ++ asm("mlgr\t%0,%3" : "=r"(xh), "=r"(xl) : "%1"(xl), "r"(m1)); \ ++ } \ ++ while (0); ++ ++/* ++ * two 64x64 multiplications, scheduled so that they will dispatch and issue to ++ * different sides: each mlgr is dispatched alone in an instruction group and ++ * subsequent groups will issue on different execution sides. ++ * there is a variant where both products use the same multiplicand and one ++ * that uses two different multiplicands. constraints from s390_umul_ppmm apply ++ * here. 
++ */ ++#define s390_double_umul_ppmm(X0H, X0L, X1H, X1L, MX) \ ++ do \ ++ { \ ++ asm("mlgr\t%[x0h],%[mx]\n\t" \ ++ "mlgr\t%[x1h],%[mx]" \ ++ : [x0h] "=&r"(X0H), [x0l] "=&r"(X0L), [x1h] "=r"(X1H), \ ++ [x1l] "=r"(X1L) \ ++ : "[x0l]"(X0L), "[x1l]"(X1L), [mx] "r"(MX)); \ ++ } \ ++ while (0); ++ ++#define s390_double_umul_ppmm_distinct(X0H, X0L, X1H, X1L, MX0, MX1) \ ++ do \ ++ { \ ++ asm("mlgr\t%[x0h],%[mx0]\n\t" \ ++ "mlgr\t%[x1h],%[mx1]" \ ++ : [x0h] "=&r"(X0H), [x0l] "=&r"(X0L), [x1h] "=r"(X1H), \ ++ [x1l] "=r"(X1L) \ ++ : "[x0l]"(X0L), "[x1l]"(X1L), [mx0] "r"(MX0), [mx1] "r"(MX1)); \ ++ } \ ++ while (0); ++ ++#define ASM_LOADGPR_BASE(DST, BASE, OFFSET) \ ++ asm volatile("lg\t%[r],%[off](%[b])" \ ++ : [r] "=r"(DST) \ ++ : [b] "a"(BASE), [off] "L"(OFFSET) \ ++ : "memory"); ++ ++#define ASM_LOADGPR(DST, BASE, INDEX, OFFSET) \ ++ asm volatile("lg\t%[r],%[off](%[b],%[x])" \ ++ : [r] "=r"(DST) \ ++ : [b] "a"(BASE), [x] "a"(INDEX), [off] "L"(OFFSET) \ ++ : "memory"); ++ ++/* ++ * Load a vector register from memory and swap the two 64-bit doubleword ++ * elements. ++ */ ++static inline vec_t ++vec_load_elements_reversed_idx (mp_limb_t const *base, ssize_t const index, ++ ssize_t const offset) ++{ ++ vec_t res; ++ char *ptr = (char *)base; ++ ++ res.sw = *(v16qi *)(ptr + index + offset); ++ res.dw = vec_permi (res.dw, res.dw, 2); ++ ++ return res; ++} ++ ++static inline vec_t ++vec_load_elements_reversed (mp_limb_t const *base, ssize_t const offset) ++{ ++ return vec_load_elements_reversed_idx (base, 0, offset); ++} ++ ++/* ++ * Store a vector register to memory and swap the two 64-bit doubleword ++ * elements. ++ */ ++static inline void ++vec_store_elements_reversed_idx (mp_limb_t *base, ssize_t const index, ++ ssize_t const offset, vec_t vec) ++{ ++ char *ptr = (char *)base; ++ ++ vec.dw = vec_permi (vec.dw, vec.dw, 2); ++ *(v16qi *)(ptr + index + offset) = vec.sw; ++} ++ ++static inline void ++vec_store_elements_reversed (mp_limb_t *base, ssize_t const offset, vec_t vec) ++{ ++ vec_store_elements_reversed_idx (base, 0, offset, vec); ++} ++ ++#define ASM_VZERO(VEC) \ ++ do \ ++ { \ ++ asm("vzero\t%[vec]" : [vec] "=v"(VEC)); \ ++ } \ ++ while (0) ++ ++#endif +diff --git a/mpn/s390_64/z13/mul_1.c b/mpn/s390_64/z13/mul_1.c +new file mode 100644 +index 000000000..7584dc8c7 +--- /dev/null ++++ b/mpn/s390_64/z13/mul_1.c +@@ -0,0 +1,31 @@ ++/* mul_1 for IBM z13 or later ++ ++Copyright 2021 Free Software Foundation, Inc. ++ ++This file is part of the GNU MP Library. ++ ++The GNU MP Library is free software; you can redistribute it and/or modify ++it under the terms of either: ++ ++ * the GNU Lesser General Public License as published by the Free ++ Software Foundation; either version 3 of the License, or (at your ++ option) any later version. ++ ++or ++ ++ * the GNU General Public License as published by the Free Software ++ Foundation; either version 2 of the License, or (at your option) any ++ later version. ++ ++or both in parallel, as here. ++ ++The GNU MP Library is distributed in the hope that it will be useful, but ++WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY ++or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License ++for more details. ++ ++You should have received copies of the GNU General Public License and the ++GNU Lesser General Public License along with the GNU MP Library. If not, ++see https://www.gnu.org/licenses/. 
*/ ++ ++#include "s390_64/z13/addmul_1.c" +-- +2.40.1 + diff --git a/SOURCES/ibm_z13_simd_part2.patch b/SOURCES/ibm_z13_simd_part2.patch new file mode 100644 index 0000000..347abd6 --- /dev/null +++ b/SOURCES/ibm_z13_simd_part2.patch @@ -0,0 +1,536 @@ +Co-authored-by: Stefan Liebler +--- + mpn/s390_64/z13/aormul_2.c | 476 +++++++++++++++++++++++++++++++++++ + mpn/s390_64/z13/gmp-mparam.h | 37 +++ + 2 files changed, 513 insertions(+) + create mode 100644 mpn/s390_64/z13/aormul_2.c + create mode 100644 mpn/s390_64/z13/gmp-mparam.h + +diff --git a/mpn/s390_64/z13/aormul_2.c b/mpn/s390_64/z13/aormul_2.c +new file mode 100644 +index 000000000..9a69fc38e +--- /dev/null ++++ b/mpn/s390_64/z13/aormul_2.c +@@ -0,0 +1,477 @@ ++/* Addmul_2 / mul_2 for IBM z13 or later ++ ++Copyright 2021 Free Software Foundation, Inc. ++ ++This file is part of the GNU MP Library. ++ ++The GNU MP Library is free software; you can redistribute it and/or modify ++it under the terms of either: ++ ++ * the GNU Lesser General Public License as published by the Free ++ Software Foundation; either version 3 of the License, or (at your ++ option) any later version. ++ ++or ++ ++ * the GNU General Public License as published by the Free Software ++ Foundation; either version 2 of the License, or (at your option) any ++ later version. ++ ++or both in parallel, as here. ++ ++The GNU MP Library is distributed in the hope that it will be useful, but ++WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY ++or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License ++for more details. ++ ++You should have received copies of the GNU General Public License and the ++GNU Lesser General Public License along with the GNU MP Library. If not, ++see https://www.gnu.org/licenses/. */ ++ ++#include "gmp.h" ++#include "gmp-impl.h" ++ ++#include "s390_64/z13/common-vec.h" ++ ++#undef FUNCNAME ++ ++#ifdef DO_INLINE ++# ifdef OPERATION_addmul_2 ++# define ADD ++# define FUNCNAME inline_addmul_2 ++# elif defined(OPERATION_mul_2) ++# define FUNCNAME inline_mul_2 ++# else ++# error Missing define for operation to perform ++# endif ++#else ++# ifdef OPERATION_addmul_2 ++# define ADD ++# define FUNCNAME mpn_addmul_2 ++# elif defined(OPERATION_mul_2) ++# define FUNCNAME mpn_mul_2 ++# else ++# error Missing define for operation to perform ++# endif ++#endif ++ ++#ifdef DO_INLINE ++static inline mp_limb_t ++FUNCNAME (mp_limb_t *rp, const mp_limb_t *up, mp_size_t n, const mp_limb_t *vp) ++ __attribute__ ((always_inline)); ++ ++static inline ++#endif ++mp_limb_t ++FUNCNAME (mp_limb_t *rp, const mp_limb_t *up, mp_size_t n, ++ const mp_limb_t *vp) ++{ ++ ++ /* Combine 64x64 multiplication into GPR pairs (MLGR) with 128-bit adds in ++ VRs (using each VR as a single 128-bit accumulator). ++ The inner loop is unrolled to four limbs, with two blocks of four ++ multiplications each. Since the MLGR operation operates on even/odd GPR ++ pairs, pin the products appropriately. 
*/ ++ ++ register mp_limb_t p0_high asm("r0"); ++ register mp_limb_t p0_low asm("r1"); ++ ++ register mp_limb_t p1_high asm("r8"); ++ register mp_limb_t p1_low asm("r9"); ++ ++ register mp_limb_t p2_high asm("r6"); ++ register mp_limb_t p2_low asm("r7"); ++ ++ register mp_limb_t p3_high asm("r10"); ++ register mp_limb_t p3_low asm("r11"); ++ ++ vec_t carry_prod = { .dw = vec_splat_u64 (0) }; ++ vec_t zero = { .dw = vec_splat_u64 (0) }; ++ ++ /* two carry-bits for the 128-bit VR adds - stored in VRs */ ++#ifdef ADD ++ vec_t carry_vec0 = { .dw = vec_splat_u64 (0) }; ++#endif ++ vec_t carry_vec1 = { .dw = vec_splat_u64 (0) }; ++ ++ vec_t tmp; ++ ++ vec_t sum0, sum1; ++ ++ /* products transferred into VRs for accumulating there */ ++ vec_t pv0, pv3; ++ vec_t pv1_low, pv1_high, pv2_low, pv2_high; ++ vec_t low, middle, high; ++#ifdef ADD ++ vec_t rp0, rp1; ++#endif ++ ++ register mp_limb_t v0 asm("r12"); ++ register mp_limb_t v1 asm("r5"); ++ v0 = vp[0]; ++ v1 = vp[1]; ++ ++ /* The scalar multiplications compete with pointer and index increments for ++ * issue ports. Thus, increment the loop index in the middle of the loop so ++ * that the operations for the next iteration's multiplications can be ++ * loaded in time (looks horrible, yet helps performance) and make sure we ++ * use addressing with base reg + index reg + immediate displacement ++ * (so that only the single index needs incrementing, instead of multiple ++ * pointers). */ ++#undef LOOP_ADVANCE ++#define LOOP_ADVANCE (4 * sizeof (mp_limb_t)) ++#define IDX_OFFSET (LOOP_ADVANCE) ++ ++ register ssize_t idx = 0 - IDX_OFFSET; ++#ifdef BRCTG ++ ssize_t iterations = (size_t)n / 4; ++#else ++ ssize_t const idx_bound = n * sizeof (mp_limb_t) - IDX_OFFSET; ++#endif ++ ++ /* ++ * To minimize latency in the carry chain, accumulate in VRs with 128-bit ++ * adds with carry in and out. As a downside, these require two insns for ++ * each add - one to calculate the sum, one to deliver the carry out. ++ * To reduce the overall number of insns to execute, combine adding up ++ * product limbs such that there cannot be a carry out and one (for mul) or ++ * two (for addmul) adds with carry chains. ++ * ++ * Since (2^64-1) * (2^64-1) = (2^128-1) - 2 * (2^64-1), we can add two ++ * limbs into each 128-bit product without causing carry out. ++ * ++ * For each block of 2 limbs * 2 limbs ++ * ++ * | u[i] * v[0] (p2) | ++ * | u[i] * v[1] (p0) | ++ * | u[i+1] * v[0](p1) | ++ * | u[i+1] * v[1](p3) | ++ * < 128 bits > < 128 bits > ++ * ++ * we can begin accumulating with "simple" carry-oblivious 128-bit adds: ++ * - p0 + low limb of p1 ++ * + high limb of p2 ++ * and combine resulting low limb with p2's low limb ++ * - p3 + high limb of p1 ++ * + high limb of sum above ++ * ... which will will result in two 128-bit limbs to be fed into the carry ++ * chain(s). ++ * Overall, that scheme saves instructions and improves performance, despite ++ * slightly increasing latency between multiplications and carry chain (yet ++ * not in the carry chain). 
++ */ ++ ++#define LOAD_LOW_LIMB(VEC, LIMB) \ ++ do \ ++ { \ ++ asm("vzero\t%[vec]\n\t" \ ++ "vlvgg\t%[vec],%[limb],1" \ ++ : [vec] "=v"(VEC) \ ++ : [limb] "r"(LIMB)); \ ++ } \ ++ while (0) ++ ++ /* for the 128-bit adds in the carry chain, to calculate a + b + carry-in we ++ * need paired vec_adde_u128 (delivers sum) and vec_addec_u128 (delivers new ++ * carry) */ ++#define ADD_UP2_CARRY_INOUT(SUMIDX, CARRYIDX, ADDEND1, ADDEND2) \ ++ do \ ++ { \ ++ sum##SUMIDX.sw \ ++ = vec_adde_u128 (ADDEND1.sw, ADDEND2.sw, carry_vec##CARRYIDX.sw); \ ++ carry_vec##CARRYIDX.sw \ ++ = vec_addec_u128 (ADDEND1.sw, ADDEND2.sw, carry_vec##CARRYIDX.sw); \ ++ } \ ++ while (0) ++ ++#define ADD_UP_CARRY_INOUT(SUMIDX, ADDEND1, ADDEND2) \ ++ ADD_UP2_CARRY_INOUT (SUMIDX, SUMIDX, ADDEND1, ADDEND2) ++ ++ /* variant without carry-in for prologue */ ++#define ADD_UP2_CARRY_OUT(SUMIDX, CARRYIDX, ADDEND1, ADDEND2) \ ++ do \ ++ { \ ++ sum##SUMIDX.sw = vec_add_u128 (ADDEND1.sw, ADDEND2.sw); \ ++ carry_vec##CARRYIDX.sw = vec_addc_u128 (ADDEND1.sw, ADDEND2.sw); \ ++ } \ ++ while (0) ++ ++#define ADD_UP_CARRY_OUT(SUMIDX, ADDEND1, ADDEND2) \ ++ ADD_UP2_CARRY_OUT (SUMIDX, SUMIDX, ADDEND1, ADDEND2) ++ ++ /* prologue for 4x-unrolled main loop */ ++ switch ((size_t)n % 4) ++ { ++ case 1: ++ ASM_LOADGPR_BASE (p0_low, up, 0); ++ ASM_LOADGPR_BASE (p1_low, up, 0); ++ s390_double_umul_ppmm_distinct (p0_high, p0_low, p1_high, p1_low, v0, v1); ++ carry_prod.dw = vec_load_2di_as_pair (p1_high, p1_low); ++ ++/* gcc tries to be too clever and vlr from a reg that is already zero. vzero is ++ * cheaper. */ ++# define NEW_CARRY(VEC, LIMB) \ ++ do \ ++ { \ ++ asm("vzero\t%[vec]\n\t" \ ++ "vlvgg\t%[vec],%[limb],1" \ ++ : [vec] "=v"(VEC) \ ++ : [limb] "r"(LIMB)); \ ++ } \ ++ while (0) ++ ++ NEW_CARRY (tmp, p0_high); ++ ++ carry_prod.sw = vec_add_u128 (carry_prod.sw, tmp.sw); ++#ifdef ADD ++ carry_vec1.dw[1] = __builtin_add_overflow (rp[0], p0_low, rp); ++#else ++ rp[0] = p0_low; ++#endif ++ idx += sizeof (mp_limb_t); ++ break; ++ ++ case 2: ++ ASM_LOADGPR_BASE (p0_low, up, 0); ++ ASM_LOADGPR_BASE (p1_low, up, 8); ++ ASM_LOADGPR_BASE (p2_low, up, 0); ++ ASM_LOADGPR_BASE (p3_low, up, 8); ++ ++ asm("" ++ : "=r"(p0_low), "=r"(p2_low) ++ : "r"(p3_low), "0"(p0_low), "r"(p1_low), "1"(p2_low)); ++ s390_double_umul_ppmm_distinct (p0_high, p0_low, p1_high, p1_low, v1, v0); ++ s390_double_umul_ppmm_distinct (p2_high, p2_low, p3_high, p3_low, v0, v1); ++ ++ pv0.dw = vec_load_2di_as_pair (p0_high, p0_low); ++ LOAD_LOW_LIMB (pv1_low, p1_low); ++ LOAD_LOW_LIMB (pv1_high, p1_high); ++ pv0.sw = vec_add_u128 (pv0.sw, pv1_low.sw); ++ LOAD_LOW_LIMB (pv2_high, p2_high); ++ pv3.dw = vec_load_2di_as_pair (p3_high, p3_low); ++ LOAD_LOW_LIMB (pv2_low, p2_low); ++ pv3.sw = vec_add_u128 (pv3.sw, pv1_high.sw); ++ middle.sw = vec_add_u128 (pv0.sw, pv2_high.sw); ++ low.dw = vec_permi (middle.dw, pv2_low.dw, 3); ++ middle.dw = vec_permi (zero.dw, middle.dw, 0); ++ high.sw = vec_add_u128 (middle.sw, pv3.sw); ++#ifdef ADD ++ rp0 = vec_load_elements_reversed (rp, 0); ++ ADD_UP_CARRY_OUT (0, rp0, carry_prod); ++#else ++ sum0 = carry_prod; ++#endif ++ ADD_UP_CARRY_OUT (1, sum0, low); ++ vec_store_elements_reversed (rp, 0, sum1); ++ carry_prod = high; ++ ++ idx += 2 * sizeof (mp_limb_t); ++ break; ++ ++ case 3: ++ ASM_LOADGPR_BASE (p0_low, up, 0); ++ ASM_LOADGPR_BASE (p1_low, up, 0); ++ s390_double_umul_ppmm_distinct (p0_high, p0_low, p1_high, p1_low, v0, v1); ++ carry_prod.dw = vec_load_2di_as_pair (p1_high, p1_low); ++ NEW_CARRY (tmp, p0_high); ++ carry_prod.sw = vec_add_u128 
(carry_prod.sw, tmp.sw); ++ ++#ifdef ADD ++ carry_vec1.dw[1] = __builtin_add_overflow (rp[0], p0_low, rp); ++#else ++ rp[0] = p0_low; ++#endif ++ ++ ASM_LOADGPR_BASE (p0_low, up, 8); ++ ASM_LOADGPR_BASE (p1_low, up, 16); ++ ASM_LOADGPR_BASE (p2_low, up, 8); ++ ASM_LOADGPR_BASE (p3_low, up, 16); ++ ++ asm("" ++ : "=r"(p0_low), "=r"(p2_low) ++ : "r"(p3_low), "0"(p0_low), "r"(p1_low), "1"(p2_low)); ++ s390_double_umul_ppmm_distinct (p0_high, p0_low, p1_high, p1_low, v1, v0); ++ s390_double_umul_ppmm_distinct (p2_high, p2_low, p3_high, p3_low, v0, v1); ++ ++ pv0.dw = vec_load_2di_as_pair (p0_high, p0_low); ++ ++ LOAD_LOW_LIMB (pv1_low, p1_low); ++ LOAD_LOW_LIMB (pv1_high, p1_high); ++ ++ pv0.sw = vec_add_u128 (pv0.sw, pv1_low.sw); ++ LOAD_LOW_LIMB (pv2_high, p2_high); ++ pv3.dw = vec_load_2di_as_pair (p3_high, p3_low); ++ ++ LOAD_LOW_LIMB (pv2_low, p2_low); ++ ++ pv3.sw = vec_add_u128 (pv3.sw, pv1_high.sw); ++ middle.sw = vec_add_u128 (pv0.sw, pv2_high.sw); ++ ++ low.dw = vec_permi (middle.dw, pv2_low.dw, 3); ++ middle.dw = vec_permi (zero.dw, middle.dw, 0); ++ high.sw = vec_add_u128 (middle.sw, pv3.sw); ++ ++#ifdef ADD ++ vec_t rp0 = vec_load_elements_reversed (rp, 8); ++ ADD_UP_CARRY_OUT (0, rp0, carry_prod); ++#else ++ sum0 = carry_prod; ++#endif ++ ADD_UP_CARRY_INOUT (1, sum0, low); ++ ++ vec_store_elements_reversed (rp, 8, sum1); ++ ++ carry_prod = high; ++ ++ idx += 3 * sizeof (mp_limb_t); ++ break; ++ } ++ ++ /* ++ * branch-on-count implicitly hint to the branch prediction as taken, while ++ * compare-and-branch hints as not taken. currently, using branch-on-count ++ * has a performance advantage, but it is not clear that it is generally ++ * the better choice (e.g., branch-on-count requires decrementing the ++ * separate counter). so, allow switching the loop condition to enable ++ * either category of branch instructions: ++ * - idx is less than an upper bound, for compare-and-branch ++ * - iteration counter greater than zero, for branch-on-count ++ */ ++#ifdef BRCTG ++ for (; iterations > 0; iterations--) ++ { ++#else ++ while (idx < idx_bound) ++ { ++#endif ++ /* The 64x64->128 MLGR multiplies two factors in GPRs and stores the ++ * result in a GPR pair. One of the factors is taken from the GPR pair ++ * and overwritten. ++ * To reuse factors, it turned out cheaper to load limbs multiple times ++ * than copying GPR contents. Enforce that and the use of addressing by ++ * base + index gpr + immediate displacement via inline asm. ++ */ ++ ASM_LOADGPR (p0_low, up, idx, 0 + IDX_OFFSET); ++ ASM_LOADGPR (p1_low, up, idx, 8 + IDX_OFFSET); ++ ASM_LOADGPR (p2_low, up, idx, 0 + IDX_OFFSET); ++ ASM_LOADGPR (p3_low, up, idx, 8 + IDX_OFFSET); ++ ++ s390_double_umul_ppmm_distinct (p0_high, p0_low, p1_high, p1_low, v1, v0); ++ ++ pv0.dw = vec_load_2di_as_pair (p0_high, p0_low); ++ ++ LOAD_LOW_LIMB (pv1_low, p1_low); ++ LOAD_LOW_LIMB (pv1_high, p1_high); ++ ++ s390_double_umul_ppmm_distinct (p2_high, p2_low, p3_high, p3_low, v0, v1); ++ ++ pv0.sw = vec_add_u128 (pv0.sw, pv1_low.sw); ++ LOAD_LOW_LIMB (pv2_high, p2_high); ++ pv3.dw = vec_load_2di_as_pair (p3_high, p3_low); ++ ++ LOAD_LOW_LIMB (pv2_low, p2_low); ++ ++ ASM_LOADGPR (p0_low, up, idx, 16 + IDX_OFFSET); ++ ASM_LOADGPR (p1_low, up, idx, 24 + IDX_OFFSET); ++ ASM_LOADGPR (p2_low, up, idx, 16 + IDX_OFFSET); ++ ASM_LOADGPR (p3_low, up, idx, 24 + IDX_OFFSET); ++ ++ idx += LOOP_ADVANCE; ++ ++ /* ++ * "barrier" to enforce scheduling the index increment before the second ++ * block of multiplications. not required for clang. 
++ */ ++#ifndef __clang__ ++ asm("" ++ : "=r"(idx), "=r"(p0_high), "=r"(p2_high) ++ : "0"(idx), "1"(p0_high), "2"(p2_high)); ++#endif ++ ++ s390_double_umul_ppmm_distinct (p0_high, p0_low, p1_high, p1_low, v1, v0); ++ s390_double_umul_ppmm_distinct (p2_high, p2_low, p3_high, p3_low, v0, v1); ++ ++ /* ++ * "barrier" to enforce scheduling all MLGRs first, before any adding ++ * up. note that clang produces better code without. ++ */ ++#ifndef __clang__ ++ asm("" ++ : "=v"(pv0.sw), "=v"(pv3.sw) ++ : "1"(pv3.sw), "0"(pv0.sw), "r"(p0_high), "r"(p2_high)); ++#endif ++ ++ pv3.sw = vec_add_u128 (pv3.sw, pv1_high.sw); ++ middle.sw = vec_add_u128 (pv0.sw, pv2_high.sw); ++ ++ low.dw = vec_permi (middle.dw, pv2_low.dw, ++ 3); /* least-significant doubleword from both vectors */ ++ middle.dw = vec_permi (zero.dw, middle.dw, 0); ++ high.sw = vec_add_u128 (middle.sw, pv3.sw); ++ ++#ifdef ADD ++ rp0 = vec_load_elements_reversed_idx (rp, idx, ++ 0 + IDX_OFFSET - LOOP_ADVANCE); ++ ADD_UP_CARRY_INOUT (0, rp0, carry_prod); ++#else ++ sum0 = carry_prod; ++#endif ++ ADD_UP_CARRY_INOUT (1, sum0, low); ++ ++ vec_store_elements_reversed_idx (rp, idx, 0 + IDX_OFFSET - LOOP_ADVANCE, ++ sum1); ++ ++ carry_prod = high; ++ ++ vec_t pv0_2, pv3_2; ++ vec_t pv1_low_2, pv1_high_2, pv2_low_2, pv2_high_2; ++ vec_t low_2, middle_2, high_2; ++ vec_t sum2, sum3; ++ ++ pv0_2.dw = vec_load_2di_as_pair (p0_high, p0_low); ++ LOAD_LOW_LIMB (pv1_low_2, p1_low); ++ LOAD_LOW_LIMB (pv1_high_2, p1_high); ++ ++ pv0_2.sw = vec_add_u128 (pv0_2.sw, pv1_low_2.sw); ++ LOAD_LOW_LIMB (pv2_high_2, p2_high); ++ pv3_2.dw = vec_load_2di_as_pair (p3_high, p3_low); ++ pv3_2.sw = vec_add_u128 (pv3_2.sw, pv1_high_2.sw); ++ middle_2.sw = vec_add_u128 (pv0_2.sw, pv2_high_2.sw); ++ ++ LOAD_LOW_LIMB (pv2_low_2, p2_low); ++ low_2.dw ++ = vec_permi (middle_2.dw, pv2_low_2.dw, ++ 3); /* least-significant doubleword from both vectors */ ++ middle_2.dw = vec_permi (zero.dw, middle_2.dw, 0); ++ high_2.sw = vec_add_u128 (middle_2.sw, pv3_2.sw); ++ ++ /* ++ * another "barrier" to influence scheduling. (also helps in clang) ++ */ ++ asm("" : : "v"(pv0_2.sw), "r"(p2_high), "r"(p3_high), "v"(pv3_2.sw)); ++ ++#ifdef ADD ++ rp1 = vec_load_elements_reversed_idx (rp, idx, ++ 16 + IDX_OFFSET - LOOP_ADVANCE); ++ ADD_UP2_CARRY_INOUT (2, 0, rp1, carry_prod); ++#else ++ sum2 = carry_prod; ++#endif ++ ADD_UP2_CARRY_INOUT (3, 1, sum2, low_2); ++ ++ vec_store_elements_reversed_idx (rp, idx, 16 + IDX_OFFSET - LOOP_ADVANCE, ++ sum3); ++ ++ carry_prod = high_2; ++ } ++ ++#ifdef ADD ++ sum0.sw = vec_adde_u128 (carry_prod.sw, carry_vec0.sw, carry_vec1.sw); ++#else ++ sum0.sw = vec_add_u128 (carry_prod.sw, carry_vec1.sw); ++#endif ++ ++ *(mp_ptr) (((char *)rp) + idx + 0 + IDX_OFFSET) = (mp_limb_t)sum0.dw[1]; ++ ++ return (mp_limb_t)sum0.dw[0]; ++} +diff --git a/mpn/s390_64/z13/gmp-mparam.h b/mpn/s390_64/z13/gmp-mparam.h +new file mode 100644 +index 000000000..a17503fd0 +--- /dev/null ++++ b/mpn/s390_64/z13/gmp-mparam.h +@@ -0,0 +1,37 @@ ++/* S/390-64 for IBM z13 gmp-mparam.h -- Compiler/machine parameter header file. ++ ++Copyright 1991, 1993, 1994, 2000-2011 Free Software Foundation, Inc. ++ ++This file is part of the GNU MP Library. ++ ++The GNU MP Library is free software; you can redistribute it and/or modify ++it under the terms of either: ++ ++ * the GNU Lesser General Public License as published by the Free ++ Software Foundation; either version 3 of the License, or (at your ++ option) any later version. 
++ ++or ++ ++ * the GNU General Public License as published by the Free Software ++ Foundation; either version 2 of the License, or (at your option) any ++ later version. ++ ++or both in parallel, as here. ++ ++The GNU MP Library is distributed in the hope that it will be useful, but ++WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY ++or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License ++for more details. ++ ++You should have received copies of the GNU General Public License and the ++GNU Lesser General Public License along with the GNU MP Library. If not, ++see https://www.gnu.org/licenses/. */ ++ ++#define GMP_LIMB_BITS 64 ++#define GMP_LIMB_BYTES 8 ++ ++#define HAVE_NATIVE_mpn_addmul_2 1 ++#define HAVE_NATIVE_mpn_mul_2 1 ++ ++#include "mpn/s390_64/gmp-mparam.h" +-- +2.40.1 diff --git a/SOURCES/ibm_z13_simd_part3.patch b/SOURCES/ibm_z13_simd_part3.patch new file mode 100644 index 0000000..19069ca --- /dev/null +++ b/SOURCES/ibm_z13_simd_part3.patch @@ -0,0 +1,139 @@ +Co-authored-by: Stefan Liebler +--- + mpn/s390_64/z13/mul_basecase.c | 124 +++++++++++++++++++++++++++++++++ + 1 file changed, 124 insertions(+) + create mode 100644 mpn/s390_64/z13/mul_basecase.c + +diff --git a/mpn/s390_64/z13/mul_basecase.c b/mpn/s390_64/z13/mul_basecase.c +new file mode 100644 +index 000000000..f1b7160b3 +--- /dev/null ++++ b/mpn/s390_64/z13/mul_basecase.c +@@ -0,0 +1,125 @@ ++/* mpn_mul_basecase for IBM z13 and later -- Internal routine to multiply two ++ natural numbers of length m and n. ++ ++ THIS IS AN INTERNAL FUNCTION WITH A MUTABLE INTERFACE. IT IS ONLY ++ SAFE TO REACH THIS FUNCTION THROUGH DOCUMENTED INTERFACES. ++ ++Copyright 2021 Free Software Foundation, Inc. ++ ++This file is part of the GNU MP Library. ++ ++The GNU MP Library is free software; you can redistribute it and/or modify ++it under the terms of either: ++ ++ * the GNU Lesser General Public License as published by the Free ++ Software Foundation; either version 3 of the License, or (at your ++ option) any later version. ++ ++or ++ ++ * the GNU General Public License as published by the Free Software ++ Foundation; either version 2 of the License, or (at your option) any ++ later version. ++ ++or both in parallel, as here. ++ ++The GNU MP Library is distributed in the hope that it will be useful, but ++WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY ++or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License ++for more details. ++ ++You should have received copies of the GNU General Public License and the ++GNU Lesser General Public License along with the GNU MP Library. If not, ++see https://www.gnu.org/licenses/. */ ++ ++#include ++ ++#include "gmp.h" ++#include "gmp-impl.h" ++ ++/* Note: we explicitly inline all mul and addmul routines here to reduce the ++ * number of branches in prologues of unrolled functions. That comes at the ++ cost of duplicating common loop bodies in object code. */ ++#define DO_INLINE ++ ++/* ++ * tweak loop conditions in addmul subroutines to enable use of ++ * branch-relative-on-count (BRCTG) instructions, which currently results in ++ * better performance. 
++ */ ++#define BRCTG ++ ++#include "s390_64/z13/common-vec.h" ++ ++#define OPERATION_mul_1 ++#include "s390_64/z13/addmul_1.c" ++#undef OPERATION_mul_1 ++ ++#define OPERATION_addmul_1 ++#include "s390_64/z13/addmul_1.c" ++#undef OPERATION_addmul_1 ++ ++#define OPERATION_mul_2 ++#include "s390_64/z13/aormul_2.c" ++#undef OPERATION_mul_2 ++ ++#define OPERATION_addmul_2 ++#include "s390_64/z13/aormul_2.c" ++#undef OPERATION_addmul_2 ++ ++void ++mpn_mul_basecase (mp_ptr rp, mp_srcptr up, mp_size_t un, mp_srcptr vp, ++ mp_size_t vn) ++{ ++ ASSERT (un >= vn); ++ ASSERT (vn >= 1); ++ ASSERT (!MPN_OVERLAP_P (rp, un + vn, up, un)); ++ ASSERT (!MPN_OVERLAP_P (rp, un + vn, vp, vn)); ++ ++ /* The implementations of (add)mul_1/2 are 4x-unrolled. Pull out the branch ++ * for un%4 and inline specific variants. */ ++ ++#define BRANCH_FOR_MOD(N) \ ++ do \ ++ { \ ++ if (vn >= 2) \ ++ { \ ++ rp[un + 1] = inline_mul_2 (rp, up, un, vp); \ ++ rp += 2, vp += 2, vn -= 2; \ ++ } \ ++ else \ ++ { \ ++ rp[un] = inline_mul_1 (rp, up, un, vp[0]); \ ++ return; \ ++ } \ ++ \ ++ while (vn >= 2) \ ++ { \ ++ rp[un + 2 - 1] = inline_addmul_2 (rp, up, un, vp); \ ++ rp += 2, vp += 2, vn -= 2; \ ++ } \ ++ \ ++ while (vn >= 1) \ ++ { \ ++ rp[un] = inline_addmul_1 (rp, up, un, vp[0]); \ ++ rp += 1, vp += 1, vn -= 1; \ ++ } \ ++ } \ ++ while (0); ++ ++ switch (((size_t)un) % 4) ++ { ++ case 0: ++ BRANCH_FOR_MOD (0); ++ break; ++ case 1: ++ BRANCH_FOR_MOD (1); ++ break; ++ case 2: ++ BRANCH_FOR_MOD (2); ++ break; ++ case 3: ++ BRANCH_FOR_MOD (3); ++ break; ++ } ++} +-- +2.40.1 diff --git a/SOURCES/ibm_z13_simd_part4.patch b/SOURCES/ibm_z13_simd_part4.patch new file mode 100644 index 0000000..c87c17c --- /dev/null +++ b/SOURCES/ibm_z13_simd_part4.patch @@ -0,0 +1,151 @@ +From: Marius Hillenbrand + +--- + mpn/s390_64/z13/gmp-mparam.h | 129 ++++++++++++++++++++++++++++++++++- + 1 file changed, 127 insertions(+), 2 deletions(-) + +diff --git a/mpn/s390_64/z13/gmp-mparam.h b/mpn/s390_64/z13/gmp-mparam.h +index a17503fd0..50e7f39d1 100644 +--- a/mpn/s390_64/z13/gmp-mparam.h ++++ b/mpn/s390_64/z13/gmp-mparam.h +@@ -1,6 +1,6 @@ + /* S/390-64 for IBM z13 gmp-mparam.h -- Compiler/machine parameter header file. + +-Copyright 1991, 1993, 1994, 2000-2011 Free Software Foundation, Inc. ++Copyright 2021 Free Software Foundation, Inc. + + This file is part of the GNU MP Library. + +@@ -34,4 +34,129 @@ see https://www.gnu.org/licenses/. 
*/ + #define HAVE_NATIVE_mpn_addmul_2 1 + #define HAVE_NATIVE_mpn_mul_2 1 + +-#include "mpn/s390_64/gmp-mparam.h" ++/* Generated by tuneup.c, 2021-07-30, gcc 10.2 */ ++ ++#define DIVREM_1_NORM_THRESHOLD MP_SIZE_T_MAX /* never */ ++#define DIVREM_1_UNNORM_THRESHOLD MP_SIZE_T_MAX /* never */ ++#define MOD_1_1P_METHOD 2 ++#define MOD_1_NORM_THRESHOLD MP_SIZE_T_MAX /* never */ ++#define MOD_1_UNNORM_THRESHOLD MP_SIZE_T_MAX /* never */ ++#define MOD_1N_TO_MOD_1_1_THRESHOLD 17 ++#define MOD_1U_TO_MOD_1_1_THRESHOLD 15 ++#define MOD_1_1_TO_MOD_1_2_THRESHOLD 0 /* never mpn_mod_1_1p */ ++#define MOD_1_2_TO_MOD_1_4_THRESHOLD 0 /* never mpn_mod_1s_2p */ ++#define PREINV_MOD_1_TO_MOD_1_THRESHOLD 5 ++#define USE_PREINV_DIVREM_1 1 ++#define DIV_QR_1N_PI1_METHOD 3 ++#define DIV_QR_1_NORM_THRESHOLD MP_SIZE_T_MAX /* never */ ++#define DIV_QR_1_UNNORM_THRESHOLD MP_SIZE_T_MAX /* never */ ++#define DIV_QR_2_PI2_THRESHOLD 996 ++#define DIVEXACT_1_THRESHOLD 4 ++#define BMOD_1_TO_MOD_1_THRESHOLD 0 /* always */ ++ ++#define DIV_1_VS_MUL_1_PERCENT 404 ++ ++#define MUL_TOOM22_THRESHOLD 23 ++#define MUL_TOOM33_THRESHOLD 94 ++#define MUL_TOOM44_THRESHOLD 166 ++#define MUL_TOOM6H_THRESHOLD 286 ++#define MUL_TOOM8H_THRESHOLD 626 ++ ++#define MUL_TOOM32_TO_TOOM43_THRESHOLD 113 ++#define MUL_TOOM32_TO_TOOM53_THRESHOLD 138 ++#define MUL_TOOM42_TO_TOOM53_THRESHOLD 143 ++#define MUL_TOOM42_TO_TOOM63_THRESHOLD 145 ++#define MUL_TOOM43_TO_TOOM54_THRESHOLD 130 ++ ++#define SQR_BASECASE_THRESHOLD 0 /* always (native) */ ++#define SQR_TOOM2_THRESHOLD 12 ++#define SQR_TOOM3_THRESHOLD 84 ++#define SQR_TOOM4_THRESHOLD 234 ++#define SQR_TOOM6_THRESHOLD 318 ++#define SQR_TOOM8_THRESHOLD 478 ++ ++#define MULMID_TOOM42_THRESHOLD 42 ++ ++#define MULMOD_BNM1_THRESHOLD 13 ++#define SQRMOD_BNM1_THRESHOLD 7 ++ ++#define MUL_FFT_MODF_THRESHOLD 332 /* k = 5 */ ++#define MUL_FFT_TABLE3 \ ++ { { 332, 5}, { 19, 6}, { 10, 5}, { 21, 6}, \ ++ { 21, 7}, { 21, 8}, { 11, 7}, { 24, 8}, \ ++ { 13, 7}, { 27, 8}, { 15, 7}, { 31, 8}, \ ++ { 17, 7}, { 35, 8}, { 19, 7}, { 39, 8}, \ ++ { 21, 9}, { 11, 8}, { 27, 9}, { 15, 8}, \ ++ { 35, 9}, { 19, 8}, { 41, 9}, { 23, 8}, \ ++ { 47, 9}, { 27,10}, { 15, 9}, { 39,10}, \ ++ { 23, 9}, { 47,11}, { 15,10}, { 31, 9}, \ ++ { 67,10}, { 47,11}, { 2048,12}, { 4096,13}, \ ++ { 8192,14}, { 16384,15}, { 32768,16}, { 65536,17}, \ ++ { 131072,18}, { 262144,19}, { 524288,20}, {1048576,21}, \ ++ {2097152,22}, {4194304,23}, {8388608,24} } ++#define MUL_FFT_TABLE3_SIZE 47 ++#define MUL_FFT_THRESHOLD 2752 ++ ++#define SQR_FFT_MODF_THRESHOLD 240 /* k = 5 */ ++#define SQR_FFT_TABLE3 \ ++ { { 240, 5}, { 8, 4}, { 17, 5}, { 13, 6}, \ ++ { 7, 5}, { 15, 6}, { 8, 5}, { 17, 6}, \ ++ { 9, 5}, { 19, 6}, { 15, 7}, { 8, 6}, \ ++ { 17, 7}, { 9, 6}, { 19, 7}, { 10, 6}, \ ++ { 21, 7}, { 17, 8}, { 9, 7}, { 20, 8}, \ ++ { 11, 7}, { 23, 8}, { 13, 9}, { 7, 8}, \ ++ { 21, 9}, { 11, 8}, { 23, 9}, { 15, 8}, \ ++ { 31, 9}, { 19, 8}, { 39, 9}, { 23,10}, \ ++ { 15, 9}, { 39,10}, { 23,11}, { 15,10}, \ ++ { 31, 9}, { 63,10}, { 47,11}, { 2048,12}, \ ++ { 4096,13}, { 8192,14}, { 16384,15}, { 32768,16}, \ ++ { 65536,17}, { 131072,18}, { 262144,19}, { 524288,20}, \ ++ {1048576,21}, {2097152,22}, {4194304,23}, {8388608,24} } ++#define SQR_FFT_TABLE3_SIZE 52 ++#define SQR_FFT_THRESHOLD 1856 ++ ++#define MULLO_BASECASE_THRESHOLD 0 /* always */ ++#define MULLO_DC_THRESHOLD 25 ++#define MULLO_MUL_N_THRESHOLD 5397 ++#define SQRLO_BASECASE_THRESHOLD 0 /* always */ ++#define SQRLO_DC_THRESHOLD 396 ++#define SQRLO_SQR_THRESHOLD 3704 ++ ++#define DC_DIV_QR_THRESHOLD 15 
++#define DC_DIVAPPR_Q_THRESHOLD 50 ++#define DC_BDIV_QR_THRESHOLD 66 ++#define DC_BDIV_Q_THRESHOLD 202 ++ ++#define INV_MULMOD_BNM1_THRESHOLD 46 ++#define INV_NEWTON_THRESHOLD 29 ++#define INV_APPR_THRESHOLD 13 ++ ++#define BINV_NEWTON_THRESHOLD 312 ++#define REDC_1_TO_REDC_2_THRESHOLD 79 ++#define REDC_2_TO_REDC_N_THRESHOLD 0 /* always */ ++ ++#define MU_DIV_QR_THRESHOLD 979 ++#define MU_DIVAPPR_Q_THRESHOLD 979 ++#define MUPI_DIV_QR_THRESHOLD 13 ++#define MU_BDIV_QR_THRESHOLD 942 ++#define MU_BDIV_Q_THRESHOLD 1367 ++ ++#define POWM_SEC_TABLE 3,19,215,1730 ++ ++#define GET_STR_DC_THRESHOLD 10 ++#define GET_STR_PRECOMPUTE_THRESHOLD 15 ++#define SET_STR_DC_THRESHOLD 882 ++#define SET_STR_PRECOMPUTE_THRESHOLD 2520 ++ ++#define FAC_DSC_THRESHOLD 228 ++#define FAC_ODD_THRESHOLD 24 ++ ++#define MATRIX22_STRASSEN_THRESHOLD 19 ++#define HGCD2_DIV1_METHOD 1 ++#define HGCD_THRESHOLD 61 ++#define HGCD_APPR_THRESHOLD 51 ++#define HGCD_REDUCE_THRESHOLD 1962 ++#define GCD_DC_THRESHOLD 217 ++#define GCDEXT_DC_THRESHOLD 263 ++#define JACOBI_BASE_METHOD 4 ++ +-- +2.40.1 diff --git a/SPECS/gmp.spec b/SPECS/gmp.spec new file mode 100644 index 0000000..40e3f02 --- /dev/null +++ b/SPECS/gmp.spec @@ -0,0 +1,712 @@ +# +# Important for %%{ix86}: +# This rpm has to be build on a CPU with sse2 support like Pentium 4 ! +# + +Summary: A GNU arbitrary precision library +Name: gmp +Version: 6.1.2 +Release: 12%{?dist} +Epoch: 1 +URL: http://gmplib.org/ +Source0: ftp://ftp.gmplib.org/pub/gmp-%{version}/gmp-%{version}.tar.bz2 +# or ftp://ftp.gnu.org/pub/gnu/gmp/gmp-%{version}.tar.xz +Source2: gmp.h +Source3: gmp-mparam.h +Patch2: gmp-6.0.0-debuginfo.patch +Patch3: gmp-fcf-protection.patch +Patch4: cve-2021-43618.patch +Patch5: ibm_z13_simd_part1.patch +Patch6: ibm_z13_simd_part2.patch +Patch7: ibm_z13_simd_part3.patch +Patch8: ibm_z13_simd_part4.patch +License: LGPLv3+ or GPLv2+ +Group: System Environment/Libraries +BuildRoot: %{_tmppath}/%{name}-%{version}-%{release}-root-%(%{__id_u} -n) +BuildRequires: autoconf automake libtool +BuildRequires: git +#autoreconf on arm needs: +BuildRequires: perl-Carp +BuildRequires: fipscheck + +%description +The gmp package contains GNU MP, a library for arbitrary precision +arithmetic, signed integers operations, rational numbers and floating +point numbers. GNU MP is designed for speed, for both small and very +large operands. GNU MP is fast because it uses fullwords as the basic +arithmetic type, it uses fast algorithms, it carefully optimizes +assembly code for many CPUs' most common inner loops, and it generally +emphasizes speed over simplicity/elegance in its operations. + +Install the gmp package if you need a fast arbitrary precision +library. + +%package c++ +Summary: C++ bindings for the GNU MP arbitrary precision library +Group: System Environment/Libraries +Requires: %{name}%{?_isa} = %{epoch}:%{version}-%{release} + +%description c++ +Bindings for using the GNU MP arbitrary precision library in C++ applications. + +%package devel +Summary: Development tools for the GNU MP arbitrary precision library +Group: Development/Libraries +Requires: %{name}%{?_isa} = %{epoch}:%{version}-%{release} +Requires: %{name}-c++%{?_isa} = %{epoch}:%{version}-%{release} +Requires(post): /sbin/install-info +Requires(preun): /sbin/install-info + +%description devel +The libraries, header files and documentation for using the GNU MP +arbitrary precision library in applications. 
+ +If you want to develop applications which will use the GNU MP library, +you'll need to install the gmp-devel package. You'll also need to +install the gmp package. + +%package static +Summary: Development tools for the GNU MP arbitrary precision library +Group: Development/Libraries +Requires: %{name}-devel = %{epoch}:%{version}-%{release} + +%description static +The static libraries for using the GNU MP arbitrary precision library +in applications. + +%prep +%autosetup -S git + +# switch the defaults to new cpus on s390x +%ifarch s390x +( cd mpn/s390_64; ln -s z13 s390x ) +%endif + +%build +autoreconf -ifv +if as --help | grep -q execstack; then + # the object files do not require an executable stack + export CCAS="gcc -c -Wa,--noexecstack" +fi + +%ifarch %{ix86} + export CFLAGS=$(echo %{optflags} | sed -e "s/-mtune=[^ ]*//g" | sed -e "s/-march=[^ ]*/-march=i686/g") + export CXXFLAGS=$(echo %{optflags} | sed -e "s/-mtune=[^ ]*//g" | sed -e "s/-march=[^ ]*/-march=i686/g") +%endif + +export CCAS="$CCAS -Wa,--generate-missing-build-notes=yes" +export CFLAGS="$(echo %{optflags}) -fplugin=annobin" +export CXXFLAGS="$(echo %{optflags}) -fplugin=annobin" + +%configure --enable-cxx --enable-fat + +sed -e 's|^hardcode_libdir_flag_spec=.*|hardcode_libdir_flag_spec=""|g' \ + -e 's|^runpath_var=LD_RUN_PATH|runpath_var=DIE_RPATH_DIE|g' \ + -e 's|-lstdc++ -lm|-lstdc++|' \ + -i libtool +export LD_LIBRARY_PATH=`pwd`/.libs +make %{?_smp_mflags} +make check + +# Add generation of HMAC checksums of the final stripped binaries +# bz#1117188 +%define __spec_install_post \ + %{?__debug_package:%{__debug_install_post}} \ + %{__arch_install_post} \ + %{__os_install_post} \ + mkdir -p $RPM_BUILD_ROOT%{_libdir}/fipscheck \ + fipshmac -d $RPM_BUILD_ROOT%{_libdir}/fipscheck $RPM_BUILD_ROOT%{_libdir}/libgmp.so.10.3.2 \ + ln -s libgmp.so.10.3.2.hmac $RPM_BUILD_ROOT%{_libdir}/fipscheck/libgmp.so.10.hmac \ + %{nil} + +%install +export LD_LIBRARY_PATH=`pwd`/.libs +make install DESTDIR=$RPM_BUILD_ROOT +install -m 644 gmp-mparam.h ${RPM_BUILD_ROOT}%{_includedir} +rm -f $RPM_BUILD_ROOT%{_libdir}/lib{gmp,mp,gmpxx}.la +rm -f $RPM_BUILD_ROOT%{_infodir}/dir +/sbin/ldconfig -n $RPM_BUILD_ROOT%{_libdir} +ln -sf libgmpxx.so.4 $RPM_BUILD_ROOT%{_libdir}/libgmpxx.so + +# Rename gmp.h to gmp-.h and gmp-mparam.h to gmp-mparam-.h to +# avoid file conflicts on multilib systems and install wrapper include files +# gmp.h and gmp-mparam-.h +basearch=%{_arch} +# always use i386 for iX86 +%ifarch %{ix86} +basearch=i386 +%endif +# always use arm for arm* +%ifarch %{arm} +basearch=arm +%endif +# superH architecture support +%ifarch sh3 sh4 +basearch=sh +%endif +# Rename files and install wrappers + +mv %{buildroot}/%{_includedir}/gmp.h %{buildroot}/%{_includedir}/gmp-${basearch}.h +install -m644 %{SOURCE2} %{buildroot}/%{_includedir}/gmp.h +mv %{buildroot}/%{_includedir}/gmp-mparam.h %{buildroot}/%{_includedir}/gmp-mparam-${basearch}.h +install -m644 %{SOURCE3} %{buildroot}/%{_includedir}/gmp-mparam.h + + +%check +%ifnarch ppc +export LD_LIBRARY_PATH=`pwd`/.libs +make %{?_smp_mflags} check +%endif + +%post -p /sbin/ldconfig + +%postun -p /sbin/ldconfig + +%post c++ -p /sbin/ldconfig + +%postun c++ -p /sbin/ldconfig + +%post devel +if [ -f %{_infodir}/gmp.info.gz ]; then + /sbin/install-info %{_infodir}/gmp.info.gz %{_infodir}/dir || : +fi +exit 0 + +%preun devel +if [ $1 = 0 ]; then + if [ -f %{_infodir}/gmp.info.gz ]; then + /sbin/install-info --delete %{_infodir}/gmp.info.gz %{_infodir}/dir || : + fi +fi +exit 0 + +%files 
+%defattr(-,root,root,-)
+%{!?_licensedir:%global license %%doc}
+%license COPYING COPYING.LESSERv3 COPYINGv2 COPYINGv3
+%doc NEWS README
+%{_libdir}/libgmp.so.*
+%{_libdir}/fipscheck/libgmp.so.10.3.2.hmac
+%{_libdir}/fipscheck/libgmp.so.10.hmac
+
+%files c++
+%{_libdir}/libgmpxx.so.*
+
+%files devel
+%defattr(-,root,root,-)
+%{_libdir}/libgmp.so
+%{_libdir}/libgmpxx.so
+%{_includedir}/*.h
+%{_infodir}/gmp.info*
+
+%files static
+%defattr(-,root,root,-)
+%{_libdir}/libgmp.a
+%{_libdir}/libgmpxx.a
+
+%changelog
+* Mon Feb 05 2024 Jakub Martisko - 1:6.1.2-12
+- Add s390x optimizations
+Resolves: RHEL-10549
+
+* Mon Jan 29 2024 Jakub Martisko - 1:6.1.2-11
+- Fix: CVE-2021-43618
+Resolves: RHEL-23055
+
+* Fri Jun 14 2019 Jakub Martisko - 1:6.1.2-10
+- Add gating.yaml
+Related: #1681026
+
+* Tue Jun 11 2019 Jakub Martisko - 1:6.1.2-9
+- Add support for intel CET and -fcf-protection
+- Add missing compiler/linker flags
+Related: #1630567
+
+* Thu Jul 26 2018 David Kaspar [Dee'Kej] - 1:6.1.2-8
+- Missing fipschecks added into build process (bug #1553679)
+- --enable-fat option added to %%configure (bug #1493218)
+
+* Wed Feb 07 2018 Fedora Release Engineering - 1:6.1.2-7
+- Rebuilt for https://fedoraproject.org/wiki/Fedora_28_Mass_Rebuild
+
+* Wed Aug 02 2017 Fedora Release Engineering - 1:6.1.2-6
+- Rebuilt for https://fedoraproject.org/wiki/Fedora_27_Binutils_Mass_Rebuild
+
+* Wed Jul 26 2017 Fedora Release Engineering - 1:6.1.2-5
+- Rebuilt for https://fedoraproject.org/wiki/Fedora_27_Mass_Rebuild
+
+* Mon Mar 13 2017 David Kaspar [Dee'Kej] - 1:6.1.2-4
+- Fix the build process for ix89 family
+
+* Fri Feb 17 2017 David Kaspar [Dee'Kej] - 1:6.1.2-3
+- Build process updated to correctly build .debug_info for i386
+  and to correctly use hardening flags
+
+* Fri Feb 10 2017 Fedora Release Engineering - 1:6.1.2-2
+- Rebuilt for https://fedoraproject.org/wiki/Fedora_26_Mass_Rebuild
+
+* Tue Dec 20 2016 Frantisek Kluknavsky - 1:6.1.2-1
+- rebase
+
+* Wed Jun 22 2016 Frantisek Kluknavsky - 1:6.1.1-1
+- rebase
+
+* Fri Apr 08 2016 Yaakov Selkowitz - 1:6.1.0-3
+- Split c++ subpackage (#1325439)
+
+* Wed Feb 03 2016 Fedora Release Engineering - 1:6.1.0-2
+- Rebuilt for https://fedoraproject.org/wiki/Fedora_24_Mass_Rebuild
+
+* Wed Nov 25 2015 Frantisek Kluknavsky - 1:6.1.0-1
+- rebase to 6.1.0
+- gmp-6.0.0-ppc64.patch already upstream, dropped
+
+* Mon Sep 14 2015 Frantisek Kluknavsky - 1:6.0.0-13
+- do not package sse2 variant, use --enable-fat instead (a bit dangerous, some low level routines will be skipped in `make check`)
+
+* Fri Sep 04 2015 Michal Toman - 1:6.0.0-12
+- Add support for MIPS architecture to gmp.h and gmp-mparam.h
+
+* Wed Jun 17 2015 Fedora Release Engineering - 1:6.0.0-11
+- Rebuilt for https://fedoraproject.org/wiki/Fedora_23_Mass_Rebuild
+
+* Sat May 02 2015 Kalev Lember - 1:6.0.0-10
+- Rebuilt for GCC 5 C++11 ABI change
+
+* Thu Apr 02 2015 Frantisek Kluknavsky - 1:6.0.0-9
+- bug965318 - improve debuginfo of assembler sources
+
+* Thu Sep 04 2014 Dan Horák - 1:6.0.0-8
+- drop s390x patch, support is already in upstream
+
+* Sat Aug 16 2014 Fedora Release Engineering - 1:6.0.0-7
+- Rebuilt for https://fedoraproject.org/wiki/Fedora_21_22_Mass_Rebuild
+
+* Sat Jul 12 2014 Tom Callaway - 1:6.0.0-6
+- fix license handling
+
+* Thu Jul 10 2014 Brent Baude - 1:6.0.0-5
+- Fix gmp headers for ppc64le (#1083429)
+
+* Sat Jun 07 2014 Fedora Release Engineering - 1:6.0.0-4
+- Rebuilt for https://fedoraproject.org/wiki/Fedora_21_Mass_Rebuild
+
+* Thu Apr 24 2014 Karsten Hopp 6.0.0-3
+- set default for BMOD_1_TO_MOD_1_THRESHOLD on ppc64, patch by
+  Torbjorn Granlund:
+  https://gmplib.org/repo/gmp/rev/4a6d258b467f
+
+* Mon Apr 14 2014 Frantisek Kluknavsky - 1:6.0.0-2
+- rebase
+
+* Wed Nov 06 2013 Frantisek Kluknavsky - 1:5.1.3-2
+- support for aarch64
+
+* Wed Nov 06 2013 Frantisek Kluknavsky - 1:5.1.3-1
+- rebase to 5.1.3
+
+* Sat Aug 03 2013 Fedora Release Engineering - 1:5.1.2-2
+- Rebuilt for https://fedoraproject.org/wiki/Fedora_20_Mass_Rebuild
+
+* Thu May 30 2013 Frantisek Kluknavsky - 1:5.1.2-1
+- rebase to 5.1.2
+
+* Thu Mar 28 2013 Frantisek Kluknavsky - 1:5.1.1-3
+- added build dependency needed to autoreconf on arm
+
+* Thu Feb 14 2013 Frantisek Kluknavsky - 1:5.1.1-2
+- rebase to 5.1.1
+- deleted unapplicable part of gmp-4.0.1-s390.patch
+
+* Fri Jan 25 2013 Frantisek Kluknavsky - 1:5.1.0-1
+- rebase to 5.1.0, de-ansi patch no longer applicable
+- upstream dropped libmp.so (bsdmp-like interface)
+- silenced bogus date in changelog
+
+* Tue Jan 22 2013 Peter Robinson 1:5.0.5-6
+- Rebuild against new binutils to fix FTBFS on ARM
+
+* Fri Nov 23 2012 Frantisek Kluknavsky - 1:5.0.5-5
+- minor spec cleanup
+
+* Fri Jul 20 2012 Peter Schiffer 1:5.0.5-3
+- fixed FTBFS
+
+* Thu Jul 19 2012 Fedora Release Engineering - 1:5.0.5-2
+- Rebuilt for https://fedoraproject.org/wiki/Fedora_18_Mass_Rebuild
+
+* Mon Jun 25 2012 Peter Schiffer 1:5.0.5-1
+- resolves: #820897
+  update to 5.0.5
+
+* Thu Apr 19 2012 Peter Schiffer 1:5.0.4-1
+- resolves: #785116
+  update to 5.0.4
+
+* Tue Feb 28 2012 Fedora Release Engineering - 1:5.0.2-6
+- Rebuilt for c++ ABI breakage
+
+* Thu Jan 19 2012 Peter Schiffer 1:5.0.2-5
+- fixed FTBFS with gcc 4.7 on 32bit arch
+
+* Fri Jan 13 2012 Fedora Release Engineering - 1:5.0.2-4
+- Rebuilt for https://fedoraproject.org/wiki/Fedora_17_Mass_Rebuild
+
+* Fri Oct 14 2011 Peter Schiffer 1:5.0.2-3
+- removed old compatibility library
+
+* Mon Sep 26 2011 Peter Schiffer 1:5.0.2-2
+- temporary build wild old compatibility library version
+
+* Tue Sep 20 2011 Peter Schiffer 1:5.0.2-1
+- resolves: #702919
+  update to 5.0.2
+- resolves: #738091
+  removed unused direct shlib dependency on libm
+  updated license in gmp.h and gmp-mparam.h files
+
+* Mon Jun 13 2011 Ivana Hutarova Varekova 1:4.3.2-4
+- Resolves: #706374
+  fix sse2/libgmp.so.3.5.2 debuginfo data
+
+* Tue Feb 08 2011 Fedora Release Engineering - 1:4.3.2-3
+- Rebuilt for https://fedoraproject.org/wiki/Fedora_15_Mass_Rebuild
+
+* Wed Nov 24 2010 Ivana Hutarova Varekova 1:4.3.2-2
+- fix Requires tag
+
+* Wed Nov 24 2010 Ivana Hutarova Varekova 1:4.3.2-1
+- downgrade from 5.0.1 to 4.3.2
+
+* Mon May 24 2010 Ivana Hutarova Varekova 5.0.1-1
+- update to 5.0.1
+
+* Tue Mar 2 2010 Ivana Hutarova Varekova 4.3.1-7
+- fix the license tag
+
+* Fri Nov 27 2009 Ivana Hutarova Varekova 4.3.1-6
+- remove unnecessary dependences
+  remove duplicated documentation
+
+* Mon Aug 10 2009 Ivana Varekova 4.3.1-5
+- fix installation with --excludedocs option (#515947)
+
+* Fri Jul 24 2009 Fedora Release Engineering - 4.3.1-4
+- Rebuilt for https://fedoraproject.org/wiki/Fedora_12_Mass_Rebuild
+
+* Wed Jun 17 2009 Ivana Varekova 4.3.1-3
+- rebuild
+
+* Mon Jun 15 2009 Ivana Varekova 4.3.1-2
+- Resolves: #505592
+  add RPM_OPT_FLAGS
+
+* Thu May 28 2009 Ivana Varekova 4.3.1-1
+- update to 4.3.1
+- remove configure macro (built problem)
+
+* Thu Apr 09 2009 Dennis Gilmore - 4.2.4-6
+- no check that --host and --target are the same when building i586 or sparcv9 they are not
+
+* Tue Feb 24 2009 Fedora Release Engineering - 4.2.4-5
+- Rebuilt for https://fedoraproject.org/wiki/Fedora_11_Mass_Rebuild
+
+* Tue Dec 23 2008 Ivana Varekova 4.2.4-4
+- fix spec file
+
+* Mon Dec 8 2008 Ivana Varekova 4.2.4-3
+- remove useless option (#475073)
+
+* Wed Dec 3 2008 Stepan Kasal 4.2.4-2
+- Run full autoreconf, add automake to BuildRequires.
+
+* Mon Nov 10 2008 Ivana Varekova 4.2.4-1
+- update to 4.2.4
+
+* Fri Nov 7 2008 Ivana Varekova 4.2.2-9
+- remove useless patch (#470200)
+
+* Thu Apr 24 2008 Tom "spot" Callaway 4.2.2-8
+- add sparc/sparc64 support
+
+* Wed Mar 19 2008 Ivana Varekova 4.2.2-7
+- add superH support (#437688)
+
+* Wed Feb 13 2008 Ivana varekova 4.2.2-6
+- fix gcc-4.3 problem - add (#432336)
+
+* Fri Feb 8 2008 Ivana Varekova 4.2.2-5
+- split the devel subpackage to devel and static parts
+
+* Thu Feb 7 2008 Ivana Varekova 4.2.2-4
+- change license tag
+
+* Mon Sep 24 2007 Ivana Varekova 4.2.2-3
+- fix libgmpxx.so link
+
+* Thu Sep 20 2007 Ivana Varekova 4.2.2-2
+- fix check tag
+
+* Wed Sep 19 2007 Ivana Varekova 4.2.2-1
+- update to 4.2.2
+
+* Mon Aug 20 2007 Ivana Varekova 4.2.1-3
+- spec file cleanup (#253439)
+
+* Tue Aug 7 2007 Ivana Varekova 4.2.1-2
+- add arm support (#245456)
+  thanks to Lennert Buytenhek
+
+* Mon Aug 6 2007 Ivana Varekova 4.2.1-1
+- update to 4.2.1
+- do some spec cleanups
+- fix 238794 - gmp-devel depends on {version} but not on
+  {version}-{release}
+- remove mpfr (moved to separate package)
+
+* Thu Jul 05 2007 Florian La Roche 4.1.4-13
+- don't fail scripts to e.g. allow excludedocs installs
+
+* Tue Apr 24 2007 Karsten Hopp 4.1.4-12.3
+- fix library permissions
+
+* Wed Mar 14 2007 Karsten Hopp 4.1.4-12.2
+- fix typo
+
+* Wed Mar 14 2007 Thomas Woerner 4.1.4-12.1
+- added alpha support for gmp.h and gmp-mparam.h wrappers
+
+* Fri Feb 23 2007 Karsten Hopp 4.1.4-12
+- remove trailing dot from summary
+- fix buildroot
+- fix post/postun/... requirements
+- use make install DESTDIR=...
+- replace tabs with spaces
+- convert changelog to utf-8
+
+* Wed Jan 17 2007 Jakub Jelinek 4.1.4-11
+- make sure libmpfr.a doesn't contain SSE2 instructions on i?86 (#222371)
+- rebase to mpfr 2.2.1 from 2.2.0 + cumulative fixes
+
+* Thu Nov 2 2006 Thomas Woerner 4.1.4-10
+- fixed arch order in gmp.h and gmp-mparam.h wrapper for all architectures
+
+* Thu Nov 2 2006 Joe Orton 4.1.4-10
+- include ppc64 header on ppc64 not ppc header
+
+* Fri Oct 27 2006 Thomas Woerner - 4.1.4-9
+- fixed multilib devel conflicts for gmp (#212286)
+
+* Thu Oct 26 2006 Jakub Jelinek - 4.1.4-8
+- upgrade mpfr to 2.2.0 (#211971)
+- apply mpfr 2.2.0 cumulative patch
+
+* Fri Jul 14 2006 Thomas Woerner - 4.1.4-7
+- release bump
+
+* Fri Feb 10 2006 Jesse Keating - 4.1.4-6.2.1
+- bump again for double-long bug on ppc(64)
+
+* Tue Feb 07 2006 Jesse Keating - 4.1.4-6.2
+- rebuilt for new gcc4.1 snapshot and glibc changes
+
+* Fri Dec 09 2005 Jesse Keating
+- rebuilt
+
+* Mon Apr 18 2005 Thomas Woerner 4.1.4-6
+- fixed __setfpucw call in mpfr-test.h
+
+* Wed Mar 02 2005 Karsten Hopp 4.1.4-5
+- build with gcc-4
+
+* Wed Feb 09 2005 Karsten Hopp 4.1.4-4
+- rebuilt
+
+* Sun Sep 26 2004 Florian La Roche
+- 4.1.4
+- disable ppc64 patch, now fixed upstream
+
+* Tue Jun 15 2004 Elliot Lee
+- rebuilt
+
+* Mon May 24 2004 Thomas Woerner 4.1.3-1
+- new version 4.1.3
+
+* Wed Mar 31 2004 Thomas Woerner 4.1.2-14
+- dropped RPATH (#118506)
+
+* Sat Mar 06 2004 Florian La Roche
+- also build SSE2 DSOs, patch from Ulrich Drepper
+
+* Tue Mar 02 2004 Elliot Lee
+- rebuilt
+
+* Fri Feb 13 2004 Elliot Lee
+- rebuilt
+
+* Thu Jan 29 2004 Thomas Woerner 4.1.2-11
+- BuildRequires for automake16
+
+* Mon Dec 01 2003 Florian La Roche
+- fix symlink to libgmpxx.so.3 #111135
+- add patch to factorize.c from gmp homepage
+
+* Thu Oct 23 2003 Joe Orton 4.1.2-9
+- build with -Wa,--noexecstack
+
+* Thu Oct 23 2003 Joe Orton 4.1.2-8
+- build assembly code with -Wa,--execstack
+- use parallel make
+- run tests, and fix C++ therein
+
+* Thu Oct 02 2003 Florian La Roche
+- enable mpfr #104395
+- enable cxx #80195
+- add COPYING.LIB
+- add fixes from gmp web-site
+- remove some cruft patches for older libtool releases
+
+* Wed Jun 04 2003 Elliot Lee
+- rebuilt
+
+* Tue Jun 03 2003 Florian La Roche
+- make configure.in work with newer autoconf
+
+* Sun Jun 01 2003 Florian La Roche
+- do not set extra_functions for s390x #92001
+
+* Thu Feb 13 2003 Elliot Lee 4.1.2-3
+- Add ppc64 patch, accompanied by running auto*
+
+* Wed Jan 22 2003 Tim Powers
+- rebuilt
+
+* Wed Jan 01 2003 Florian La Roche
+- update to 4.1.2
+
+* Tue Dec 03 2002 Florian La Roche
+- update to 4.1.1
+- remove un-necessary patches
+- adjust s390/x86_64 patch
+
+* Sun Oct 06 2002 Florian La Roche
+- add s390x patch
+- disable current x86-64 support in longlong.h
+
+* Mon Jul 8 2002 Trond Eivind Glomsrød 4.1-4
+- Add 4 patches, among them one for #67918
+- Update URL
+- s/Copyright/License/
+
+* Mon Jul 8 2002 Trond Eivind Glomsrød 4.1-3
+- Redefine the configure macro, the included configure
+  script isn't happy about the rpm default one (#68190). Also, make
+  sure the included libtool isn't replaced,
+
+* Fri Jun 21 2002 Tim Powers
+- automated rebuild
+
+* Sat May 25 2002 Florian La Roche
+- update to version 4.1
+- patch s390 gmp-mparam.h to match other archs.
+
+* Thu May 23 2002 Tim Powers
+- automated rebuild
+
+* Mon Mar 11 2002 Trond Eivind Glomsrød 4.0.1-3
+- Use standard %%configure macro and edit %%{_tmppath}
+
+* Tue Feb 26 2002 Trond Eivind Glomsrød 4.0.1-2
+- Rebuild
+
+* Tue Jan 22 2002 Florian La Roche
+- update to 4.0.1
+- bzip2 src
+
+* Wed Jan 09 2002 Tim Powers
+- automated rebuild
+
+* Sun Jun 24 2001 Elliot Lee
+- Bump release + rebuild.
+
+* Mon Feb 05 2001 Philipp Knirsch
+- Fixed bugzilla bug #25515 where GMP wouldn't work on IA64 as IA64 is not
+correctly identified as a 64 bit platform.
+
+* Mon Dec 18 2000 Preston Brown
+- include bsd mp library
+
+* Tue Oct 17 2000 Florian La Roche
+- update to 3.1.1
+
+* Sun Sep 3 2000 Florian La Roche
+- update to 3.1
+
+* Sat Aug 19 2000 Preston Brown
+- devel subpackage depends on main package so that .so symlink is OK.
+
+* Thu Jul 13 2000 Prospector
+- automatic rebuild
+
+* Sat Jun 3 2000 Nalin Dahyabhai
+- switch to the configure and makeinstall macros
+- FHS-compliance fixing
+- move docs to non-devel package
+
+* Fri Apr 28 2000 Bill Nottingham
+- libtoolize for ia64
+
+* Fri Apr 28 2000 Florian La Roche
+- update to 3.0.1
+
+* Thu Apr 27 2000 Jakub Jelinek
+- sparc64 fixes for 3.0
+
+* Wed Apr 26 2000 Florian La Roche
+- update to 3.0
+
+* Mon Feb 14 2000 Matt Wilson
+- #include in files that use string functions
+
+* Wed Feb 02 2000 Cristian Gafton
+- fix description and summary
+
+* Mon Dec 06 1999 Michael K. Johnson
+- s/GPL/LGPL/
+- build as non-root (#7604)
+
+* Mon Sep 06 1999 Jakub Jelinek
+- merge in some debian gmp fixes
+- Ulrich Drepper's __gmp_scale2 fix
+- my mpf_set_q fix
+- sparc64 fixes
+
+* Wed Apr 28 1999 Cristian Gafton
+- add sparc patch for PIC handling
+
+* Sun Mar 21 1999 Cristian Gafton
+- auto rebuild in the new build environment (release 8)
+
+* Thu Feb 11 1999 Michael Johnson
+- include the private header file gmp-mparam.h because several
+  apps seem to assume that they are building against the gmp
+  source tree and require it. Sigh.
+
+* Tue Jan 12 1999 Michael K. Johnson
+- libtoolize to work on arm
+
+* Thu Sep 10 1998 Cristian Gafton
+- yet another touch of the spec file
+
+* Wed Sep 2 1998 Michael Fulbright
+- looked over before inclusion in RH 5.2
+
+* Sun May 24 1998 Dick Porter
+- Patch Makefile.in, not Makefile
+- Don't specify i586, let configure decide the arch
+
+* Sat Jan 24 1998 Marc Ewing
+- started with package from Toshio Kuratomi
+- cleaned up file list
+- fixed up install-info support
+
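Note on the multilib header wrappers installed in %install above: the real, architecture-specific gmp.h produced by the build is renamed to gmp-${basearch}.h, and the generic gmp.h installed from %{SOURCE2} only dispatches to it at compile time, so 32-bit and 64-bit gmp-devel packages can coexist without a file conflict. The snippet below is a minimal illustrative sketch of such a dispatch wrapper, not the packaged file itself; the set of architectures shown and the compiler macros tested are assumptions inferred from the basearch logic in %install, and the real wrapper may differ.

/* gmp.h -- illustrative multilib dispatch wrapper (sketch only).
 * Includes the architecture-specific header that %install renamed,
 * matching basearch: i386 for all ix86, arm for arm*, etc. */
#if defined(__x86_64__)
# include "gmp-x86_64.h"
#elif defined(__i386__)
# include "gmp-i386.h"
#elif defined(__aarch64__)
# include "gmp-aarch64.h"
#elif defined(__arm__)
# include "gmp-arm.h"
#elif defined(__s390x__)
# include "gmp-s390x.h"
#else
# error "The gmp-devel package does not provide a header for this architecture."
#endif

A program that includes gmp.h therefore transparently gets the header matching its target architecture; the gmp-mparam.h wrapper installed from %{SOURCE3} would follow the same pattern for gmp-mparam-${basearch}.h.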