You cannot select more than 25 topics.
Topics must start with a letter or number, can include dashes ('-') and can be up to 35 characters long.
129 lines
5.8 KiB
129 lines
5.8 KiB
From bd531bd82852808f7fa403e3ee159bd62b1c08cc Mon Sep 17 00:00:00 2001
|
|
From: Sayed Adel <seiko@imavr.com>
|
|
Date: Tue, 28 Jan 2020 15:16:48 +0200
|
|
Subject: [PATCH] core:vsx fix inline asm constraints
|
|
|
|
generalize constraints to 'wa' for VSX registers
|
|
---
|
|
cmake/checks/cpu_vsx_asm.cpp | 2 +-
|
|
.../include/opencv2/core/hal/intrin_vsx.hpp | 4 +-
|
|
.../core/include/opencv2/core/vsx_utils.hpp | 50 ++++++++-----------
|
|
3 files changed, 25 insertions(+), 31 deletions(-)
|
|
|
|
diff --git a/cmake/checks/cpu_vsx_asm.cpp b/cmake/checks/cpu_vsx_asm.cpp
|
|
index bb4c25507e3..9c1bf7a946a 100644
|
|
--- a/cmake/checks/cpu_vsx_asm.cpp
|
|
+++ b/cmake/checks/cpu_vsx_asm.cpp
|
|
@@ -16,6 +16,6 @@ int main()
|
|
{
|
|
__vector float vf;
|
|
__vector signed int vi;
|
|
- __asm__ __volatile__ ("xvcvsxwsp %x0,%x1" : "=wf" (vf) : "wa" (vi));
|
|
+ __asm__ __volatile__ ("xvcvsxwsp %x0,%x1" : "=wa" (vf) : "wa" (vi));
|
|
return 0;
|
|
}
|
|
\ No newline at end of file
|
|
diff --git a/modules/core/include/opencv2/core/hal/intrin_vsx.hpp b/modules/core/include/opencv2/core/hal/intrin_vsx.hpp
|
|
index bda1d8558f8..6e8b439182f 100644
|
|
--- a/modules/core/include/opencv2/core/hal/intrin_vsx.hpp
|
|
+++ b/modules/core/include/opencv2/core/hal/intrin_vsx.hpp
|
|
@@ -1338,7 +1338,7 @@ inline v_float32x4 v_load_expand(const float16_t* ptr)
|
|
return v_float32x4(vec_extract_fp_from_shorth(vf16));
|
|
#elif CV_VSX3 && !defined(CV_COMPILER_VSX_BROKEN_ASM)
|
|
vec_float4 vf32;
|
|
- __asm__ __volatile__ ("xvcvhpsp %x0,%x1" : "=wf" (vf32) : "wa" (vec_mergeh(vf16, vf16)));
|
|
+ __asm__ __volatile__ ("xvcvhpsp %x0,%x1" : "=wa" (vf32) : "wa" (vec_mergeh(vf16, vf16)));
|
|
return v_float32x4(vf32);
|
|
#else
|
|
const vec_int4 z = vec_int4_z, delta = vec_int4_sp(0x38000000);
|
|
@@ -1363,7 +1363,7 @@ inline void v_pack_store(float16_t* ptr, const v_float32x4& v)
|
|
// fixme: Is there any builtin op or intrinsic that cover "xvcvsphp"?
|
|
#if CV_VSX3 && !defined(CV_COMPILER_VSX_BROKEN_ASM)
|
|
vec_ushort8 vf16;
|
|
- __asm__ __volatile__ ("xvcvsphp %x0,%x1" : "=wa" (vf16) : "wf" (v.val));
|
|
+ __asm__ __volatile__ ("xvcvsphp %x0,%x1" : "=wa" (vf16) : "wa" (v.val));
|
|
vec_st_l8(vec_mergesqe(vf16, vf16), ptr);
|
|
#else
|
|
const vec_int4 signmask = vec_int4_sp(0x80000000);
|
|
diff --git a/modules/core/include/opencv2/core/vsx_utils.hpp b/modules/core/include/opencv2/core/vsx_utils.hpp
|
|
index d7c71406072..bcc97fe5297 100644
|
|
--- a/modules/core/include/opencv2/core/vsx_utils.hpp
|
|
+++ b/modules/core/include/opencv2/core/vsx_utils.hpp
|
|
@@ -110,9 +110,9 @@ VSX_FINLINE(rt) fnm(const rg& a, const rg& b) { return fn2(a, b); }
|
|
#if defined(__GNUG__) && !defined(__clang__)
|
|
|
|
// inline asm helper
|
|
-#define VSX_IMPL_1RG(rt, rto, rg, rgo, opc, fnm) \
|
|
-VSX_FINLINE(rt) fnm(const rg& a) \
|
|
-{ rt rs; __asm__ __volatile__(#opc" %x0,%x1" : "="#rto (rs) : #rgo (a)); return rs; }
|
|
+#define VSX_IMPL_1RG(rt, rg, opc, fnm) \
|
|
+VSX_FINLINE(rt) fnm(const rg& a) \
|
|
+{ rt rs; __asm__ __volatile__(#opc" %x0,%x1" : "=wa" (rs) : "wa" (a)); return rs; }
|
|
|
|
#define VSX_IMPL_1VRG(rt, rg, opc, fnm) \
|
|
VSX_FINLINE(rt) fnm(const rg& a) \
|
|
@@ -257,44 +257,38 @@ VSX_REDIRECT_1RG(vec_float4, vec_double2, vec_cvfo, __builtin_vsx_xvcvdpsp)
|
|
VSX_REDIRECT_1RG(vec_double2, vec_float4, vec_cvfo, __builtin_vsx_xvcvspdp)
|
|
|
|
// converts word and doubleword to double-precision
|
|
-#ifdef vec_ctd
|
|
-# undef vec_ctd
|
|
-#endif
|
|
-VSX_IMPL_1RG(vec_double2, wd, vec_int4, wa, xvcvsxwdp, vec_ctdo)
|
|
-VSX_IMPL_1RG(vec_double2, wd, vec_uint4, wa, xvcvuxwdp, vec_ctdo)
|
|
-VSX_IMPL_1RG(vec_double2, wd, vec_dword2, wi, xvcvsxddp, vec_ctd)
|
|
-VSX_IMPL_1RG(vec_double2, wd, vec_udword2, wi, xvcvuxddp, vec_ctd)
|
|
+#undef vec_ctd
|
|
+VSX_IMPL_1RG(vec_double2, vec_int4, xvcvsxwdp, vec_ctdo)
|
|
+VSX_IMPL_1RG(vec_double2, vec_uint4, xvcvuxwdp, vec_ctdo)
|
|
+VSX_IMPL_1RG(vec_double2, vec_dword2, xvcvsxddp, vec_ctd)
|
|
+VSX_IMPL_1RG(vec_double2, vec_udword2, xvcvuxddp, vec_ctd)
|
|
|
|
// converts word and doubleword to single-precision
|
|
#undef vec_ctf
|
|
-VSX_IMPL_1RG(vec_float4, wf, vec_int4, wa, xvcvsxwsp, vec_ctf)
|
|
-VSX_IMPL_1RG(vec_float4, wf, vec_uint4, wa, xvcvuxwsp, vec_ctf)
|
|
-VSX_IMPL_1RG(vec_float4, wf, vec_dword2, wi, xvcvsxdsp, vec_ctfo)
|
|
-VSX_IMPL_1RG(vec_float4, wf, vec_udword2, wi, xvcvuxdsp, vec_ctfo)
|
|
+VSX_IMPL_1RG(vec_float4, vec_int4, xvcvsxwsp, vec_ctf)
|
|
+VSX_IMPL_1RG(vec_float4, vec_uint4, xvcvuxwsp, vec_ctf)
|
|
+VSX_IMPL_1RG(vec_float4, vec_dword2, xvcvsxdsp, vec_ctfo)
|
|
+VSX_IMPL_1RG(vec_float4, vec_udword2, xvcvuxdsp, vec_ctfo)
|
|
|
|
// converts single and double precision to signed word
|
|
#undef vec_cts
|
|
-VSX_IMPL_1RG(vec_int4, wa, vec_double2, wd, xvcvdpsxws, vec_ctso)
|
|
-VSX_IMPL_1RG(vec_int4, wa, vec_float4, wf, xvcvspsxws, vec_cts)
|
|
+VSX_IMPL_1RG(vec_int4, vec_double2, xvcvdpsxws, vec_ctso)
|
|
+VSX_IMPL_1RG(vec_int4, vec_float4, xvcvspsxws, vec_cts)
|
|
|
|
// converts single and double precision to unsigned word
|
|
#undef vec_ctu
|
|
-VSX_IMPL_1RG(vec_uint4, wa, vec_double2, wd, xvcvdpuxws, vec_ctuo)
|
|
-VSX_IMPL_1RG(vec_uint4, wa, vec_float4, wf, xvcvspuxws, vec_ctu)
|
|
+VSX_IMPL_1RG(vec_uint4, vec_double2, xvcvdpuxws, vec_ctuo)
|
|
+VSX_IMPL_1RG(vec_uint4, vec_float4, xvcvspuxws, vec_ctu)
|
|
|
|
// converts single and double precision to signed doubleword
|
|
-#ifdef vec_ctsl
|
|
-# undef vec_ctsl
|
|
-#endif
|
|
-VSX_IMPL_1RG(vec_dword2, wi, vec_double2, wd, xvcvdpsxds, vec_ctsl)
|
|
-VSX_IMPL_1RG(vec_dword2, wi, vec_float4, wf, xvcvspsxds, vec_ctslo)
|
|
+#undef vec_ctsl
|
|
+VSX_IMPL_1RG(vec_dword2, vec_double2, xvcvdpsxds, vec_ctsl)
|
|
+VSX_IMPL_1RG(vec_dword2, vec_float4, xvcvspsxds, vec_ctslo)
|
|
|
|
// converts single and double precision to unsigned doubleword
|
|
-#ifdef vec_ctul
|
|
-# undef vec_ctul
|
|
-#endif
|
|
-VSX_IMPL_1RG(vec_udword2, wi, vec_double2, wd, xvcvdpuxds, vec_ctul)
|
|
-VSX_IMPL_1RG(vec_udword2, wi, vec_float4, wf, xvcvspuxds, vec_ctulo)
|
|
+#undef vec_ctul
|
|
+VSX_IMPL_1RG(vec_udword2, vec_double2, xvcvdpuxds, vec_ctul)
|
|
+VSX_IMPL_1RG(vec_udword2, vec_float4, xvcvspuxds, vec_ctulo)
|
|
|
|
// just in case if GCC doesn't define it
|
|
#ifndef vec_xl
|