commit d308b11fa94728507984b4ccc949219511273ab6 Author: Jonathan Wakely Date: Fri Nov 11 15:22:02 2022 +0000 libstdc++: Fix wstring conversions in filesystem::path [PR95048] In commit r9-7381-g91756c4abc1757 I changed filesystem::path to use std::codecvt for conversions from all wide strings to UTF-8, instead of using std::codecvt_utf8. This was done because for 16-bit wchar_t, std::codecvt_utf8 only supports UCS-2 and not UTF-16. The rationale for the change was sound, but the actual fix was not. It's OK to use std::codecvt for char16_t or char32_t, because the specializations for those types always use UTF-8 , but std::codecvt uses the current locale's encodings, and the narrow encoding is probably ASCII and can't support non-ASCII characters. The correct fix is to use std::codecvt only for char16_t and char32_t. For 32-bit wchar_t we could have continued using std::codecvt_utf8 because that uses UTF-32 which is fine, switching to std::codecvt broke non-Windows targets with 32-bit wchar_t. For 16-bit wchar_t we did need to change, but should have changed to std::codecvt_utf8_utf16 instead, as that always uses UTF-16 not UCS-2. I actually noted that in the commit message for r9-7381-g91756c4abc1757 but didn't use that option. Oops. This replaces the unconditional std::codecvt with a type defined via template specialization, so it can vary depending on the wide character type. The code is also simplified to remove some of the mess of #ifdef and if-constexpr conditions. libstdc++-v3/ChangeLog: PR libstdc++/95048 * include/bits/fs_path.h (path::_Codecvt): New class template that selects the kind of code conversion done. (path::_Codecvt): Select based on sizeof(wchar_t). (_GLIBCXX_CONV_FROM_UTF8): New macro to allow the same code to be used for Windows and POSIX. (path::_S_convert(const EcharT*, const EcharT*)): Simplify by using _Codecvt and _GLIBCXX_CONV_FROM_UTF8 abstractions. (path::_S_str_convert(basic_string_view, const A&)): Simplify nested conditions. * include/experimental/bits/fs_path.h (path::_Cvt): Define nested typedef controlling type of code conversion done. (path::_Cvt::_S_wconvert): Use new typedef. (path::string(const A&)): Likewise. * testsuite/27_io/filesystem/path/construct/95048.cc: New test. * testsuite/experimental/filesystem/path/construct/95048.cc: New test. (cherry picked from commit b331bf303bdc1edead41e2b3d11d1a7804b433cf) diff --git a/libstdc++-v3/include/bits/fs_path.h b/libstdc++-v3/include/bits/fs_path.h index 1188e585b14..029a6f365b3 100644 --- a/libstdc++-v3/include/bits/fs_path.h +++ b/libstdc++-v3/include/bits/fs_path.h @@ -684,6 +684,8 @@ namespace __detail _List _M_cmpts; struct _Parser; + + template struct _Codecvt; }; /// @{ @@ -811,55 +813,72 @@ namespace __detail size_t _M_pos; }; + // path::_Codecvt Performs conversions between C and path::string_type. + // The native encoding of char strings is the OS-dependent current + // encoding for pathnames. FIXME: We assume this is UTF-8 everywhere, + // but should use a Windows API to query it. + + // Converts between native pathname encoding and char16_t or char32_t. + template + struct path::_Codecvt + // Need derived class here because std::codecvt has protected destructor. + : std::codecvt<_EcharT, char, mbstate_t> + { }; + + // Converts between native pathname encoding and native wide encoding. + // The native encoding for wide strings is the execution wide-character + // set encoding. FIXME: We assume that this is either UTF-32 or UTF-16 + // (depending on the width of wchar_t). That matches GCC's default, + // but can be changed with -fwide-exec-charset. + // We need a custom codecvt converting the native pathname encoding + // to/from the native wide encoding. + template<> + struct path::_Codecvt + : conditional_t, // UTF-8 <-> UTF-32 + std::codecvt_utf8_utf16> // UTF-8 <-> UTF-16 + { }; + template auto path::_S_convert(const _EcharT* __f, const _EcharT* __l) { static_assert(__detail::__is_encoded_char<_EcharT>); +#ifdef _GLIBCXX_FILESYSTEM_IS_WINDOWS +# define _GLIBCXX_CONV_FROM_UTF8(S) __detail::__wstr_from_utf8(S) +#else +# define _GLIBCXX_CONV_FROM_UTF8(S) S +#endif + if constexpr (is_same_v<_EcharT, value_type>) return basic_string_view(__f, __l - __f); -#if !defined _GLIBCXX_FILESYSTEM_IS_WINDOWS && defined _GLIBCXX_USE_CHAR8_T +#ifdef _GLIBCXX_USE_CHAR8_T else if constexpr (is_same_v<_EcharT, char8_t>) - // For POSIX converting from char8_t to char is also 'noconv' - return string_view(reinterpret_cast(__f), __l - __f); -#endif - else { + string_view __str(reinterpret_cast(__f), __l - __f); + return _GLIBCXX_CONV_FROM_UTF8(__str); + } +#endif #ifdef _GLIBCXX_FILESYSTEM_IS_WINDOWS + else if constexpr (is_same_v<_EcharT, char>) + { std::wstring __wstr; - if constexpr (is_same_v<_EcharT, char>) - { - struct _UCvt : std::codecvt - { } __cvt; - if (__str_codecvt_in_all(__f, __l, __wstr, __cvt)) - return __wstr; - } -#ifdef _GLIBCXX_USE_CHAR8_T - else if constexpr (is_same_v<_EcharT, char8_t>) - { - const auto __f2 = reinterpret_cast(__f); - return __detail::__wstr_from_utf8(string_view(__f2, __l - __f)); - } + path::_Codecvt __cvt; + if (__str_codecvt_in_all(__f, __l, __wstr, __cvt)) + return __wstr; + } #endif - else // char16_t or char32_t - { - struct _UCvt : std::codecvt<_EcharT, char, std::mbstate_t> - { } __cvt; - std::string __str; - if (__str_codecvt_out_all(__f, __l, __str, __cvt)) - return __detail::__wstr_from_utf8(__str); - } -#else // ! windows - struct _UCvt : std::codecvt<_EcharT, char, std::mbstate_t> - { } __cvt; + else + { + path::_Codecvt<_EcharT> __cvt; std::string __str; if (__str_codecvt_out_all(__f, __l, __str, __cvt)) - return __str; -#endif - __detail::__throw_conversion_error(); + return _GLIBCXX_CONV_FROM_UTF8(__str); } + __detail::__throw_conversion_error(); } +#undef _GLIBCXX_CONV_FROM_UTF8 /// @endcond @@ -1028,7 +1047,9 @@ namespace __detail if (__str.size() == 0) return _WString(__a); -#ifdef _GLIBCXX_FILESYSTEM_IS_WINDOWS +#ifndef _GLIBCXX_FILESYSTEM_IS_WINDOWS + string_view __u8str = __str; +#else // First convert native string from UTF-16 to to UTF-8. // XXX This assumes that the execution wide-character set is UTF-16. std::codecvt_utf8_utf16 __cvt; @@ -1038,35 +1059,30 @@ namespace __detail _String __u8str{_CharAlloc{__a}}; const value_type* __wfirst = __str.data(); const value_type* __wlast = __wfirst + __str.size(); - if (__str_codecvt_out_all(__wfirst, __wlast, __u8str, __cvt)) { + if (!__str_codecvt_out_all(__wfirst, __wlast, __u8str, __cvt)) + __detail::__throw_conversion_error(); if constexpr (is_same_v<_CharT, char>) return __u8str; // XXX assumes native ordinary encoding is UTF-8. - else { - - const char* __first = __u8str.data(); - const char* __last = __first + __u8str.size(); -#else - const value_type* __first = __str.data(); - const value_type* __last = __first + __str.size(); -#endif - - // Convert UTF-8 string to requested format. -#ifdef _GLIBCXX_USE_CHAR8_T - if constexpr (is_same_v<_CharT, char8_t>) - return _WString(__first, __last, __a); else #endif { - // Convert UTF-8 to wide string. - _WString __wstr(__a); - struct _UCvt : std::codecvt<_CharT, char, std::mbstate_t> { } __cvt; - if (__str_codecvt_in_all(__first, __last, __wstr, __cvt)) - return __wstr; - } + const char* __first = __u8str.data(); + const char* __last = __first + __u8str.size(); -#ifdef _GLIBCXX_FILESYSTEM_IS_WINDOWS - } } + // Convert UTF-8 string to requested format. +#ifdef _GLIBCXX_USE_CHAR8_T + if constexpr (is_same_v<_CharT, char8_t>) + return _WString(__first, __last, __a); + else #endif + { + // Convert UTF-8 to wide string. + _WString __wstr(__a); + path::_Codecvt<_CharT> __cvt; + if (__str_codecvt_in_all(__first, __last, __wstr, __cvt)) + return __wstr; + } + } __detail::__throw_conversion_error(); } /// @endcond diff --git a/libstdc++-v3/include/experimental/bits/fs_path.h b/libstdc++-v3/include/experimental/bits/fs_path.h index 830b51e45a0..65d63e433e0 100644 --- a/libstdc++-v3/include/experimental/bits/fs_path.h +++ b/libstdc++-v3/include/experimental/bits/fs_path.h @@ -748,15 +748,48 @@ namespace __detail template<> struct path::_Cvt { + // We need this type to be defined because we don't have `if constexpr` + // in C++11 and so path::string(const A&) needs to be able to + // declare a variable of this type and pass it to __str_codecvt_in_all. + using __codecvt_utf8_to_wide = _Cvt; + // Dummy overload used for unreachable calls in path::string. + template + friend bool + __str_codecvt_in_all(const char*, const char*, + _WStr&, __codecvt_utf8_to_wide&) noexcept + { return true; } + template static string_type _S_convert(_Iter __first, _Iter __last) { return string_type{__first, __last}; } }; + // Performs conversions from _CharT to path::string_type. template struct path::_Cvt { + // FIXME: We currently assume that the native wide encoding for wchar_t + // is either UTF-32 or UTF-16 (depending on the width of wchar_t). + // See comments in for further details. + using __codecvt_utf8_to_wchar + = typename conditional, // from UTF-32 + std::codecvt_utf8_utf16 // from UTF-16 + >::type; + + // Converts from char16_t or char32_t using std::codecvt. + // Need derived class here because std::codecvt has protected destructor. + struct __codecvt_utf8_to_utfNN : std::codecvt<_CharT, char, mbstate_t> + { }; + + // Convert from native pathname format (assumed to be UTF-8 everywhere) + // to the encoding implied by the wide character type _CharT. + using __codecvt_utf8_to_wide + = typename conditional::value, + __codecvt_utf8_to_wchar, + __codecvt_utf8_to_utfNN>::type; + #ifdef _GLIBCXX_FILESYSTEM_IS_WINDOWS #ifdef _GLIBCXX_USE_CHAR8_T static string_type @@ -774,8 +807,7 @@ namespace __detail static string_type _S_wconvert(const char* __f, const char* __l, const char*) { - using _Cvt = std::codecvt; - const auto& __cvt = std::use_facet<_Cvt>(std::locale{}); + std::codecvt_utf8_utf16 __cvt; std::wstring __wstr; if (__str_codecvt_in_all(__f, __l, __wstr, __cvt)) return __wstr; @@ -787,8 +819,7 @@ namespace __detail static string_type _S_wconvert(const _CharT* __f, const _CharT* __l, const void*) { - struct _UCvt : std::codecvt<_CharT, char, std::mbstate_t> - { } __cvt; + __codecvt_utf8_to_wide __cvt; std::string __str; if (__str_codecvt_out_all(__f, __l, __str, __cvt)) { @@ -819,8 +850,7 @@ namespace __detail else #endif { - struct _UCvt : std::codecvt<_CharT, char, std::mbstate_t> - { } __cvt; + __codecvt_utf8_to_wide __cvt; std::string __str; if (__str_codecvt_out_all(__f, __l, __str, __cvt)) return __str; @@ -990,7 +1020,7 @@ namespace __detail inline std::basic_string<_CharT, _Traits, _Allocator> path::string(const _Allocator& __a) const { - if (is_same<_CharT, value_type>::value) + if _GLIBCXX_CONSTEXPR (is_same<_CharT, value_type>::value) return { _M_pathname.begin(), _M_pathname.end(), __a }; using _WString = basic_string<_CharT, _Traits, _Allocator>; @@ -1026,9 +1056,8 @@ namespace __detail else #endif { - // Convert UTF-8 to wide string. - struct _UCvt : std::codecvt<_CharT, char, std::mbstate_t> - { } __cvt; + // Convert UTF-8 to char16_t or char32_t string. + typename path::_Cvt<_CharT>::__codecvt_utf8_to_wide __cvt; const char* __f = __from.data(); const char* __l = __f + __from.size(); if (__str_codecvt_in_all(__f, __l, __to, __cvt)) @@ -1041,14 +1070,14 @@ namespace __detail if (auto* __p = __dispatch(__u8str, __wstr, is_same<_CharT, char>{})) return *__p; } -#else +#else // ! Windows #ifdef _GLIBCXX_USE_CHAR8_T if constexpr (is_same<_CharT, char8_t>::value) return _WString(__first, __last, __a); else #endif { - struct _UCvt : std::codecvt<_CharT, char, std::mbstate_t> { } __cvt; + typename path::_Cvt<_CharT>::__codecvt_utf8_to_wide __cvt; _WString __wstr(__a); if (__str_codecvt_in_all(__first, __last, __wstr, __cvt)) return __wstr; diff --git a/libstdc++-v3/testsuite/27_io/filesystem/path/construct/95048.cc b/libstdc++-v3/testsuite/27_io/filesystem/path/construct/95048.cc new file mode 100644 index 00000000000..c1a382d1420 --- /dev/null +++ b/libstdc++-v3/testsuite/27_io/filesystem/path/construct/95048.cc @@ -0,0 +1,45 @@ +// { dg-do run { target c++17 } } + +// C++17 30.10.8.4.1 path constructors [fs.path.construct] + +#include +#include + +using std::filesystem::path; + +#define CHECK(E, S) (path(E##S) == path(u8##S)) + +void +test_wide() +{ + VERIFY( CHECK(L, "\u00E4") ); // PR libstdc++/95048 + VERIFY( CHECK(L, "\U0001F4C1") ); // folder + VERIFY( CHECK(L, "\U0001F4C2") ); // open folder + VERIFY( CHECK(L, "\U0001F4C4") ); // filing cabient +} + +void +test_u16() +{ + VERIFY( CHECK(u, "\u00E4") ); // PR libstdc++/95048 + VERIFY( CHECK(u, "\U0001F4C1") ); // folder + VERIFY( CHECK(u, "\U0001F4C2") ); // open folder + VERIFY( CHECK(u, "\U0001F4C4") ); // filing cabient +} + +void +test_u32() +{ + VERIFY( CHECK(U, "\u00E4") ); // PR libstdc++/95048 + VERIFY( CHECK(U, "\U0001F4C1") ); // folder + VERIFY( CHECK(U, "\U0001F4C2") ); // open folder + VERIFY( CHECK(U, "\U0001F4C4") ); // filing cabient +} + +int +main() +{ + test_wide(); + test_u16(); + test_u32(); +} diff --git a/libstdc++-v3/testsuite/experimental/filesystem/path/construct/95048.cc b/libstdc++-v3/testsuite/experimental/filesystem/path/construct/95048.cc new file mode 100644 index 00000000000..b7a93f3c985 --- /dev/null +++ b/libstdc++-v3/testsuite/experimental/filesystem/path/construct/95048.cc @@ -0,0 +1,47 @@ +// { dg-options "-lstdc++fs" } +// { dg-do run { target c++11 } } +// { dg-require-filesystem-ts "" } + +// 8.4.1 path constructors [path.construct] + +#include +#include + +using std::experimental::filesystem::path; + +#define CHECK(E, S) (path(E##S) == path(u8##S)) + +void +test_wide() +{ + VERIFY( CHECK(L, "\u00E4") ); // PR libstdc++/95048 + VERIFY( CHECK(L, "\U0001F4C1") ); // folder + VERIFY( CHECK(L, "\U0001F4C2") ); // open folder + VERIFY( CHECK(L, "\U0001F4C4") ); // filing cabient +} + +void +test_u16() +{ + VERIFY( CHECK(u, "\u00E4") ); // PR libstdc++/95048 + VERIFY( CHECK(u, "\U0001F4C1") ); // folder + VERIFY( CHECK(u, "\U0001F4C2") ); // open folder + VERIFY( CHECK(u, "\U0001F4C4") ); // filing cabient +} + +void +test_u32() +{ + VERIFY( CHECK(U, "\u00E4") ); // PR libstdc++/95048 + VERIFY( CHECK(U, "\U0001F4C1") ); // folder + VERIFY( CHECK(U, "\U0001F4C2") ); // open folder + VERIFY( CHECK(U, "\U0001F4C4") ); // filing cabient +} + +int +main() +{ + test_wide(); + test_u16(); + test_u32(); +}