From 5df60d85f71215b091430a32c7834dd9c90bf79a Mon Sep 17 00:00:00 2001
From: sfan5
Date: Sun, 24 Mar 2024 19:53:54 +0100
Subject: [PATCH] Cache iconv context per-thread

utf8_to_wide() and wide_to_utf8() previously went through a convert()
helper that called iconv_open() and iconv_close() on every invocation.
Each function now keeps its own thread_local conversion descriptor and
opens it on first use; convert() takes the descriptor as a parameter.

convert() resets the conversion state before each use of the cached
descriptor, and a unit test checks that a byte sequence split across two
calls is not decoded as if it were one sequence.

---
 src/unittest/test_utilities.cpp |  7 +++++
 src/util/string.cpp             | 49 ++++++++++++++++++++++++++-------
 2 files changed, 46 insertions(+), 10 deletions(-)

diff --git a/src/unittest/test_utilities.cpp b/src/unittest/test_utilities.cpp
index 5c653a529..16032d130 100644
--- a/src/unittest/test_utilities.cpp
+++ b/src/unittest/test_utilities.cpp
@@ -318,6 +318,7 @@ void TestUtilities::testUTF8()
 	UASSERTEQ(std::string, wide_to_utf8(utf8_to_wide("")), "");
 	UASSERTEQ(std::string, wide_to_utf8(utf8_to_wide("the shovel dug a crumbly node!")),
 		"the shovel dug a crumbly node!");
+
 	UASSERTEQ(std::string, wide_to_utf8(utf8_to_wide(u8"-ä-")),
 		u8"-ä-");
 	UASSERTEQ(std::string, wide_to_utf8(utf8_to_wide(u8"-\U0002000b-")),
@@ -326,6 +327,12 @@ void TestUtilities::testUTF8()
 		const auto *literal = U"-\U0002000b-";
 		UASSERT(utf8_to_wide(u8"-\U0002000b-") == reinterpret_cast<const wchar_t*>(literal));
 	}
+
+	// try to check that the conversion function does not accidentally keep
+	// its internal state across invocations.
+	// \xC4\x81 is UTF-8 for \u0101
+	utf8_to_wide("\xC4");
+	UASSERT(utf8_to_wide("\x81") != L"\u0101");
 }
 
 void TestUtilities::testRemoveEscapes()
diff --git a/src/util/string.cpp b/src/util/string.cpp
index eb65373a4..0c896e6ec 100644
--- a/src/util/string.cpp
+++ b/src/util/string.cpp
@@ -41,28 +41,49 @@ with this program; if not, write to the Free Software Foundation, Inc.,
 
 #ifndef _WIN32
 
-static bool convert(const char *to, const char *from, char *outbuf,
-		size_t *outbuf_size, char *inbuf, size_t inbuf_size)
+namespace {
+	class IconvSmartPointer {
+		iconv_t m_cd;
+		static const iconv_t null_value;
+	public:
+		IconvSmartPointer() : m_cd(null_value) {}
+		~IconvSmartPointer() { reset(); }
+
+		DISABLE_CLASS_COPY(IconvSmartPointer)
+		ALLOW_CLASS_MOVE(IconvSmartPointer)
+
+		iconv_t get() const { return m_cd; }
+		operator bool() const { return m_cd != null_value; }
+		void reset(iconv_t cd = null_value) {
+			if (m_cd != null_value)
+				iconv_close(m_cd);
+			m_cd = cd;
+		}
+	};
+
+	// note that this can't be constexpr if iconv_t is a pointer
+	const iconv_t IconvSmartPointer::null_value = (iconv_t) -1;
+}
+
+static bool convert(iconv_t cd, char *outbuf, size_t *outbuf_size,
+	char *inbuf, size_t inbuf_size)
 {
-	iconv_t cd = iconv_open(to, from);
+	// reset conversion state
+	iconv(cd, nullptr, nullptr, nullptr, nullptr);
 
 	char *inbuf_ptr = inbuf;
 	char *outbuf_ptr = outbuf;
 
-	size_t *inbuf_left_ptr = &inbuf_size;
-
 	const size_t old_outbuf_size = *outbuf_size;
 	size_t old_size = inbuf_size;
 	while (inbuf_size > 0) {
-		iconv(cd, &inbuf_ptr, inbuf_left_ptr, &outbuf_ptr, outbuf_size);
+		iconv(cd, &inbuf_ptr, &inbuf_size, &outbuf_ptr, outbuf_size);
 		if (inbuf_size == old_size) {
-			iconv_close(cd);
 			return false;
 		}
 		old_size = inbuf_size;
 	}
 
-	iconv_close(cd);
 	*outbuf_size = old_outbuf_size - *outbuf_size;
 	return true;
 }
@@ -80,6 +101,10 @@ constexpr auto DEFAULT_ENCODING = ([] () -> const char* {
 
 std::wstring utf8_to_wide(std::string_view input)
 {
+	thread_local IconvSmartPointer cd;
+	if (!cd)
+		cd.reset(iconv_open(DEFAULT_ENCODING, "UTF-8"));
+
 	const size_t inbuf_size = input.length();
 	// maximum possible size, every character is sizeof(wchar_t) bytes
 	size_t outbuf_size = input.length() * sizeof(wchar_t);
@@ -90,7 +115,7 @@ std::wstring utf8_to_wide(std::string_view input)
 	out.resize(outbuf_size / sizeof(wchar_t));
 
 	char *outbuf = reinterpret_cast<char*>(&out[0]);
-	if (!convert(DEFAULT_ENCODING, "UTF-8", outbuf, &outbuf_size, inbuf, inbuf_size)) {
+	if (!convert(cd.get(), outbuf, &outbuf_size, inbuf, inbuf_size)) {
 		infostream << "Couldn't convert UTF-8 string 0x" << hex_encode(input)
 			<< " into wstring" << std::endl;
 		delete[] inbuf;
@@ -104,6 +129,10 @@ std::wstring utf8_to_wide(std::string_view input)
 
 std::string wide_to_utf8(std::wstring_view input)
 {
+	thread_local IconvSmartPointer cd;
+	if (!cd)
+		cd.reset(iconv_open("UTF-8", DEFAULT_ENCODING));
+
 	const size_t inbuf_size = input.length() * sizeof(wchar_t);
 	// maximum possible size: utf-8 encodes codepoints using 1 up to 4 bytes
 	size_t outbuf_size = input.length() * 4;
@@ -113,7 +142,7 @@ std::string wide_to_utf8(std::wstring_view input)
 	std::string out;
 	out.resize(outbuf_size);
 
-	if (!convert("UTF-8", DEFAULT_ENCODING, &out[0], &outbuf_size, inbuf, inbuf_size)) {
+	if (!convert(cd.get(), &out[0], &outbuf_size, inbuf, inbuf_size)) {
 		infostream << "Couldn't convert wstring 0x" << hex_encode(inbuf, inbuf_size)
 			<< " into UTF-8 string" << std::endl;
 		delete[] inbuf;