apache · goel-skd · Jun 19, 2026 · Jun 19, 2026 · Jun 26, 2026 · wgtmac
diff --git a/cmake_modules/IcebergThirdpartyToolchain.cmake b/cmake_modules/IcebergThirdpartyToolchain.cmake
@@ -421,6 +421,62 @@ function(resolve_croaring_dependency)
       PARENT_SCOPE)
 endfunction()
 
+# ----------------------------------------------------------------------
+# utf8proc
+
+# Resolve the utf8proc dependency through FetchContent.
+#
+# The dependency is declared with both a download URL and FIND_PACKAGE_ARGS, so it is
+# either built from fetched sources (vendored) or taken from an installed utf8proc CMake
+# config package. The download URL can be overridden with the ICEBERG_UTF8PROC_URL
+# environment variable.
+#
+# Variables written to the caller's scope:
+#   UTF8PROC_VENDORED           - TRUE when built from fetched sources, FALSE otherwise.
+#   ICEBERG_SYSTEM_DEPENDENCIES - gains "utf8proc" when the dependency is not vendored.
+function(resolve_utf8proc_dependency)
+  prepare_fetchcontent()
+
+  # The environment is only consulted while configuring.
+  if(DEFINED ENV{ICEBERG_UTF8PROC_URL})
+    set(UTF8PROC_URL "$ENV{ICEBERG_UTF8PROC_URL}")
+  else()
+    set(UTF8PROC_URL
+        "https://github.com/JuliaStrings/utf8proc/archive/refs/tags/v2.10.0.tar.gz")
+  endif()
+
+  fetchcontent_declare(utf8proc
+                       ${FC_DECLARE_COMMON_OPTIONS}
+                       URL ${UTF8PROC_URL}
+                           FIND_PACKAGE_ARGS
+                           NAMES
+                           utf8proc
+                           CONFIG)
+  fetchcontent_makeavailable(utf8proc)
+
+  # utf8proc_SOURCE_DIR is used as the vendored/system discriminator: a non-empty value
+  # is treated as "the sources were fetched and added to this build".
+  if(utf8proc_SOURCE_DIR)
+    # Make sure the namespaced target exists: wrap the plain `utf8proc` library target
+    # and add the fetched source directory to the wrapper's include path.
+    if(NOT TARGET utf8proc::utf8proc)
+      add_library(utf8proc::utf8proc INTERFACE IMPORTED)
+      target_link_libraries(utf8proc::utf8proc INTERFACE utf8proc)
+      target_include_directories(utf8proc::utf8proc INTERFACE ${utf8proc_SOURCE_DIR})
+    endif()
+
+    set(UTF8PROC_VENDORED TRUE)
+    # utf8proc's CMake puts a raw build-tree path in INTERFACE_INCLUDE_DIRECTORIES, which
+    # install(EXPORT) rejects. Wrap it in BUILD_INTERFACE so the export is valid; utf8proc
+    # is a private dependency, so installed consumers never need its headers.
+    # NOTE(review): the library file is renamed to iceberg_vendored_utf8proc, presumably
+    # to avoid clashing with a separately installed utf8proc -- confirm.
+    set_target_properties(utf8proc
+                          PROPERTIES OUTPUT_NAME "iceberg_vendored_utf8proc"
+                                     POSITION_INDEPENDENT_CODE ON
+                                     INTERFACE_INCLUDE_DIRECTORIES
+                                     "$<BUILD_INTERFACE:${utf8proc_SOURCE_DIR}>")
+    # Ship the vendored library as part of the iceberg_targets export set.
+    install(TARGETS utf8proc
+            EXPORT iceberg_targets
+            RUNTIME DESTINATION "${ICEBERG_INSTALL_BINDIR}"
+            ARCHIVE DESTINATION "${ICEBERG_INSTALL_LIBDIR}"
+            LIBRARY DESTINATION "${ICEBERG_INSTALL_LIBDIR}")
+  else()
+    # Not vendored: record the package name in ICEBERG_SYSTEM_DEPENDENCIES.
+    # NOTE(review): presumably consumed when generating the installed package config so
+    # that downstream projects re-find utf8proc -- confirm against the list's consumer.
+    set(UTF8PROC_VENDORED FALSE)
+    list(APPEND ICEBERG_SYSTEM_DEPENDENCIES utf8proc)
+  endif()
+
+  # A function has its own variable scope, so publish the results to the caller.
+  set(ICEBERG_SYSTEM_DEPENDENCIES
+      ${ICEBERG_SYSTEM_DEPENDENCIES}
+      PARENT_SCOPE)
+  set(UTF8PROC_VENDORED
+      ${UTF8PROC_VENDORED}
+      PARENT_SCOPE)
+endfunction()
+
 # ----------------------------------------------------------------------
 # nlohmann-json
 
@@ -719,6 +775,7 @@ endfunction()
 resolve_zlib_dependency()
 resolve_nanoarrow_dependency()
 resolve_croaring_dependency()
+resolve_utf8proc_dependency()
 resolve_nlohmann_json_dependency()
 resolve_spdlog_dependency()
 

diff --git a/src/iceberg/CMakeLists.txt b/src/iceberg/CMakeLists.txt
@@ -145,23 +145,27 @@ list(APPEND
      "$<IF:$<BOOL:${NANOARROW_VENDORED}>,nanoarrow::nanoarrow_static,$<IF:$<TARGET_EXISTS:nanoarrow::nanoarrow_static>,nanoarrow::nanoarrow_static,nanoarrow::nanoarrow_shared>>"
      nlohmann_json::nlohmann_json
      spdlog::spdlog
+     utf8proc::utf8proc
      ZLIB::ZLIB)
 list(APPEND
      ICEBERG_SHARED_BUILD_INTERFACE_LIBS
      "$<IF:$<BOOL:${NANOARROW_VENDORED}>,nanoarrow::nanoarrow_static,$<IF:$<TARGET_EXISTS:nanoarrow::nanoarrow_shared>,nanoarrow::nanoarrow_shared,nanoarrow::nanoarrow_static>>"
      nlohmann_json::nlohmann_json
      spdlog::spdlog
+     utf8proc::utf8proc
      ZLIB::ZLIB)
 list(APPEND
      ICEBERG_STATIC_INSTALL_INTERFACE_LIBS
      "$<IF:$<BOOL:${NANOARROW_VENDORED}>,iceberg::nanoarrow_static,$<IF:$<TARGET_EXISTS:nanoarrow::nanoarrow_static>,nanoarrow::nanoarrow_static,nanoarrow::nanoarrow_shared>>"
      "$<IF:$<BOOL:${NLOHMANN_JSON_VENDORED}>,iceberg::nlohmann_json,$<IF:$<TARGET_EXISTS:nlohmann_json::nlohmann_json>,nlohmann_json::nlohmann_json,nlohmann_json::nlohmann_json>>"
-     "$<IF:$<BOOL:${SPDLOG_VENDORED}>,iceberg::spdlog,spdlog::spdlog>")
+     "$<IF:$<BOOL:${SPDLOG_VENDORED}>,iceberg::spdlog,spdlog::spdlog>"
+     "$<IF:$<BOOL:${UTF8PROC_VENDORED}>,iceberg::utf8proc,utf8proc::utf8proc>")
 list(APPEND
      ICEBERG_SHARED_INSTALL_INTERFACE_LIBS
      "$<IF:$<BOOL:${NANOARROW_VENDORED}>,iceberg::nanoarrow_static,$<IF:$<TARGET_EXISTS:nanoarrow::nanoarrow_shared>,nanoarrow::nanoarrow_shared,nanoarrow::nanoarrow_static>>"
      "$<IF:$<BOOL:${NLOHMANN_JSON_VENDORED}>,iceberg::nlohmann_json,$<IF:$<TARGET_EXISTS:nlohmann_json::nlohmann_json>,nlohmann_json::nlohmann_json,nlohmann_json::nlohmann_json>>"
-     "$<IF:$<BOOL:${SPDLOG_VENDORED}>,iceberg::spdlog,spdlog::spdlog>")
+     "$<IF:$<BOOL:${SPDLOG_VENDORED}>,iceberg::spdlog,spdlog::spdlog>"
+     "$<IF:$<BOOL:${UTF8PROC_VENDORED}>,iceberg::utf8proc,utf8proc::utf8proc>")
 
 add_iceberg_lib(iceberg
                 SOURCES

diff --git a/src/iceberg/meson.build b/src/iceberg/meson.build
@@ -190,8 +190,15 @@ nanoarrow_dep = dependency('nanoarrow')
 nlohmann_json_dep = dependency('nlohmann_json')
 spdlog_dep = dependency('spdlog')
 zlib_dep = dependency('zlib')
+utf8proc_dep = dependency('libutf8proc')
 
-iceberg_deps = [nanoarrow_dep, nlohmann_json_dep, spdlog_dep, zlib_dep]
+iceberg_deps = [
+    nanoarrow_dep,
+    nlohmann_json_dep,
+    spdlog_dep,
+    zlib_dep,
+    utf8proc_dep,
+]
 
 iceberg_lib = library(
     'iceberg',

diff --git a/src/iceberg/test/string_util_test.cc b/src/iceberg/test/string_util_test.cc
@@ -41,19 +41,30 @@ TEST(StringUtilsTest, ToUpper) {
   ASSERT_EQ(StringUtils::ToUpper("123"), "123");
 }
 
-// Non-ASCII (multibyte UTF-8) bytes have the high bit set, i.e. are negative when stored
-// in a signed char. Only ASCII letters are converted; multibyte bytes pass through
-// unchanged. The non-ASCII strings are written as explicit UTF-8 byte escapes so the test
-// does not depend on the source-file encoding. See
-// https://github.com/apache/iceberg-cpp/issues/613.
-TEST(StringUtilsTest, NonAsciiPassThrough) {
-  // "Naïve" -> "naïve" (ï = U+00EF = 0xC3 0xAF; only the ASCII letters change).
-  ASSERT_EQ(StringUtils::ToLower("Na\xC3\xAFve"), "na\xC3\xAFve");
-  // "café" -> "CAFé" (é = U+00E9 = 0xC3 0xA9 stays unchanged).
-  ASSERT_EQ(StringUtils::ToUpper("caf\xC3\xA9"), "CAF\xC3\xA9");
-  // "日本語" (0xE6 0x97 0xA5 0xE6 0x9C 0xAC 0xE8 0xAA 0x9E) is returned verbatim.
+// Non-ASCII strings are written as explicit UTF-8 byte escapes so the test does not
+// depend on the source-file encoding. An escape is split before a following hex digit
+// (e.g. "...\x9E" "E") so the \x does not absorb it.
+// See https://github.com/apache/iceberg-cpp/issues/613.
+TEST(StringUtilsTest, ToLowerUnicode) {
+  // "CAFÉ" -> "café" (É U+00C9 = 0xC3 0x89 -> é U+00E9 = 0xC3 0xA9).
+  ASSERT_EQ(StringUtils::ToLower("CAF\xC3\x89"), "caf\xC3\xA9");
+  // "GROẞE" -> "große": ẞ (U+1E9E = 0xE1 0xBA 0x9E) -> ß (U+00DF = 0xC3 0x9F), not "ss"
+  // as casefolding would produce. The result is one byte shorter than the input.
+  ASSERT_EQ(StringUtils::ToLower("GRO\xE1\xBA\x9E"
+                                 "E"),
+            "gro\xC3\x9F"
+            "e");
+  // "日本語" (U+65E5 U+672C U+8A9E) has no case mapping and is returned verbatim.
   ASSERT_EQ(StringUtils::ToLower("\xE6\x97\xA5\xE6\x9C\xAC\xE8\xAA\x9E"),
             "\xE6\x97\xA5\xE6\x9C\xAC\xE8\xAA\x9E");
+  // Invalid UTF-8 (a lone 0xFF byte) is returned unchanged rather than erroring.
+  ASSERT_EQ(StringUtils::ToLower("\xFF"), "\xFF");
+}
+
+// ToUpper is intentionally ASCII-only; non-ASCII (multibyte UTF-8) bytes pass through.
+TEST(StringUtilsTest, ToUpperAsciiOnly) {
+  // "café" -> "CAFé" (é = U+00E9 = 0xC3 0xA9 stays unchanged).
+  ASSERT_EQ(StringUtils::ToUpper("caf\xC3\xA9"), "CAF\xC3\xA9");
+  // "日本語" contains no ASCII letters, so every byte is returned verbatim.
   ASSERT_EQ(StringUtils::ToUpper("\xE6\x97\xA5\xE6\x9C\xAC\xE8\xAA\x9E"),
             "\xE6\x97\xA5\xE6\x9C\xAC\xE8\xAA\x9E");
 }
@@ -63,9 +74,15 @@ TEST(StringUtilsTest, EqualsIgnoreCase) {
   ASSERT_TRUE(StringUtils::EqualsIgnoreCase("", ""));
   ASSERT_FALSE(StringUtils::EqualsIgnoreCase("abc", "abcd"));
   ASSERT_FALSE(StringUtils::EqualsIgnoreCase("abc", "abd"));
-  // ASCII case is folded; non-ASCII bytes are compared as-is. ("Café" vs "café")
-  ASSERT_TRUE(StringUtils::EqualsIgnoreCase("Caf\xC3\xA9", "caf\xC3\xA9"));
-  // "café" vs "cafe": the multibyte é differs from ASCII 'e'.
+  // Unicode-aware: "CAFÉ" matches "café".
+  ASSERT_TRUE(StringUtils::EqualsIgnoreCase("CAF\xC3\x89", "caf\xC3\xA9"));
+  // "GROẞE" matches "große" under lowercasing (ẞ -> ß).
+  ASSERT_TRUE(
+      StringUtils::EqualsIgnoreCase("GRO\xE1\xBA\x9E"
+                                    "E",
+                                    "gro\xC3\x9F"
+                                    "e"));
+  // Different letters still differ ("café" vs "cafe").
   ASSERT_FALSE(StringUtils::EqualsIgnoreCase("caf\xC3\xA9", "cafe"));
 }
 

diff --git a/src/iceberg/util/string_util.cc b/src/iceberg/util/string_util.cc
@@ -19,10 +19,41 @@
 
 #include "iceberg/util/string_util.h"
 
+#include <utf8proc.h>
+
+#include <array>
+
 #include "iceberg/util/macros.h"
 
 namespace iceberg {
 
+std::string StringUtils::ToLower(std::string_view str) {
+  // Decode one code point at a time, apply utf8proc's simple (1:1) lower-case mapping
+  // and re-encode it. This is not casefolding, so e.g. U+1E9E maps to U+00DF, not "ss".
+  const auto* const bytes = reinterpret_cast<const utf8proc_uint8_t*>(str.data());
+  const auto total = static_cast<utf8proc_ssize_t>(str.size());
+
+  std::string lowered;
+  lowered.reserve(str.size());  // A hint only: a mapping may change the encoded length.
+  for (utf8proc_ssize_t pos = 0; pos < total;) {
+    utf8proc_int32_t decoded = 0;
+    const utf8proc_ssize_t step = utf8proc_iterate(bytes + pos, total - pos, &decoded);
+    if (step < 0) {
+      // Malformed UTF-8 anywhere in the input: give back the original string untouched
+      // instead of reporting an error.
+      return std::string(str);
+    }
+    pos += step;
+
+    std::array<utf8proc_uint8_t, 4> utf8{};  // A UTF-8 sequence is at most four bytes.
+    const utf8proc_ssize_t length =
+        utf8proc_encode_char(utf8proc_tolower(decoded), utf8.data());
+    lowered.append(reinterpret_cast<const char*>(utf8.data()),
+                   static_cast<size_t>(length));
+  }
+  return lowered;
+}
+
 Result<std::vector<uint8_t>> StringUtils::HexStringToBytes(std::string_view hex) {
   if (hex.size() % 2 != 0) [[unlikely]] {
     return InvalidArgument("Hex string must have even length, got: {}", hex.size());

diff --git a/src/iceberg/util/string_util.h b/src/iceberg/util/string_util.h
@@ -20,7 +20,6 @@
 #pragma once
 
 #include <algorithm>
-#include <cctype>
 #include <cerrno>
 #include <charconv>
 #include <ranges>
@@ -41,22 +40,24 @@ concept FromChars = requires(const char* p, T& v) { std::from_chars(p, p, v); };
 
 class ICEBERG_EXPORT StringUtils {
  public:
-  // NOTE: These convert ASCII letters only; all other bytes, including non-ASCII
-  // (multibyte UTF-8) bytes, are passed through unchanged.
-  // See https://github.com/apache/iceberg-cpp/issues/613.
-  static std::string ToLower(std::string_view str) {
-    return str | std::ranges::views::transform(ToLowerAscii) |
-           std::ranges::to<std::string>();
-  }
-
+  /// \brief Lower-case a UTF-8 string using Unicode simple case mapping.
+  ///
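+  /// Intended to match Iceberg Java's case-insensitive handling, which lower-cases names
+  /// with toLowerCase(Locale.ROOT). Only simple (1:1) code point mappings are applied,
+  /// so results can differ from Java for the few characters whose lower-case form is
+  /// context-dependent or longer than one code point. Invalid UTF-8 input is returned
+  /// unchanged.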
+  /// See https://github.com/apache/iceberg-cpp/issues/613.
+  static std::string ToLower(std::string_view str);
+
+  /// \brief Upper-case ASCII letters; non-ASCII (multibyte UTF-8) bytes pass through
+  /// unchanged.
+  ///
+  /// Unlike ToLower this is ASCII-only, since upper-casing is not used for name matching.
   static std::string ToUpper(std::string_view str) {
     return str | std::ranges::views::transform(ToUpperAscii) |
            std::ranges::to<std::string>();
   }
 
   static bool EqualsIgnoreCase(std::string_view lhs, std::string_view rhs) {
-    return std::ranges::equal(
-        lhs, rhs, [](char lc, char rc) { return ToLowerAscii(lc) == ToLowerAscii(rc); });
+    return ToLower(lhs) == ToLower(rhs);
   }
 
   static bool StartsWithIgnoreCase(std::string_view str, std::string_view prefix) {
@@ -134,14 +135,8 @@ class ICEBERG_EXPORT StringUtils {
   }
 
  private:
-  // ASCII-only case conversion using explicit range checks rather than
-  // std::tolower/std::toupper. This is independent of the current C locale and never
-  // touches non-ASCII (high-bit) bytes, so multibyte UTF-8 sequences are preserved. It
-  // also sidesteps the undefined behavior of passing a negative char to <cctype>.
-  static constexpr char ToLowerAscii(char c) noexcept {
-    return (c >= 'A' && c <= 'Z') ? static_cast<char>(c - 'A' + 'a') : c;
-  }
-
+  // Avoids std::toupper, which is locale-dependent and has undefined behavior for
+  // negative char values.
   static constexpr char ToUpperAscii(char c) noexcept {
     return (c >= 'a' && c <= 'z') ? static_cast<char>(c - 'a' + 'A') : c;
   }

diff --git a/subprojects/utf8proc.wrap b/subprojects/utf8proc.wrap
@@ -0,0 +1,30 @@
+# Licensed to the Apache Software Foundation (ASF) under one
+# or more contributor license agreements.  See the NOTICE file
+# distributed with this work for additional information
+# regarding copyright ownership.  The ASF licenses this file
+# to you under the Apache License, Version 2.0 (the
+# "License"); you may not use this file except in compliance
+# with the License.  You may obtain a copy of the License at
+#
+#   http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing,
+# software distributed under the License is distributed on an
+# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+# KIND, either express or implied.  See the License for the
+# specific language governing permissions and limitations
+# under the License.
+
+[wrap-file]
+directory = utf8proc-2.10.0
+source_url = https://github.com/JuliaStrings/utf8proc/releases/download/v2.10.0/utf8proc-2.10.0.tar.gz
+source_filename = utf8proc-2.10.0.tar.gz
+source_hash = 276a37dc4d1dd24d7896826a579f4439d1e5fe33603add786bb083cab802e23e
+patch_filename = utf8proc_2.10.0-1_patch.zip
+patch_url = https://wrapdb.mesonbuild.com/v2/utf8proc_2.10.0-1/get_patch
+patch_hash = be16c4514603e922f9636045699fe1a6f844d340b9b7c14b809e47253b06a844
+source_fallback_url = https://github.com/mesonbuild/wrapdb/releases/download/utf8proc_2.10.0-1/utf8proc-2.10.0.tar.gz
+wrapdb_version = 2.10.0-1
+
+[provide]
+libutf8proc = utf8proc_dep