Skip to content
Open
Show file tree
Hide file tree
Changes from 1 commit
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
57 changes: 57 additions & 0 deletions cmake_modules/IcebergThirdpartyToolchain.cmake
Original file line number Diff line number Diff line change
Expand Up @@ -421,6 +421,62 @@ function(resolve_croaring_dependency)
PARENT_SCOPE)
endfunction()

# ----------------------------------------------------------------------
# utf8proc

# Resolve the utf8proc dependency through FetchContent.
#
# The download URL can be overridden with the ICEBERG_UTF8PROC_URL environment
# variable (read at configure time); otherwise the pinned v2.10.0 tag archive is used.
#
# Outputs (set in the caller's scope):
#   UTF8PROC_VENDORED           - TRUE when utf8proc_SOURCE_DIR is set after
#                                 fetchcontent_makeavailable(), FALSE otherwise.
#   ICEBERG_SYSTEM_DEPENDENCIES - gets "utf8proc" appended in the non-vendored case.
function(resolve_utf8proc_dependency)
prepare_fetchcontent()

# Allow the source archive location to be overridden from the environment.
if(DEFINED ENV{ICEBERG_UTF8PROC_URL})
set(UTF8PROC_URL "$ENV{ICEBERG_UTF8PROC_URL}")
else()
set(UTF8PROC_URL
"https://github.com/JuliaStrings/utf8proc/archive/refs/tags/v2.10.0.tar.gz")
endif()

# FIND_PACKAGE_ARGS lets FetchContent try find_package(utf8proc CONFIG) first.
# NOTE(review): whether the URL download is used as the fallback also depends on
# FC_DECLARE_COMMON_OPTIONS, which is defined outside this function - confirm.
fetchcontent_declare(utf8proc
${FC_DECLARE_COMMON_OPTIONS}
URL ${UTF8PROC_URL}
FIND_PACKAGE_ARGS
NAMES
utf8proc
CONFIG)
fetchcontent_makeavailable(utf8proc)

# This branch treats a non-empty utf8proc_SOURCE_DIR as "built from source"
# (presumably it is left unset when find_package satisfied the dependency).
if(utf8proc_SOURCE_DIR)
# Provide the namespaced utf8proc::utf8proc name when the build did not define it,
# forwarding to the plain `utf8proc` target and exposing the source dir for headers.
if(NOT TARGET utf8proc::utf8proc)
add_library(utf8proc::utf8proc INTERFACE IMPORTED)
target_link_libraries(utf8proc::utf8proc INTERFACE utf8proc)
target_include_directories(utf8proc::utf8proc INTERFACE ${utf8proc_SOURCE_DIR})
endif()

set(UTF8PROC_VENDORED TRUE)
Comment thread
wgtmac marked this conversation as resolved.
# utf8proc's CMake puts a raw build-tree path in INTERFACE_INCLUDE_DIRECTORIES, which
# install(EXPORT) rejects. Wrap it in BUILD_INTERFACE so the export is valid; utf8proc
# is a private dependency, so installed consumers never need its headers.
# OUTPUT_NAME renames the produced library file to "iceberg_vendored_utf8proc".
set_target_properties(utf8proc
PROPERTIES OUTPUT_NAME "iceberg_vendored_utf8proc"
POSITION_INDEPENDENT_CODE ON
INTERFACE_INCLUDE_DIRECTORIES
"$<BUILD_INTERFACE:${utf8proc_SOURCE_DIR}>")
# Install the vendored library as part of the iceberg_targets export set.
install(TARGETS utf8proc
EXPORT iceberg_targets
RUNTIME DESTINATION "${ICEBERG_INSTALL_BINDIR}"
ARCHIVE DESTINATION "${ICEBERG_INSTALL_LIBDIR}"
LIBRARY DESTINATION "${ICEBERG_INSTALL_LIBDIR}")
else()
# Not vendored: record utf8proc in the system dependency list instead.
set(UTF8PROC_VENDORED FALSE)
list(APPEND ICEBERG_SYSTEM_DEPENDENCIES utf8proc)
endif()

# Function scope is local, so publish the results to the caller explicitly.
set(ICEBERG_SYSTEM_DEPENDENCIES
${ICEBERG_SYSTEM_DEPENDENCIES}
PARENT_SCOPE)
set(UTF8PROC_VENDORED
${UTF8PROC_VENDORED}
PARENT_SCOPE)
endfunction()

# ----------------------------------------------------------------------
# nlohmann-json

Expand Down Expand Up @@ -719,6 +775,7 @@ endfunction()
resolve_zlib_dependency()
resolve_nanoarrow_dependency()
resolve_croaring_dependency()
resolve_utf8proc_dependency()
resolve_nlohmann_json_dependency()
resolve_spdlog_dependency()

Expand Down
8 changes: 6 additions & 2 deletions src/iceberg/CMakeLists.txt
Original file line number Diff line number Diff line change
Expand Up @@ -145,23 +145,27 @@ list(APPEND
"$<IF:$<BOOL:${NANOARROW_VENDORED}>,nanoarrow::nanoarrow_static,$<IF:$<TARGET_EXISTS:nanoarrow::nanoarrow_static>,nanoarrow::nanoarrow_static,nanoarrow::nanoarrow_shared>>"
nlohmann_json::nlohmann_json
spdlog::spdlog
utf8proc::utf8proc
ZLIB::ZLIB)
list(APPEND
ICEBERG_SHARED_BUILD_INTERFACE_LIBS
"$<IF:$<BOOL:${NANOARROW_VENDORED}>,nanoarrow::nanoarrow_static,$<IF:$<TARGET_EXISTS:nanoarrow::nanoarrow_shared>,nanoarrow::nanoarrow_shared,nanoarrow::nanoarrow_static>>"
nlohmann_json::nlohmann_json
spdlog::spdlog
utf8proc::utf8proc
ZLIB::ZLIB)
list(APPEND
ICEBERG_STATIC_INSTALL_INTERFACE_LIBS
"$<IF:$<BOOL:${NANOARROW_VENDORED}>,iceberg::nanoarrow_static,$<IF:$<TARGET_EXISTS:nanoarrow::nanoarrow_static>,nanoarrow::nanoarrow_static,nanoarrow::nanoarrow_shared>>"
"$<IF:$<BOOL:${NLOHMANN_JSON_VENDORED}>,iceberg::nlohmann_json,$<IF:$<TARGET_EXISTS:nlohmann_json::nlohmann_json>,nlohmann_json::nlohmann_json,nlohmann_json::nlohmann_json>>"
"$<IF:$<BOOL:${SPDLOG_VENDORED}>,iceberg::spdlog,spdlog::spdlog>")
"$<IF:$<BOOL:${SPDLOG_VENDORED}>,iceberg::spdlog,spdlog::spdlog>"
"$<IF:$<BOOL:${UTF8PROC_VENDORED}>,iceberg::utf8proc,utf8proc::utf8proc>")
list(APPEND
ICEBERG_SHARED_INSTALL_INTERFACE_LIBS
"$<IF:$<BOOL:${NANOARROW_VENDORED}>,iceberg::nanoarrow_static,$<IF:$<TARGET_EXISTS:nanoarrow::nanoarrow_shared>,nanoarrow::nanoarrow_shared,nanoarrow::nanoarrow_static>>"
"$<IF:$<BOOL:${NLOHMANN_JSON_VENDORED}>,iceberg::nlohmann_json,$<IF:$<TARGET_EXISTS:nlohmann_json::nlohmann_json>,nlohmann_json::nlohmann_json,nlohmann_json::nlohmann_json>>"
"$<IF:$<BOOL:${SPDLOG_VENDORED}>,iceberg::spdlog,spdlog::spdlog>")
"$<IF:$<BOOL:${SPDLOG_VENDORED}>,iceberg::spdlog,spdlog::spdlog>"
"$<IF:$<BOOL:${UTF8PROC_VENDORED}>,iceberg::utf8proc,utf8proc::utf8proc>")

add_iceberg_lib(iceberg
SOURCES
Expand Down
9 changes: 8 additions & 1 deletion src/iceberg/meson.build
Original file line number Diff line number Diff line change
Expand Up @@ -190,8 +190,15 @@ nanoarrow_dep = dependency('nanoarrow')
nlohmann_json_dep = dependency('nlohmann_json')
spdlog_dep = dependency('spdlog')
zlib_dep = dependency('zlib')
utf8proc_dep = dependency('libutf8proc')

iceberg_deps = [nanoarrow_dep, nlohmann_json_dep, spdlog_dep, zlib_dep]
iceberg_deps = [
nanoarrow_dep,
nlohmann_json_dep,
spdlog_dep,
zlib_dep,
utf8proc_dep,
]

iceberg_lib = library(
'iceberg',
Expand Down
45 changes: 31 additions & 14 deletions src/iceberg/test/string_util_test.cc
Original file line number Diff line number Diff line change
Expand Up @@ -41,19 +41,30 @@ TEST(StringUtilsTest, ToUpper) {
ASSERT_EQ(StringUtils::ToUpper("123"), "123");
}

// Non-ASCII (multibyte UTF-8) bytes have the high bit set, i.e. are negative when stored
// in a signed char. Only ASCII letters are converted; multibyte bytes pass through
// unchanged. The non-ASCII strings are written as explicit UTF-8 byte escapes so the test
// does not depend on the source-file encoding. See
// https://github.com/apache/iceberg-cpp/issues/613.
TEST(StringUtilsTest, NonAsciiPassThrough) {
// "Naïve" -> "naïve" (ï = U+00EF = 0xC3 0xAF; only the ASCII letters change).
ASSERT_EQ(StringUtils::ToLower("Na\xC3\xAFve"), "na\xC3\xAFve");
// "café" -> "CAFé" (é = U+00E9 = 0xC3 0xA9 stays unchanged).
ASSERT_EQ(StringUtils::ToUpper("caf\xC3\xA9"), "CAF\xC3\xA9");
// "日本語" (0xE6 0x97 0xA5 0xE6 0x9C 0xAC 0xE8 0xAA 0x9E) is returned verbatim.
// Non-ASCII strings are written as explicit UTF-8 byte escapes so the test does not
// depend on the source-file encoding. An escape is split before a following hex digit
// (e.g. "...\x9E" "E") so the \x does not absorb it.
// See https://github.com/apache/iceberg-cpp/issues/613.
// Exercises StringUtils::ToLower on multibyte UTF-8 input: simple case mappings,
// caseless scripts, mappings that change the encoded byte length, and invalid input.
TEST(StringUtilsTest, ToLowerUnicode) {
  // Empty input yields empty output.
  ASSERT_EQ(StringUtils::ToLower(""), "");
  // "CAFÉ" -> "café" (É U+00C9 = 0xC3 0x89 -> é U+00E9 = 0xC3 0xA9).
  ASSERT_EQ(StringUtils::ToLower("CAF\xC3\x89"), "caf\xC3\xA9");
  // "GROẞE" -> "große": capital sharp S (ẞ U+1E9E) lower-cases to ß (U+00DF), not "ss"
  // as casefolding would produce. Note the encoding shrinks from 3 bytes to 2.
  ASSERT_EQ(StringUtils::ToLower("GRO\xE1\xBA\x9E"
                                 "E"),
            "gro\xC3\x9F"
            "e");
  // "İ" (U+0130 = 0xC4 0xB0, two bytes) maps to ASCII "i" (one byte) under the simple
  // lower-case mapping, so the output can be shorter than the input.
  ASSERT_EQ(StringUtils::ToLower("\xC4\xB0"), "i");
  // "日本語" has no case mapping and is returned verbatim.
  ASSERT_EQ(StringUtils::ToLower("\xE6\x97\xA5\xE6\x9C\xAC\xE8\xAA\x9E"),
            "\xE6\x97\xA5\xE6\x9C\xAC\xE8\xAA\x9E");
  // Invalid UTF-8 (a lone 0xFF byte) is returned unchanged rather than erroring.
  ASSERT_EQ(StringUtils::ToLower("\xFF"), "\xFF");
  // Invalid UTF-8 anywhere makes ToLower return the whole input unchanged, including
  // ASCII letters that precede the bad byte.
  ASSERT_EQ(StringUtils::ToLower("A\xFF"), "A\xFF");
}

// ToUpper is intentionally ASCII-only; non-ASCII (multibyte UTF-8) bytes pass through.
// Verifies that ToUpper changes ASCII letters only and leaves multibyte UTF-8 alone.
TEST(StringUtilsTest, ToUpperAsciiOnly) {
  // "café" upper-cases to "CAFé": the two bytes of é (0xC3 0xA9) are untouched.
  const char kCafeLower[] = "caf\xC3\xA9";
  const char kCafeUpper[] = "CAF\xC3\xA9";
  ASSERT_EQ(StringUtils::ToUpper(kCafeLower), kCafeUpper);

  // "日本語" contains no ASCII letters, so it comes back byte-for-byte identical.
  const char kJapanese[] = "\xE6\x97\xA5\xE6\x9C\xAC\xE8\xAA\x9E";
  ASSERT_EQ(StringUtils::ToUpper(kJapanese), kJapanese);
}
Expand All @@ -63,9 +74,15 @@ TEST(StringUtilsTest, EqualsIgnoreCase) {
ASSERT_TRUE(StringUtils::EqualsIgnoreCase("", ""));
ASSERT_FALSE(StringUtils::EqualsIgnoreCase("abc", "abcd"));
ASSERT_FALSE(StringUtils::EqualsIgnoreCase("abc", "abd"));
// ASCII case is folded; non-ASCII bytes are compared as-is. ("Café" vs "café")
ASSERT_TRUE(StringUtils::EqualsIgnoreCase("Caf\xC3\xA9", "caf\xC3\xA9"));
// "café" vs "cafe": the multibyte é differs from ASCII 'e'.
// Unicode-aware: "CAFÉ" matches "café".
ASSERT_TRUE(StringUtils::EqualsIgnoreCase("CAF\xC3\x89", "caf\xC3\xA9"));
// "GROẞE" matches "große" under lowercasing (ẞ -> ß).
ASSERT_TRUE(
StringUtils::EqualsIgnoreCase("GRO\xE1\xBA\x9E"
"E",
"gro\xC3\x9F"
"e"));
// Different letters still differ ("café" vs "cafe").
ASSERT_FALSE(StringUtils::EqualsIgnoreCase("caf\xC3\xA9", "cafe"));
}

Expand Down
31 changes: 31 additions & 0 deletions src/iceberg/util/string_util.cc
Original file line number Diff line number Diff line change
Expand Up @@ -19,10 +19,41 @@

#include "iceberg/util/string_util.h"

#include <utf8proc.h>

#include <array>

#include "iceberg/util/macros.h"

namespace iceberg {

std::string StringUtils::ToLower(std::string_view str) {
std::string result;
result.reserve(str.size());

const auto* data = reinterpret_cast<const utf8proc_uint8_t*>(str.data());
const auto size = static_cast<utf8proc_ssize_t>(str.size());
utf8proc_ssize_t offset = 0;
while (offset < size) {
utf8proc_int32_t code_point = 0;
utf8proc_ssize_t consumed =
utf8proc_iterate(data + offset, size - offset, &code_point);
if (consumed < 0) {
// Invalid UTF-8: return the input unchanged rather than erroring.
return std::string(str);
}
// utf8proc has no string-level lower-case helper, so map and re-encode each code
// point individually. utf8proc_tolower is a simple 1:1 mapping (not casefolding).
const utf8proc_int32_t lowered = utf8proc_tolower(code_point);
std::array<utf8proc_uint8_t, 4> encoded{};
const utf8proc_ssize_t written = utf8proc_encode_char(lowered, encoded.data());
result.append(reinterpret_cast<const char*>(encoded.data()),
static_cast<size_t>(written));
offset += consumed;
}
return result;
}

Result<std::vector<uint8_t>> StringUtils::HexStringToBytes(std::string_view hex) {
if (hex.size() % 2 != 0) [[unlikely]] {
return InvalidArgument("Hex string must have even length, got: {}", hex.size());
Expand Down
33 changes: 14 additions & 19 deletions src/iceberg/util/string_util.h
Original file line number Diff line number Diff line change
Expand Up @@ -20,7 +20,6 @@
#pragma once

#include <algorithm>
#include <cctype>
#include <cerrno>
#include <charconv>
#include <ranges>
Expand All @@ -41,22 +40,24 @@ concept FromChars = requires(const char* p, T& v) { std::from_chars(p, p, v); };

class ICEBERG_EXPORT StringUtils {
public:
// NOTE: These convert ASCII letters only; all other bytes, including non-ASCII
// (multibyte UTF-8) bytes, are passed through unchanged.
// See https://github.com/apache/iceberg-cpp/issues/613.
static std::string ToLower(std::string_view str) {
return str | std::ranges::views::transform(ToLowerAscii) |
std::ranges::to<std::string>();
}

/// \brief Lower-case a UTF-8 string using Unicode simple case mapping.
///
/// Mirrors Iceberg Java's case-insensitive handling, which lower-cases names with
Comment thread
goel-skd marked this conversation as resolved.
Outdated
/// toLowerCase(Locale.ROOT). Invalid UTF-8 input is returned unchanged.
/// See https://github.com/apache/iceberg-cpp/issues/613.
static std::string ToLower(std::string_view str);

/// \brief Upper-case ASCII letters; non-ASCII (multibyte UTF-8) bytes pass through
/// unchanged.
///
/// Unlike ToLower this is ASCII-only, since upper-casing is not used for name matching.
Comment thread
goel-skd marked this conversation as resolved.
Outdated
static std::string ToUpper(std::string_view str) {
  // Copy the input, then rewrite each byte in place with the ASCII-only mapping.
  std::string upper(str);
  std::ranges::transform(upper, upper.begin(), ToUpperAscii);
  return upper;
}

static bool EqualsIgnoreCase(std::string_view lhs, std::string_view rhs) {
return std::ranges::equal(
lhs, rhs, [](char lc, char rc) { return ToLowerAscii(lc) == ToLowerAscii(rc); });
return ToLower(lhs) == ToLower(rhs);
Comment thread
goel-skd marked this conversation as resolved.
}

static bool StartsWithIgnoreCase(std::string_view str, std::string_view prefix) {

Copy link
Copy Markdown
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Now that EqualsIgnoreCase inherits ToLower's Unicode simple-mapping behavior, this helper should not slice str by prefix.size() before lowercasing. ToLower can change UTF-8 byte length; for example, İ is two bytes but maps to i here. So StartsWithIgnoreCase("\xC4\xB0x", "i") currently slices an invalid one-byte prefix and returns false, even though the lowercased string starts with the lowercased prefix.

Could we compare ToLower(str).starts_with(ToLower(prefix)) or otherwise compare by decoded code point, and add a test for this case?

Expand Down Expand Up @@ -134,14 +135,8 @@ class ICEBERG_EXPORT StringUtils {
}

private:
// ASCII-only case conversion using explicit range checks rather than
// std::tolower/std::toupper. This is independent of the current C locale and never
// touches non-ASCII (high-bit) bytes, so multibyte UTF-8 sequences are preserved. It
// also sidesteps the undefined behavior of passing a negative char to <cctype>.
static constexpr char ToLowerAscii(char c) noexcept {
return (c >= 'A' && c <= 'Z') ? static_cast<char>(c - 'A' + 'a') : c;
}

// Avoids std::toupper, which is locale-dependent and has undefined behavior for
// negative char values.
static constexpr char ToUpperAscii(char c) noexcept {
  // Anything outside 'a'..'z' (digits, punctuation, high-bit bytes) is returned as-is.
  if (c < 'a' || c > 'z') {
    return c;
  }
  // Shift the lower-case letter onto the matching upper-case letter.
  return static_cast<char>(c - 'a' + 'A');
}
Expand Down
30 changes: 30 additions & 0 deletions subprojects/utf8proc.wrap
Original file line number Diff line number Diff line change
@@ -0,0 +1,30 @@
# Licensed to the Apache Software Foundation (ASF) under one
# or more contributor license agreements. See the NOTICE file
# distributed with this work for additional information
# regarding copyright ownership. The ASF licenses this file
# to you under the Apache License, Version 2.0 (the
# "License"); you may not use this file except in compliance
# with the License. You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing,
# software distributed under the License is distributed on an
# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
# KIND, either express or implied. See the License for the
# specific language governing permissions and limitations
# under the License.

[wrap-file]
directory = utf8proc-2.10.0
source_url = https://github.com/JuliaStrings/utf8proc/releases/download/v2.10.0/utf8proc-2.10.0.tar.gz
source_filename = utf8proc-2.10.0.tar.gz
source_hash = 276a37dc4d1dd24d7896826a579f4439d1e5fe33603add786bb083cab802e23e
patch_filename = utf8proc_2.10.0-1_patch.zip
patch_url = https://wrapdb.mesonbuild.com/v2/utf8proc_2.10.0-1/get_patch
patch_hash = be16c4514603e922f9636045699fe1a6f844d340b9b7c14b809e47253b06a844
source_fallback_url = https://github.com/mesonbuild/wrapdb/releases/download/utf8proc_2.10.0-1/utf8proc-2.10.0.tar.gz
wrapdb_version = 2.10.0-1

[provide]
libutf8proc = utf8proc_dep
Loading