From ab327d20fb01e4907380f592c271d44ac727f66c Mon Sep 17 00:00:00 2001 From: "Daniel Q. Kim" Date: Thu, 21 May 2026 16:43:37 +0200 Subject: [PATCH] Fix Iceberg ARRAY columns with dot-separated names returning empty lists MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit When querying an Iceberg table through the `iceberg(...)` table function or a DataLakeCatalog, a column whose name contains a `.` and whose type is `Array(T)` (e.g. `` `a.b` ARRAY<STRING> ``) returned empty arrays instead of the stored values. The same data read by Spark returned the expected values. Fixes #90731. The Parquet V3 reader path (`SchemaConverter` + `ColumnMapper` + `FormatFilterInfo`) is already correct after the dotted-name field-id work in 0a218cd4e8b, 4b733bae561 and f24c1a46063. This change addresses two residual upstream defects that affect dotted-name `Array(T)` columns regardless of source: * `ColumnsDescription::getAllRegisteredNames` explicitly filtered out any column whose name contained `.`, under the assumption such names were always flattened Nested subcolumns. A column whose stored name literally contains a dot (allowed by MergeTree with backticks, and produced by Iceberg / Spark) is a first-class registered name and must appear in `IHints` misspelling suggestions. The function is only consumed by `IHints`-style suggestion paths (and by `StorageSystemZooKeeper` for column-name iteration, where no dotted names exist), so relaxing it has no effect on parsing, planning, storage, or wire protocol. * `NestedUtils::getSubcolumnsOfNested` treated every `Array(T)` column whose name contained `.` as a flattened element of a synthetic `Nested` structure named after the prefix. This caused the Arrow, ORC and pre-V3 Parquet readers to look for a struct field with the prefix name in the data file rather than the literal dotted column, returning an empty array. 
The fix uses a two-pass scan: a synthetic `Nested` entry is only emitted when at least two `Array(T)` columns share the same dotted prefix. A lone column such as `a.b: Array(T)` no longer appears in the synthetic-Nested map. Genuine flattened `Nested` with multiple fields is unaffected; the existing early-continue on `isNested()` also covers the one-field-Nested edge case. Tests: * `tests/integration/test_storage_iceberg_with_spark/test_column_names_with_dots.py::test_dotted_array_column` — end-to-end repro of #90731 against s3, azure and local storage. * `test_dotted_array_alongside_real_nested` in the same file — mixed-schema regression guard verifying a lone dotted `Array` column coexists with genuine flattened-Nested siblings. * `tests/queries/0_stateless/04259_dotted_array_not_nested.sql` — isolates Bug B (the `NestedUtils::getSubcolumnsOfNested` misclassification) without Iceberg. * `tests/queries/0_stateless/04260_dotted_column_in_hints.sh` — verifies Bug A (the `ColumnsDescription::getAllRegisteredNames` dotted-name filter) by checking the misspelling hint output. Changelog category (leave one): - Bug Fix (user-visible misbehavior in an official stable release) Changelog entry: Fix reading Iceberg tables whose `ARRAY` column names contain a dot (e.g. `` `a.b` ARRAY<STRING> ``), which previously returned empty arrays. Two upstream defects were responsible: `ColumnsDescription::getAllRegisteredNames` filtered out dotted names, and `NestedUtils::getSubcolumnsOfNested` misclassified lone dotted `Array(T)` columns as flattened `Nested` children. 
(cherry picked from commit f8467afa849f7ce5aec7a7d372b00fdabf13b4b1) --- src/DataTypes/NestedUtils.cpp | 24 ++++- src/Storages/ColumnsDescription.cpp | 5 +- .../test_column_names_with_dots.py | 98 +++++++++++++++++++ .../04259_dotted_array_not_nested.reference | 3 + .../04259_dotted_array_not_nested.sql | 18 ++++ .../04260_dotted_column_in_hints.reference | 1 + .../04260_dotted_column_in_hints.sh | 19 ++++ 7 files changed, 160 insertions(+), 8 deletions(-) create mode 100644 tests/queries/0_stateless/04259_dotted_array_not_nested.reference create mode 100644 tests/queries/0_stateless/04259_dotted_array_not_nested.sql create mode 100644 tests/queries/0_stateless/04260_dotted_column_in_hints.reference create mode 100755 tests/queries/0_stateless/04260_dotted_column_in_hints.sh diff --git a/src/DataTypes/NestedUtils.cpp b/src/DataTypes/NestedUtils.cpp index 56ebe66c2ecc..531d29b66250 100644 --- a/src/DataTypes/NestedUtils.cpp +++ b/src/DataTypes/NestedUtils.cpp @@ -212,21 +212,37 @@ using NameToDataType = std::map; NameToDataType getSubcolumnsOfNested(const NamesAndTypesList & names_and_types) { - std::unordered_map nested; + /// Pass 1: count how many Array(T) columns share each dotted prefix. + /// A lone column like `a.b Array(T)` must not be collapsed into a synthetic + /// Nested parent — only genuine flat-Nested groups (n.x, n.y, ...) qualify. + std::unordered_map prefix_count; for (const auto & name_type : names_and_types) { /// Skip subcolumns (e.g. `c0.c2.null` derived from `c0.c2 Array(Nullable(Tuple()))`). - /// They are not real flat-nested columns like `n.a Array(T)`, `n.b Array(T)`. if (name_type.isSubcolumn()) continue; const auto * type_arr = typeid_cast(name_type.type.get()); - - /// Ignore true Nested type, but try to unite flatten arrays to Nested type. 
if (!isNested(name_type.type) && type_arr) { auto split = splitName(name_type.name); if (!split.second.empty()) + ++prefix_count[split.first]; + } + } + + /// Pass 2: build Nested only for prefixes shared by at least two columns. + std::unordered_map nested; + for (const auto & name_type : names_and_types) + { + if (name_type.isSubcolumn()) + continue; + + const auto * type_arr = typeid_cast(name_type.type.get()); + if (!isNested(name_type.type) && type_arr) + { + auto split = splitName(name_type.name); + if (!split.second.empty() && prefix_count[split.first] >= 2) nested[split.first].emplace_back(split.second, type_arr->getNestedType()); } } diff --git a/src/Storages/ColumnsDescription.cpp b/src/Storages/ColumnsDescription.cpp index 45c32efa8d59..1c0c6c14f50b 100644 --- a/src/Storages/ColumnsDescription.cpp +++ b/src/Storages/ColumnsDescription.cpp @@ -976,10 +976,7 @@ std::vector ColumnsDescription::getAllRegisteredNames() const std::vector names; names.reserve(columns.size()); for (const auto & column : columns) - { - if (!column.name.contains('.')) - names.push_back(column.name); - } + names.emplace_back(column.name); return names; } diff --git a/tests/integration/test_storage_iceberg_with_spark/test_column_names_with_dots.py b/tests/integration/test_storage_iceberg_with_spark/test_column_names_with_dots.py index cb239e1e8372..0bf9ae27d539 100644 --- a/tests/integration/test_storage_iceberg_with_spark/test_column_names_with_dots.py +++ b/tests/integration/test_storage_iceberg_with_spark/test_column_names_with_dots.py @@ -216,3 +216,101 @@ def test_deeply_nested_struct_with_dotted_names(started_cluster_iceberg_with_spa ).strip() expected = "deep_value1\ndeep_value2\ndeep_value3" assert result == expected, f"Expected:\n{expected}\nGot:\n{result}" + + +@pytest.mark.parametrize("storage_type", ["s3", "azure", "local"]) +def test_dotted_array_column(started_cluster_iceberg_with_spark, storage_type): + """ + Regression test for issue #90731. 
+ A top-level ARRAY column whose name literally contains a dot (e.g. `a.b`) + must be returned with its actual values, not as an empty array. + """ + instance = started_cluster_iceberg_with_spark.instances["node1"] + spark = started_cluster_iceberg_with_spark.spark_session + TABLE_NAME = "test_dotted_array_column_" + storage_type + "_" + get_uuid_str() + + from pyspark.sql.types import ArrayType + + data = [(["a", "b", "c"],)] + schema = StructType([ + StructField("a.b", ArrayType(StringType())), + ]) + df = spark.createDataFrame(data=data, schema=schema) + + write_iceberg_from_df(spark, df, TABLE_NAME, mode="overwrite", format_version="2") + + default_upload_directory( + started_cluster_iceberg_with_spark, + storage_type, + f"/iceberg_data/default/{TABLE_NAME}/", + f"/iceberg_data/default/{TABLE_NAME}/", + ) + + # Test via table function + table_function_expr = get_creation_expression( + storage_type, TABLE_NAME, started_cluster_iceberg_with_spark, table_function=True + ) + + result = instance.query( + f"SELECT `a.b` FROM {table_function_expr}" + ).strip() + assert result == "['a','b','c']", f"Expected ['a','b','c'], got: {result}" + + # Test via table engine + create_iceberg_table(storage_type, instance, TABLE_NAME, started_cluster_iceberg_with_spark) + + result = instance.query( + f"SELECT `a.b` FROM {TABLE_NAME}" + ).strip() + assert result == "['a','b','c']", f"Expected ['a','b','c'], got: {result}" + + +@pytest.mark.parametrize("storage_type", ["s3", "azure", "local"]) +def test_dotted_array_alongside_real_nested(started_cluster_iceberg_with_spark, storage_type): + """ + Regression guard: a lone dotted Array column (`a.b`) must not interfere with + a genuine flat-Nested group (`c.x`, `c.y`) that shares a different prefix. + All three columns must round-trip correctly. 
+ """ + instance = started_cluster_iceberg_with_spark.instances["node1"] + spark = started_cluster_iceberg_with_spark.spark_session + TABLE_NAME = "test_dotted_array_alongside_real_nested_" + storage_type + "_" + get_uuid_str() + + from pyspark.sql.types import ArrayType, IntegerType as SparkIntegerType + + data = [(["a", "b", "c"], [1, 2], ["p", "q"])] + schema = StructType([ + StructField("a.b", ArrayType(StringType())), + StructField("c.x", ArrayType(SparkIntegerType())), + StructField("c.y", ArrayType(StringType())), + ]) + df = spark.createDataFrame(data=data, schema=schema) + + write_iceberg_from_df(spark, df, TABLE_NAME, mode="overwrite", format_version="2") + + default_upload_directory( + started_cluster_iceberg_with_spark, + storage_type, + f"/iceberg_data/default/{TABLE_NAME}/", + f"/iceberg_data/default/{TABLE_NAME}/", + ) + + # Test via table function + table_function_expr = get_creation_expression( + storage_type, TABLE_NAME, started_cluster_iceberg_with_spark, table_function=True + ) + + result = instance.query( + f"SELECT `a.b`, `c.x`, `c.y` FROM {table_function_expr}" + ).strip() + assert result == "['a','b','c']\t[1,2]\t['p','q']", \ + f"Unexpected result via table function: {result}" + + # Test via table engine + create_iceberg_table(storage_type, instance, TABLE_NAME, started_cluster_iceberg_with_spark) + + result = instance.query( + f"SELECT `a.b`, `c.x`, `c.y` FROM {TABLE_NAME}" + ).strip() + assert result == "['a','b','c']\t[1,2]\t['p','q']", \ + f"Unexpected result via table engine: {result}" diff --git a/tests/queries/0_stateless/04259_dotted_array_not_nested.reference b/tests/queries/0_stateless/04259_dotted_array_not_nested.reference new file mode 100644 index 000000000000..e6c5c3fcae88 --- /dev/null +++ b/tests/queries/0_stateless/04259_dotted_array_not_nested.reference @@ -0,0 +1,3 @@ +['a','b','c'] +['a','b','c'] +[1,2] ['p','q'] diff --git a/tests/queries/0_stateless/04259_dotted_array_not_nested.sql 
b/tests/queries/0_stateless/04259_dotted_array_not_nested.sql new file mode 100644 index 000000000000..ba9396f11826 --- /dev/null +++ b/tests/queries/0_stateless/04259_dotted_array_not_nested.sql @@ -0,0 +1,18 @@ +-- Regression test for #90731. +-- A lone Array(T) column with a dot in its name must not be collapsed into +-- a synthetic Nested structure and must be readable as a plain array. + +CREATE TABLE t1 (`a.b` Array(String)) ENGINE = Memory; +INSERT INTO t1 VALUES (['a','b','c']); +SELECT `a.b` FROM t1; + +-- In a mixed table, the lone dotted column must not interfere with the +-- genuine flat-Nested group (c.x / c.y share prefix 'c'). +CREATE TABLE t2 (`a.b` Array(String), `c.x` Array(Int32), `c.y` Array(String)) + ENGINE = Memory; +INSERT INTO t2 VALUES (['a','b','c'], [1,2], ['p','q']); +SELECT `a.b` FROM t2; +SELECT `c.x`, `c.y` FROM t2; + +DROP TABLE t1; +DROP TABLE t2; diff --git a/tests/queries/0_stateless/04260_dotted_column_in_hints.reference b/tests/queries/0_stateless/04260_dotted_column_in_hints.reference new file mode 100644 index 000000000000..9766475a4185 --- /dev/null +++ b/tests/queries/0_stateless/04260_dotted_column_in_hints.reference @@ -0,0 +1 @@ +ok diff --git a/tests/queries/0_stateless/04260_dotted_column_in_hints.sh b/tests/queries/0_stateless/04260_dotted_column_in_hints.sh new file mode 100755 index 000000000000..a7cfe12dee4b --- /dev/null +++ b/tests/queries/0_stateless/04260_dotted_column_in_hints.sh @@ -0,0 +1,19 @@ +#!/usr/bin/env bash +# Regression test for #90731. +# ColumnsDescription::getAllRegisteredNames must include columns whose names +# contain a dot, so they appear in IHints suggestions after a typo. + +CURDIR=$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd) +# shellcheck source=../shell_config.sh +. 
"$CURDIR"/../shell_config.sh + +$CLICKHOUSE_CLIENT -q " + CREATE TABLE t_dotted_hint (\`a.b\` Array(String)) + ENGINE = MergeTree ORDER BY tuple(); +" + +# Misspell the column name; the error message must suggest the real name 'a.b'. +$CLICKHOUSE_CLIENT -q "ALTER TABLE t_dotted_hint MODIFY COLUMN a_b Array(String);" 2>&1 \ + | grep -qF "a.b" && echo "ok" || echo "FAIL" + +$CLICKHOUSE_CLIENT -q "DROP TABLE t_dotted_hint;"