From ab327d20fb01e4907380f592c271d44ac727f66c Mon Sep 17 00:00:00 2001
From: "Daniel Q. Kim" <daniel.kim@altinity.com>
Date: Thu, 21 May 2026 16:43:37 +0200
Subject: [PATCH] Fix Iceberg ARRAY columns with dot-separated names returning
 empty lists
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

When querying an Iceberg table through the `iceberg(...)` table function
or a DataLakeCatalog, a column whose name contains a `.` and whose type
is `Array(T)` (e.g. `` `a.b` ARRAY<STRING> ``) returned empty arrays
instead of the stored values. The same data read by Spark returned the
expected values. Fixes #90731.

The Parquet V3 reader path (`SchemaConverter` + `ColumnMapper` +
`FormatFilterInfo`) is already correct after the dotted-name field-id
work in 0a218cd4e8b, 4b733bae561 and f24c1a46063. This change addresses
two residual upstream defects that affect dotted-name `Array(T)`
columns regardless of source:

* `ColumnsDescription::getAllRegisteredNames` explicitly filtered out
  any column whose name contained `.`, under the assumption such names
  were always flattened Nested subcolumns. A column whose stored name
  literally contains a dot (allowed by MergeTree with backticks, and
  produced by Iceberg / Spark) is a first-class registered name and
  must appear in `IHints` misspelling suggestions. The function is only
  consumed by `IHints`-style suggestion paths (and by
  `StorageSystemZooKeeper` for column-name iteration, where no dotted
  names exist), so relaxing it has no effect on parsing, planning,
  storage, or wire protocol.

* `NestedUtils::getSubcolumnsOfNested` treated every `Array(T)` column
  whose name contained `.` as a flattened element of a synthetic
  `Nested` structure named after the prefix. This caused the Arrow,
  ORC and pre-V3 Parquet readers to look for a struct field with the
  prefix name in the data file rather than the literal dotted column,
  returning an empty array. The fix uses a two-pass scan: a synthetic
  `Nested` entry is only emitted when at least two `Array(T)` columns
  share the same dotted prefix. A lone column such as `a.b: Array(T)`
  no longer appears in the synthetic-Nested map. Genuine flattened
  `Nested` with multiple fields is unaffected; the existing
  `!isNested(...)` guard still skips columns that carry a true `Nested`
  type, which also covers the one-field-Nested edge case.

Tests:
* `tests/integration/test_storage_iceberg_with_spark/test_column_names_with_dots.py::test_dotted_array_column` —
  end-to-end repro of #90731 against s3, azure and local storage.
* `test_dotted_array_alongside_real_nested` in the same file — mixed-
  schema regression guard verifying a lone dotted `Array` column
  coexists with genuine flattened-Nested siblings.
* `tests/queries/0_stateless/04259_dotted_array_not_nested.sql` —
  isolates the `NestedUtils::getSubcolumnsOfNested` defect without
  Iceberg.
* `tests/queries/0_stateless/04260_dotted_column_in_hints.sh` —
  verifies the `ColumnsDescription::getAllRegisteredNames` fix by
  checking the misspelling hint output.

Changelog category (leave one):
- Bug Fix (user-visible misbehavior in an official stable release)

Changelog entry:
Fix reading Iceberg tables whose `ARRAY` column names contain a dot
(e.g. `` `a.b` ARRAY<STRING> ``), which previously returned empty
arrays because `NestedUtils::getSubcolumnsOfNested` misclassified lone
dotted `Array(T)` columns as flattened `Nested` children. Additionally,
`ColumnsDescription::getAllRegisteredNames` no longer filters out
dotted names, so such columns now appear in misspelling hints.

(cherry picked from commit f8467afa849f7ce5aec7a7d372b00fdabf13b4b1)
---
 src/DataTypes/NestedUtils.cpp                 | 24 ++++-
 src/Storages/ColumnsDescription.cpp           |  5 +-
 .../test_column_names_with_dots.py            | 98 +++++++++++++++++++
 .../04259_dotted_array_not_nested.reference   |  3 +
 .../04259_dotted_array_not_nested.sql         | 18 ++++
 .../04260_dotted_column_in_hints.reference    |  1 +
 .../04260_dotted_column_in_hints.sh           | 19 ++++
 7 files changed, 160 insertions(+), 8 deletions(-)
 create mode 100644 tests/queries/0_stateless/04259_dotted_array_not_nested.reference
 create mode 100644 tests/queries/0_stateless/04259_dotted_array_not_nested.sql
 create mode 100644 tests/queries/0_stateless/04260_dotted_column_in_hints.reference
 create mode 100755 tests/queries/0_stateless/04260_dotted_column_in_hints.sh
diff --git a/src/DataTypes/NestedUtils.cpp b/src/DataTypes/NestedUtils.cpp
index 56ebe66c2ecc..531d29b66250 100644
--- a/src/DataTypes/NestedUtils.cpp
+++ b/src/DataTypes/NestedUtils.cpp
@@ -212,21 +212,37 @@ using NameToDataType = std::map<String, DataTypePtr>;
 
 NameToDataType getSubcolumnsOfNested(const NamesAndTypesList & names_and_types)
 {
-    std::unordered_map<String, NamesAndTypesList> nested;
+    /// Pass 1: count how many Array(T) columns share each dotted prefix.
+    /// A lone column like `a.b Array(T)` must not be collapsed into a synthetic
+    /// Nested parent — only genuine flat-Nested groups (n.x, n.y, ...) qualify.
+    std::unordered_map<String, size_t> prefix_count;
     for (const auto & name_type : names_and_types)
     {
         /// Skip subcolumns (e.g. `c0.c2.null` derived from `c0.c2 Array(Nullable(Tuple()))`).
-        /// They are not real flat-nested columns like `n.a Array(T)`, `n.b Array(T)`.
         if (name_type.isSubcolumn())
             continue;
 
         const auto * type_arr = typeid_cast<const DataTypeArray *>(name_type.type.get());
-
-        /// Ignore true Nested type, but try to unite flatten arrays to Nested type.
         if (!isNested(name_type.type) && type_arr)
         {
             auto split = splitName(name_type.name);
             if (!split.second.empty())
+                ++prefix_count[split.first];
+        }
+    }
+
+    /// Pass 2: build Nested only for prefixes shared by at least two columns.
+    std::unordered_map<String, NamesAndTypesList> nested;
+    for (const auto & name_type : names_and_types)
+    {
+        if (name_type.isSubcolumn())
+            continue;
+
+        const auto * type_arr = typeid_cast<const DataTypeArray *>(name_type.type.get());
+        if (!isNested(name_type.type) && type_arr)
+        {
+            auto split = splitName(name_type.name);
+            if (!split.second.empty() && prefix_count[split.first] >= 2)
                 nested[split.first].emplace_back(split.second, type_arr->getNestedType());
         }
     }
diff --git a/src/Storages/ColumnsDescription.cpp b/src/Storages/ColumnsDescription.cpp
index 45c32efa8d59..1c0c6c14f50b 100644
--- a/src/Storages/ColumnsDescription.cpp
+++ b/src/Storages/ColumnsDescription.cpp
@@ -976,10 +976,7 @@ std::vector<String> ColumnsDescription::getAllRegisteredNames() const
     std::vector<String> names;
     names.reserve(columns.size());
     for (const auto & column : columns)
-    {
-        if (!column.name.contains('.'))
-            names.push_back(column.name);
-    }
+        names.emplace_back(column.name);
     return names;
 }
 
diff --git a/tests/integration/test_storage_iceberg_with_spark/test_column_names_with_dots.py b/tests/integration/test_storage_iceberg_with_spark/test_column_names_with_dots.py
index cb239e1e8372..0bf9ae27d539 100644
--- a/tests/integration/test_storage_iceberg_with_spark/test_column_names_with_dots.py
+++ b/tests/integration/test_storage_iceberg_with_spark/test_column_names_with_dots.py
@@ -216,3 +216,101 @@ def test_deeply_nested_struct_with_dotted_names(started_cluster_iceberg_with_spa
     ).strip()
     expected = "deep_value1\ndeep_value2\ndeep_value3"
     assert result == expected, f"Expected:\n{expected}\nGot:\n{result}"
+
+
+@pytest.mark.parametrize("storage_type", ["s3", "azure", "local"])
+def test_dotted_array_column(started_cluster_iceberg_with_spark, storage_type):
+    """
+    Regression test for issue #90731.
+    A top-level ARRAY column whose name literally contains a dot (e.g. `a.b`)
+    must be returned with its actual values, not as an empty array.
+    """
+    instance = started_cluster_iceberg_with_spark.instances["node1"]
+    spark = started_cluster_iceberg_with_spark.spark_session
+    TABLE_NAME = "test_dotted_array_column_" + storage_type + "_" + get_uuid_str()
+
+    from pyspark.sql.types import ArrayType
+
+    data = [(["a", "b", "c"],)]
+    schema = StructType([
+        StructField("a.b", ArrayType(StringType())),
+    ])
+    df = spark.createDataFrame(data=data, schema=schema)
+
+    write_iceberg_from_df(spark, df, TABLE_NAME, mode="overwrite", format_version="2")
+
+    default_upload_directory(
+        started_cluster_iceberg_with_spark,
+        storage_type,
+        f"/iceberg_data/default/{TABLE_NAME}/",
+        f"/iceberg_data/default/{TABLE_NAME}/",
+    )
+
+    # Test via table function
+    table_function_expr = get_creation_expression(
+        storage_type, TABLE_NAME, started_cluster_iceberg_with_spark, table_function=True
+    )
+
+    result = instance.query(
+        f"SELECT `a.b` FROM {table_function_expr}"
+    ).strip()
+    assert result == "['a','b','c']", f"Expected ['a','b','c'], got: {result}"
+
+    # Test via table engine
+    create_iceberg_table(storage_type, instance, TABLE_NAME, started_cluster_iceberg_with_spark)
+
+    result = instance.query(
+        f"SELECT `a.b` FROM {TABLE_NAME}"
+    ).strip()
+    assert result == "['a','b','c']", f"Expected ['a','b','c'], got: {result}"
+
+
+@pytest.mark.parametrize("storage_type", ["s3", "azure", "local"])
+def test_dotted_array_alongside_real_nested(started_cluster_iceberg_with_spark, storage_type):
+    """
+    Regression guard: a lone dotted Array column (`a.b`) must not interfere with
+    a genuine flat-Nested group (`c.x`, `c.y`) that shares a different prefix.
+    All three columns must round-trip correctly.
+    """
+    instance = started_cluster_iceberg_with_spark.instances["node1"]
+    spark = started_cluster_iceberg_with_spark.spark_session
+    TABLE_NAME = "test_dotted_array_alongside_real_nested_" + storage_type + "_" + get_uuid_str()
+
+    from pyspark.sql.types import ArrayType, IntegerType as SparkIntegerType
+
+    data = [(["a", "b", "c"], [1, 2], ["p", "q"])]
+    schema = StructType([
+        StructField("a.b", ArrayType(StringType())),
+        StructField("c.x", ArrayType(SparkIntegerType())),
+        StructField("c.y", ArrayType(StringType())),
+    ])
+    df = spark.createDataFrame(data=data, schema=schema)
+
+    write_iceberg_from_df(spark, df, TABLE_NAME, mode="overwrite", format_version="2")
+
+    default_upload_directory(
+        started_cluster_iceberg_with_spark,
+        storage_type,
+        f"/iceberg_data/default/{TABLE_NAME}/",
+        f"/iceberg_data/default/{TABLE_NAME}/",
+    )
+
+    # Test via table function
+    table_function_expr = get_creation_expression(
+        storage_type, TABLE_NAME, started_cluster_iceberg_with_spark, table_function=True
+    )
+
+    result = instance.query(
+        f"SELECT `a.b`, `c.x`, `c.y` FROM {table_function_expr}"
+    ).strip()
+    assert result == "['a','b','c']\t[1,2]\t['p','q']", \
+        f"Unexpected result via table function: {result}"
+
+    # Test via table engine
+    create_iceberg_table(storage_type, instance, TABLE_NAME, started_cluster_iceberg_with_spark)
+
+    result = instance.query(
+        f"SELECT `a.b`, `c.x`, `c.y` FROM {TABLE_NAME}"
+    ).strip()
+    assert result == "['a','b','c']\t[1,2]\t['p','q']", \
+        f"Unexpected result via table engine: {result}"
diff --git a/tests/queries/0_stateless/04259_dotted_array_not_nested.reference b/tests/queries/0_stateless/04259_dotted_array_not_nested.reference
new file mode 100644
index 000000000000..e6c5c3fcae88
--- /dev/null
+++ b/tests/queries/0_stateless/04259_dotted_array_not_nested.reference
@@ -0,0 +1,3 @@
+['a','b','c']
+['a','b','c']
+[1,2]	['p','q']
diff --git a/tests/queries/0_stateless/04259_dotted_array_not_nested.sql b/tests/queries/0_stateless/04259_dotted_array_not_nested.sql
new file mode 100644
index 000000000000..ba9396f11826
--- /dev/null
+++ b/tests/queries/0_stateless/04259_dotted_array_not_nested.sql
@@ -0,0 +1,18 @@
+-- Regression test for #90731.
+-- A lone Array(T) column with a dot in its name must not be collapsed into
+-- a synthetic Nested structure and must be readable as a plain array.
+
+CREATE TABLE t1 (`a.b` Array(String)) ENGINE = Memory;
+INSERT INTO t1 VALUES (['a','b','c']);
+SELECT `a.b` FROM t1;
+
+-- In a mixed table, the lone dotted column must not interfere with the
+-- genuine flat-Nested group (c.x / c.y share prefix 'c').
+CREATE TABLE t2 (`a.b` Array(String), `c.x` Array(Int32), `c.y` Array(String))
+    ENGINE = Memory;
+INSERT INTO t2 VALUES (['a','b','c'], [1,2], ['p','q']);
+SELECT `a.b` FROM t2;
+SELECT `c.x`, `c.y` FROM t2;
+
+DROP TABLE t1;
+DROP TABLE t2;
diff --git a/tests/queries/0_stateless/04260_dotted_column_in_hints.reference b/tests/queries/0_stateless/04260_dotted_column_in_hints.reference
new file mode 100644
index 000000000000..9766475a4185
--- /dev/null
+++ b/tests/queries/0_stateless/04260_dotted_column_in_hints.reference
@@ -0,0 +1 @@
+ok
diff --git a/tests/queries/0_stateless/04260_dotted_column_in_hints.sh b/tests/queries/0_stateless/04260_dotted_column_in_hints.sh
new file mode 100755
index 000000000000..a7cfe12dee4b
--- /dev/null
+++ b/tests/queries/0_stateless/04260_dotted_column_in_hints.sh
@@ -0,0 +1,19 @@
+#!/usr/bin/env bash
+# Regression test for #90731.
+# ColumnsDescription::getAllRegisteredNames must include columns whose names
+# contain a dot, so they appear in IHints suggestions after a typo.
+
+CURDIR=$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd)
+# shellcheck source=../shell_config.sh
+. "$CURDIR"/../shell_config.sh
+
+$CLICKHOUSE_CLIENT -q "
+    CREATE TABLE t_dotted_hint (\`a.b\` Array(String))
+    ENGINE = MergeTree ORDER BY tuple();
+"
+
+# Misspell the column name; the error message must suggest the real name 'a.b'.
+$CLICKHOUSE_CLIENT -q "ALTER TABLE t_dotted_hint MODIFY COLUMN a_b Array(String);" 2>&1 \
+    | grep -qF "a.b" && echo "ok" || echo "FAIL"
+
+$CLICKHOUSE_CLIENT -q "DROP TABLE t_dotted_hint;"