Skip to content
Closed
Show file tree
Hide file tree
Changes from 7 commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
1 change: 1 addition & 0 deletions Cargo.toml
Original file line number Diff line number Diff line change
Expand Up @@ -37,6 +37,7 @@ members = [
"arrow-schema",
"arrow-select",
"arrow-string",
"arrow-variant",
"parquet",
"parquet_derive",
"parquet_derive_test",
Expand Down
5 changes: 5 additions & 0 deletions arrow-schema/src/error.rs
Original file line number Diff line number Diff line change
Expand Up @@ -60,6 +60,8 @@ pub enum ArrowError {
DictionaryKeyOverflowError,
/// Error when the run end index in a REE array is bigger than the array length
RunEndIndexOverflowError,
/// Error during Variant operations in `arrow-variant`.
Copy link
Copy Markdown
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

If we add a new variant to this enum, it will be a "breaking API change", because downstream projects may then have to update their code to handle the new variant.

We make releases with API changes every three months:
https://github.com/apache/arrow-rs?tab=readme-ov-file#release-versioning-and-schedule

So in other words, it would be great to remove this change from the PR so we can merge it faster.

VariantError(String),
}

impl ArrowError {
Expand Down Expand Up @@ -126,6 +128,9 @@ impl Display for ArrowError {
ArrowError::RunEndIndexOverflowError => {
write!(f, "Run end encoded array index overflow error")
}
ArrowError::VariantError(desc) => {
write!(f, "Variant error: {desc}")
}
}
}
}
Expand Down
12 changes: 12 additions & 0 deletions arrow-schema/src/extension/canonical/mod.rs
Original file line number Diff line number Diff line change
Expand Up @@ -37,6 +37,8 @@ mod uuid;
pub use uuid::Uuid;
mod variable_shape_tensor;
pub use variable_shape_tensor::{VariableShapeTensor, VariableShapeTensorMetadata};
mod variant;
Copy link
Copy Markdown
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

I recommend we postpone adding the canonical extension type classes until we are further along in the process and in a better position to write tests.

In other words, I recommend removing the changes in arrow-schema/src/extension/ from this PR as well.

pub use variant::Variant;

use crate::{ArrowError, Field};

Expand Down Expand Up @@ -77,6 +79,9 @@ pub enum CanonicalExtensionType {
///
/// <https://arrow.apache.org/docs/format/CanonicalExtensions.html#bit-boolean>
Bool8(Bool8),

/// The extension type for `Variant`.
Variant(Variant),
}

impl TryFrom<&Field> for CanonicalExtensionType {
Expand All @@ -93,6 +98,7 @@ impl TryFrom<&Field> for CanonicalExtensionType {
Uuid::NAME => value.try_extension_type::<Uuid>().map(Into::into),
Opaque::NAME => value.try_extension_type::<Opaque>().map(Into::into),
Bool8::NAME => value.try_extension_type::<Bool8>().map(Into::into),
Variant::NAME => value.try_extension_type::<Variant>().map(Into::into),
_ => Err(ArrowError::InvalidArgumentError(format!("Unsupported canonical extension type: {name}"))),
},
// Name missing the expected prefix
Expand Down Expand Up @@ -140,3 +146,9 @@ impl From<Bool8> for CanonicalExtensionType {
CanonicalExtensionType::Bool8(value)
}
}

impl From<Variant> for CanonicalExtensionType {
fn from(value: Variant) -> Self {
CanonicalExtensionType::Variant(value)
}
}
286 changes: 286 additions & 0 deletions arrow-schema/src/extension/canonical/variant.rs
Original file line number Diff line number Diff line change
@@ -0,0 +1,286 @@
// Licensed to the Apache Software Foundation (ASF) under one
// or more contributor license agreements. See the NOTICE file
// distributed with this work for additional information
// regarding copyright ownership. The ASF licenses this file
// to you under the Apache License, Version 2.0 (the
// "License"); you may not use this file except in compliance
// with the License. You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing,
// software distributed under the License is distributed on an
// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
// KIND, either express or implied. See the License for the
// specific language governing permissions and limitations
// under the License.

//! Variant
//!
//! <https://arrow.apache.org/docs/format/CanonicalExtensions.html#variant>
Comment thread
PinkCrow007 marked this conversation as resolved.
Outdated

use crate::{extension::ExtensionType, ArrowError, DataType};

/// The extension type for `Variant`.
///
/// Extension name: `arrow.variant`.
///
/// The storage type of this extension is a **Struct containing two binary fields**:
/// - `metadata`: Binary field containing the variant metadata
/// - `value`: Binary field containing the serialized variant data
///
/// A Variant is a flexible structure that can store **Primitives, Arrays, or Objects**.
///
/// Both the `metadata` and `value` fields are required.
///
/// <https://arrow.apache.org/docs/format/CanonicalExtensions.html#variant>
#[derive(Debug, Clone, PartialEq, Eq)]
pub struct Variant {
    /// Required binary metadata.
    metadata: Vec<u8>,
    /// Required binary value.
    value: Vec<u8>,
}

impl Variant {
    /// Creates a new `Variant` from its metadata and value bytes.
    ///
    /// No validation is performed on either buffer.
    pub fn new(metadata: Vec<u8>, value: Vec<u8>) -> Self {
        Self { metadata, value }
    }

    /// Rejects the construction of an empty `Variant`.
    ///
    /// This function never succeeds; it exists so that callers asking for an
    /// "empty" variant get an explicit error instead of a value.
    ///
    /// # Errors
    ///
    /// Always returns [`ArrowError::InvalidArgumentError`], because both the
    /// metadata and the value are required.
    pub fn empty() -> Result<Self, ArrowError> {
        Err(ArrowError::InvalidArgumentError(
            "Variant cannot be empty because metadata and value are required".to_owned(),
        ))
    }

    /// Returns the metadata as a byte slice.
    ///
    /// Note: in method-call syntax (`variant.metadata()`) this inherent method
    /// takes precedence over the `ExtensionType::metadata` trait method of the
    /// same name.
    pub fn metadata(&self) -> &[u8] {
        &self.metadata
    }

    /// Returns the value as a byte slice.
    pub fn value(&self) -> &[u8] {
        &self.value
    }

    /// Replaces the value bytes, builder-style.
    ///
    /// Consumes `self` and returns the updated `Variant`; the metadata is left
    /// untouched.
    #[must_use = "`set_value` consumes `self` and returns the updated `Variant`"]
    pub fn set_value(mut self, value: Vec<u8>) -> Self {
        self.value = value;
        self
    }
}

/// Extension-type plumbing for [`Variant`].
///
/// The extension carries no parameters: its serialized metadata is always the
/// empty string, and the only thing validated is the shape of the storage type.
impl ExtensionType for Variant {
    const NAME: &'static str = "arrow.variant";

    type Metadata = &'static str;

    /// Returns a reference to the (always empty) extension metadata.
    fn metadata(&self) -> &Self::Metadata {
        &""
    }

    /// Serializes the extension metadata, which is always `Some("")`.
    fn serialize_metadata(&self) -> Option<String> {
        Some(String::new())
    }

    /// Accepts only `Some("")`; a missing or non-empty metadata string is an
    /// [`ArrowError::InvalidArgumentError`].
    fn deserialize_metadata(metadata: Option<&str>) -> Result<Self::Metadata, ArrowError> {
        match metadata {
            Some(text) if text.is_empty() => Ok(""),
            _ => Err(ArrowError::InvalidArgumentError(
                "Variant extension type expects an empty string as metadata".to_owned(),
            )),
        }
    }

    /// Checks that `data_type` is a valid storage type for this extension.
    ///
    /// The checks run in this order, and the first failure is reported:
    /// 1. the type is a `Struct`,
    /// 2. it has exactly two fields,
    /// 3. a field named `metadata` exists,
    /// 4. a field named `value` exists,
    /// 5. both fields are `Binary`, or both are `LargeBinary`,
    /// 6. neither field is nullable.
    fn supports_data_type(&self, data_type: &DataType) -> Result<(), ArrowError> {
        let DataType::Struct(fields) = data_type else {
            return Err(ArrowError::InvalidArgumentError(format!(
                "Variant data type mismatch, expected Struct, found {data_type}"
            )));
        };

        if fields.len() != 2 {
            return Err(ArrowError::InvalidArgumentError(
                "Variant struct must have exactly two fields".to_owned(),
            ));
        }

        let Some(metadata_field) = fields.iter().find(|field| field.name() == "metadata") else {
            return Err(ArrowError::InvalidArgumentError(
                "Variant struct must have a field named 'metadata'".to_owned(),
            ));
        };

        let Some(value_field) = fields.iter().find(|field| field.name() == "value") else {
            return Err(ArrowError::InvalidArgumentError(
                "Variant struct must have a field named 'value'".to_owned(),
            ));
        };

        // Mixed widths (e.g. Binary + LargeBinary) are deliberately rejected.
        let same_binary_kind = matches!(
            (metadata_field.data_type(), value_field.data_type()),
            (DataType::Binary, DataType::Binary) | (DataType::LargeBinary, DataType::LargeBinary)
        );
        if !same_binary_kind {
            return Err(ArrowError::InvalidArgumentError(
                "Variant struct fields must both be Binary or LargeBinary".to_owned(),
            ));
        }

        if metadata_field.is_nullable() || value_field.is_nullable() {
            return Err(ArrowError::InvalidArgumentError(
                "Variant struct fields must not be nullable".to_owned(),
            ));
        }

        Ok(())
    }

    /// Builds a `Variant` with empty metadata and value buffers, provided
    /// `data_type` passes [`Self::supports_data_type`]. The `_metadata`
    /// argument is ignored.
    fn try_new(data_type: &DataType, _metadata: Self::Metadata) -> Result<Self, ArrowError> {
        let candidate = Variant::new(Vec::new(), Vec::new());
        candidate.supports_data_type(data_type)?;
        Ok(candidate)
    }
}

#[cfg(test)]
mod tests {
    #[cfg(feature = "canonical_extension_types")]
    use crate::extension::CanonicalExtensionType;
    use crate::{
        extension::{EXTENSION_TYPE_METADATA_KEY, EXTENSION_TYPE_NAME_KEY},
        DataType, Field,
    };

    use super::*;

    /// Builds a struct type with `metadata` and `value` children that both use
    /// `storage`. Only the nullability of `metadata` is configurable; `value`
    /// is always non-nullable.
    fn variant_struct(storage: DataType, metadata_nullable: bool) -> DataType {
        let children = vec![
            Field::new("metadata", storage.clone(), metadata_nullable),
            Field::new("value", storage, false),
        ];
        DataType::Struct(children.into())
    }

    /// A well-formed field round-trips through the extension type API.
    #[test]
    fn valid() -> Result<(), ArrowError> {
        let variant = Variant::new(Vec::new(), Vec::new());
        let mut field = Field::new("", variant_struct(DataType::Binary, false), false);

        field.try_with_extension_type(variant.clone())?;
        field.try_extension_type::<Variant>()?;

        #[cfg(feature = "canonical_extension_types")]
        assert_eq!(
            field.try_canonical_extension_type()?,
            CanonicalExtensionType::Variant(variant)
        );

        Ok(())
    }

    /// Extension metadata without an extension name must be rejected.
    #[test]
    #[should_panic(expected = "Field extension type name missing")]
    fn missing_name() {
        let only_metadata = [(EXTENSION_TYPE_METADATA_KEY.to_owned(), "".to_owned())];
        let field = Field::new("", variant_struct(DataType::Binary, false), false)
            .with_metadata(only_metadata.into_iter().collect());
        field.extension_type::<Variant>();
    }

    /// A non-struct storage type must be rejected.
    #[test]
    #[should_panic(expected = "Variant data type mismatch")]
    fn invalid_type() {
        let variant = Variant::new(vec![], vec![]);
        Field::new("", DataType::Int8, false).with_extension_type(variant);
    }

    /// Non-empty extension metadata must be rejected.
    #[test]
    #[should_panic(expected = "Variant extension type expects an empty string as metadata")]
    fn invalid_metadata() {
        let entries = [
            (EXTENSION_TYPE_NAME_KEY.to_owned(), Variant::NAME.to_owned()),
            (
                EXTENSION_TYPE_METADATA_KEY.to_owned(),
                "non-empty".to_owned(),
            ),
        ];
        let field = Field::new("", variant_struct(DataType::Binary, false), false)
            .with_metadata(entries.into_iter().collect());
        field.extension_type::<Variant>();
    }

    /// Exercises `supports_data_type` against accepted and rejected storage types.
    #[test]
    fn variant_supports_valid_data_types() {
        let probe = Variant::new(vec![1], vec![2]);

        // Accepted: both children Binary, or both LargeBinary, non-nullable.
        let accepted = [
            variant_struct(DataType::Binary, false),
            variant_struct(DataType::LargeBinary, false),
        ];
        for candidate in &accepted {
            assert!(probe.supports_data_type(candidate).is_ok());
        }

        // Rejected: non-struct, wrong arity, wrong names, nullable child.
        let rejected = [
            DataType::Utf8,
            DataType::Struct(vec![Field::new("single", DataType::Binary, false)].into()),
            DataType::Struct(
                vec![
                    Field::new("wrong1", DataType::Binary, false),
                    Field::new("wrong2", DataType::Binary, false),
                ]
                .into(),
            ),
            variant_struct(DataType::Binary, true),
        ];
        for candidate in &rejected {
            assert!(probe.supports_data_type(candidate).is_err());
        }
    }
}
Loading
Loading