Skip to content
Merged
Show file tree
Hide file tree
Changes from 6 commits
Commits
Show all changes
43 commits
Select commit Hold shift + click to select a range
87b438d
add json_to_variant
harshmotw-db Jun 25, 2025
a946ac6
comment fix
harshmotw-db Jun 25, 2025
bca3b81
Added another sample buffer manager
harshmotw-db Jun 25, 2025
339e880
minor refactoring
harshmotw-db Jun 25, 2025
fe798c3
Merge branch 'main' of https://github.com/harshmotw-db/arrow-rs into …
harshmotw-db Jun 25, 2025
882d3a7
Merge branch 'main' of https://github.com/harshmotw-db/arrow-rs into …
harshmotw-db Jun 25, 2025
3c18fdf
incorporated new changes
harshmotw-db Jun 25, 2025
67a83fe
minor changes
harshmotw-db Jun 26, 2025
c9aa519
Merge branch 'main' of https://github.com/harshmotw-db/arrow-rs into …
harshmotw-db Jun 26, 2025
dede88d
test fix based on recent commit
harshmotw-db Jun 26, 2025
fa3befc
constant fix
harshmotw-db Jun 26, 2025
38bac59
fix
harshmotw-db Jun 26, 2025
cd530ee
addressed comments
harshmotw-db Jun 26, 2025
57b3eb0
fix
harshmotw-db Jun 26, 2025
71b7d6f
fixed VariantBufferManager
harshmotw-db Jun 26, 2025
031c916
deduped a bit of code
harshmotw-db Jun 26, 2025
d4fc876
deduplicated code
harshmotw-db Jun 26, 2025
c41af4e
moved serde code out of variant.rs
harshmotw-db Jun 26, 2025
ecaf557
incorporated Ryan's latest comments
harshmotw-db Jun 26, 2025
4abc598
add more object tests
harshmotw-db Jun 26, 2025
0842ef8
fix
harshmotw-db Jun 26, 2025
94531af
doc fix
harshmotw-db Jun 26, 2025
28d0012
Merge branch 'main' into harsh-motwani_data/from_json
harshmotw-db Jun 27, 2025
e2788f5
Removed dependency on VariantBufferManager and leave that to later
harshmotw-db Jun 27, 2025
3178449
Merge branch 'harsh-motwani_data/from_json' of https://github.com/har…
harshmotw-db Jun 27, 2025
0455685
fix
harshmotw-db Jun 27, 2025
cc0b66e
fix
harshmotw-db Jun 30, 2025
d2a7516
resolved test comment
harshmotw-db Jun 30, 2025
a29b5c3
fixed more comments
harshmotw-db Jun 30, 2025
7f23cf5
fix decimal to string
harshmotw-db Jul 1, 2025
50f4b25
fmt and clippy
harshmotw-db Jul 1, 2025
560e430
fix
harshmotw-db Jul 1, 2025
3249d93
Merge remote-tracking branch 'apache/main' into harsh-motwani_data/fr…
alamb Jul 1, 2025
07d5688
clippy
alamb Jul 1, 2025
af937ac
Split tests into separate functions
alamb Jul 1, 2025
388f188
partially removed decimal dependency
harshmotw-db Jul 2, 2025
7407776
merge
harshmotw-db Jul 2, 2025
3b42d91
removed decimal type from variant json parsing
harshmotw-db Jul 2, 2025
43d6ea5
comment fix
harshmotw-db Jul 2, 2025
e9deda9
refined lifetimes
harshmotw-db Jul 3, 2025
eb11890
Fix clippy, remove unused dependency
alamb Jul 3, 2025
3531540
Merge remote-tracking branch 'apache/main' into harsh-motwani_data/fr…
alamb Jul 3, 2025
ea5b573
Update parquet-variant/src/from_json.rs
alamb Jul 3, 2025
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
39 changes: 13 additions & 26 deletions parquet-variant/examples/variant_from_json_examples.rs
Original file line number Diff line number Diff line change
Expand Up @@ -19,20 +19,26 @@

use parquet_variant::{
json_to_variant, variant_to_json, variant_to_json_string, variant_to_json_value,
SampleBoxBasedVariantBufferManager, SampleVecBasedVariantBufferManager, VariantBufferManager,
SampleVecBasedVariantBufferManager,
};

fn from_json_example<T: VariantBufferManager>(
variant_buffer_manager: &mut T,
) -> Result<(), Box<dyn std::error::Error>> {
fn main() -> Result<(), Box<dyn std::error::Error>> {
Comment thread
harshmotw-db marked this conversation as resolved.
// The caller must provide an object implementing the `VariantBufferManager` trait to the library.
// This allows the library to write the constructed variant to buffers provided by the caller.
// This way, the caller has direct control over the output buffers.
let mut variant_buffer_manager = SampleVecBasedVariantBufferManager {
value_buffer: vec![0u8; 1],
metadata_buffer: vec![0u8; 1],
};

let person_string = "{\"name\":\"Alice\", \"age\":30, ".to_string()
+ "\"email\":\"[email protected]\", \"is_active\": true, \"score\": 95.7,"
+ "\"additional_info\": null}";
let (metadata_size, value_size) = json_to_variant(&person_string, variant_buffer_manager)?;
let (metadata_size, value_size) = json_to_variant(&person_string, &mut variant_buffer_manager)?;

let variant = parquet_variant::Variant::try_new(
&variant_buffer_manager.get_immutable_metadata_buffer()[..metadata_size],
&variant_buffer_manager.get_immutable_value_buffer()[..value_size],
&variant_buffer_manager.metadata_buffer[..metadata_size],
&variant_buffer_manager.value_buffer[..value_size],
)?;

let json_string = variant_to_json_string(&variant)?;
Expand All @@ -50,22 +56,3 @@ fn from_json_example<T: VariantBufferManager>(

Ok(())
}

/// Runs the JSON-to-variant example once with each sample buffer manager.
fn main() -> Result<(), Box<dyn std::error::Error>> {
    // The library writes the constructed variant into buffers supplied by the caller, so the
    // caller has to hand in an object implementing the `VariantBufferManager` trait. That keeps
    // the output buffers under the caller's direct control.
    let mut boxed_manager = SampleBoxBasedVariantBufferManager {
        metadata_buffer: vec![0u8; 1].into_boxed_slice(),
        value_buffer: vec![0u8; 1].into_boxed_slice(),
    };

    let mut vec_manager = SampleVecBasedVariantBufferManager {
        metadata_buffer: vec![0u8; 1],
        value_buffer: vec![0u8; 1],
    };

    // Box-backed manager first, then the Vec-backed one; the first failure aborts the run.
    from_json_example(&mut boxed_manager)?;
    from_json_example(&mut vec_manager)
}
36 changes: 36 additions & 0 deletions parquet-variant/src/builder.rs
Original file line number Diff line number Diff line change
Expand Up @@ -721,6 +721,42 @@ impl<'a, 'b> ObjectBuilder<'a, 'b> {
}
}

pub(crate) trait AppendVariantHelper {
Comment thread
harshmotw-db marked this conversation as resolved.
Outdated
fn append_value_helper<'m, 'd, T: Into<Variant<'m, 'd>>>(&mut self, value: T);

fn new_list_helper(&mut self) -> ListBuilder;

fn new_object_helper(&mut self) -> ObjectBuilder;
}
Comment thread
harshmotw-db marked this conversation as resolved.
Outdated

/// Lets a [`ListBuilder`] be driven through the [`AppendVariantHelper`] interface by
/// forwarding each helper to the builder's method of the corresponding name.
impl AppendVariantHelper for ListBuilder<'_> {
    /// Forwards to `ListBuilder::append_value`.
    fn append_value_helper<'m, 'd, T: Into<Variant<'m, 'd>>>(&mut self, value: T) {
        Self::append_value(self, value);
    }

    /// Forwards to `ListBuilder::new_list`.
    fn new_list_helper(&mut self) -> ListBuilder {
        Self::new_list(self)
    }

    /// Forwards to `ListBuilder::new_object`.
    fn new_object_helper(&mut self) -> ObjectBuilder {
        Self::new_object(self)
    }
}

/// Lets a top-level [`VariantBuilder`] be driven through the [`AppendVariantHelper`]
/// interface by forwarding each helper to the builder's method of the corresponding name.
impl AppendVariantHelper for VariantBuilder {
    /// Forwards to `VariantBuilder::append_value`.
    fn append_value_helper<'m, 'd, T: Into<Variant<'m, 'd>>>(&mut self, value: T) {
        Self::append_value(self, value);
    }

    /// Forwards to `VariantBuilder::new_list`.
    fn new_list_helper(&mut self) -> ListBuilder {
        Self::new_list(self)
    }

    /// Forwards to `VariantBuilder::new_object`.
    fn new_object_helper(&mut self) -> ObjectBuilder {
        Self::new_object(self)
    }
}

#[cfg(test)]
mod tests {
use super::*;
Expand Down
137 changes: 89 additions & 48 deletions parquet-variant/src/from_json.rs
Comment thread
alamb marked this conversation as resolved.
Original file line number Diff line number Diff line change
@@ -1,14 +1,16 @@
pub use crate::variant::{VariantDecimal4, VariantDecimal8};
use crate::variant_buffer_manager::VariantBufferManager;
use crate::{ListBuilder, ObjectBuilder, Variant, VariantBuilder};
use crate::{AppendVariantHelper, ListBuilder, ObjectBuilder, Variant, VariantBuilder};
use arrow_schema::ArrowError;
use serde_json::{Map, Value};
use rust_decimal::prelude::*;
use serde_json::{Map, Number, Value};

/// Eventually, internal writes should also be performed using VariantBufferManager instead of
/// ValueBuffer and MetadataBuffer so the caller has control of the memory.
/// Returns a pair <metadata_size, value_size>
pub fn json_to_variant<T: VariantBufferManager>(
pub fn json_to_variant(
json: &str,
variant_buffer_manager: &mut T,
variant_buffer_manager: &mut impl VariantBufferManager,
) -> Result<(usize, usize), ArrowError> {
let mut builder = VariantBuilder::new();
let json: Value = serde_json::from_str(json)
Expand All @@ -21,32 +23,81 @@ pub fn json_to_variant<T: VariantBufferManager>(

// Write to caller's buffers - Remove this when the library internally writes to the caller's
// buffers anyway
variant_buffer_manager.ensure_metadata_buffer_size(metadata_size)?;
variant_buffer_manager.ensure_value_buffer_size(value_size)?;

let caller_metadata_buffer = variant_buffer_manager.borrow_metadata_buffer();
let caller_metadata_buffer =
variant_buffer_manager.ensure_size_and_borrow_metadata_buffer(metadata_size)?;
caller_metadata_buffer[..metadata_size].copy_from_slice(metadata.as_slice());
let caller_value_buffer = variant_buffer_manager.borrow_value_buffer();
let caller_value_buffer =
variant_buffer_manager.ensure_size_and_borrow_value_buffer(value_size)?;
caller_value_buffer[..value_size].copy_from_slice(value.as_slice());
Ok((metadata_size, value_size))
}

/// Appends the parsed JSON value `json` to the top-level `builder`.
///
/// Thin wrapper over `append_json`; any error it reports is returned unchanged.
fn build_json(json: &Value, builder: &mut VariantBuilder) -> Result<(), ArrowError> {
    append_json(json, builder)
}

fn variant_from_number<'a, 'b>(n: &Number) -> Result<Variant<'a, 'b>, ArrowError> {
Comment thread
harshmotw-db marked this conversation as resolved.
Outdated
if let Some(i) = n.as_i64() {
// Find minimum Integer width to fit
if i as i8 as i64 == i {
Ok((i as i8).into())
} else if i as i16 as i64 == i {
Ok((i as i16).into())
} else if i as i32 as i64 == i {
Ok((i as i32).into())
} else {
Ok(i.into())
}
} else {
// Try decimal
// TODO: Replace with custom decimal parsing as the rust_decimal library only supports
Copy link
Copy Markdown
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

we should look at what arrow-json does for this

// a max unscaled value of 2^96.
match Decimal::from_str_exact(n.as_str()) {
Ok(dec) => {
let unscaled: i128 = dec.mantissa();
let scale = dec.scale() as u8;
if unscaled.abs() <= VariantDecimal4::MAX_UNSCALED_VALUE as i128
&& scale <= VariantDecimal4::MAX_PRECISION as u8
{
(unscaled as i32, scale).try_into()
} else if unscaled.abs() <= VariantDecimal8::MAX_UNSCALED_VALUE as i128
&& scale <= VariantDecimal8::MAX_PRECISION as u8
{
(unscaled as i64, scale).try_into()
} else {
(unscaled, scale).try_into()
}
}
Err(_) => {
// Try double
match n.as_f64() {
Some(f) => return Ok(f.into()),
None => Err(ArrowError::InvalidArgumentError(format!(
"Failed to parse {} as number",
n.as_str()
))),
}?
}
}
}
}

fn append_json(json: &Value, builder: &mut impl AppendVariantHelper) -> Result<(), ArrowError> {
match json {
Value::Null => builder.append_value(Variant::Null),
Value::Bool(b) => builder.append_value(*b),
Value::Null => builder.append_value_helper(Variant::Null),
Value::Bool(b) => builder.append_value_helper(*b),
Value::Number(n) => {
let v: Variant = n.try_into()?;
builder.append_value(v)
builder.append_value_helper(variant_from_number(n)?);
}
Comment thread
harshmotw-db marked this conversation as resolved.
Value::String(s) => builder.append_value(s.as_str()),
Value::String(s) => builder.append_value_helper(s.as_str()),
Value::Array(arr) => {
let mut list_builder = builder.new_list();
let mut list_builder = builder.new_list_helper();
build_list(arr, &mut list_builder)?;
Comment thread
harshmotw-db marked this conversation as resolved.
Outdated
list_builder.finish();
}
Value::Object(obj) => {
let mut obj_builder = builder.new_object();
let mut obj_builder = builder.new_object_helper();
build_object(obj, &mut obj_builder)?;
Comment thread
harshmotw-db marked this conversation as resolved.
Outdated
obj_builder.finish();
}
Expand All @@ -56,22 +107,7 @@ fn build_json(json: &Value, builder: &mut VariantBuilder) -> Result<(), ArrowErr

fn build_list(arr: &[Value], builder: &mut ListBuilder) -> Result<(), ArrowError> {
for val in arr {
match val {
Value::Null => builder.append_value(Variant::Null),
Value::Bool(b) => builder.append_value(*b),
Value::Number(n) => builder.append_value(Variant::try_from(n)?),
Value::String(s) => builder.append_value(s.as_str()),
Value::Array(arr) => {
let mut list_builder = builder.new_list();
build_list(arr, &mut list_builder)?;
list_builder.finish()
}
Value::Object(obj) => {
let mut obj_builder = builder.new_object();
build_object(obj, &mut obj_builder)?;
obj_builder.finish();
}
}
append_json(val, builder)?;
}
Ok(())
}
Expand All @@ -81,22 +117,27 @@ fn build_object<'a, 'b>(
builder: &mut ObjectBuilder<'a, 'b>,
) -> Result<(), ArrowError> {
for (key, value) in obj.iter() {
match value {
Value::Null => builder.insert(key, Variant::Null),
Value::Bool(b) => builder.insert(key, *b),
Value::Number(n) => builder.insert(key, Variant::try_from(n)?),
Value::String(s) => builder.insert(key, s.as_str()),
Value::Array(arr) => {
let mut list_builder = builder.new_list(key);
build_list(arr, &mut list_builder)?;
list_builder.finish()
}
Value::Object(obj) => {
let mut obj_builder = builder.new_object(key);
build_object(obj, &mut obj_builder)?;
obj_builder.finish();
}
}
let mut field_builder = ObjectFieldBuilder { key, builder };
append_json(value, &mut field_builder)?;
}
Ok(())
}

struct ObjectFieldBuilder<'a, 'b, 'c> {
key: &'a str,
builder: &'b mut ObjectBuilder<'c, 'a>,
Copy link
Copy Markdown
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

This is hard to interpret... can we use 'm and 'v?

Suggested change
struct ObjectFieldBuilder<'a, 'b, 'c> {
key: &'a str,
builder: &'b mut ObjectBuilder<'c, 'a>,
struct ObjectFieldBuilder<'m, 'v, 'r> {
key: &'r str,
builder: &'r mut ObjectBuilder<'m, 'v>,

(here, 'r is the lifetime of the references used to construct the field builder)

Copy link
Copy Markdown
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

I've changed it to 's, 'o and 'v, where 's is the lifetime of the string, 'o is the lifetime of [ObjectBuilder] and 'v is the lifetime of the variant buffers.

}

impl AppendVariantHelper for ObjectFieldBuilder<'_, '_, '_> {
fn append_value_helper<'m, 'd, T: Into<Variant<'m, 'd>>>(&mut self, value: T) {
self.builder.insert(self.key, value);
}

fn new_list_helper(&mut self) -> ListBuilder {
self.builder.new_list(self.key)
}

fn new_object_helper(&mut self) -> ObjectBuilder {
self.builder.new_object(self.key)
}
}
4 changes: 1 addition & 3 deletions parquet-variant/src/lib.rs
Original file line number Diff line number Diff line change
Expand Up @@ -43,6 +43,4 @@ pub use builder::*;
pub use from_json::json_to_variant;
pub use to_json::{variant_to_json, variant_to_json_string, variant_to_json_value};
pub use variant::*;
pub use variant_buffer_manager::{
SampleBoxBasedVariantBufferManager, SampleVecBasedVariantBufferManager, VariantBufferManager,
};
pub use variant_buffer_manager::{SampleVecBasedVariantBufferManager, VariantBufferManager};
52 changes: 0 additions & 52 deletions parquet-variant/src/variant.rs
Original file line number Diff line number Diff line change
Expand Up @@ -32,8 +32,6 @@ mod decimal;
mod list;
mod metadata;
mod object;
use rust_decimal::prelude::*;
use serde_json::Number;

const MAX_SHORT_STRING_BYTES: usize = 0x3F;

Expand Down Expand Up @@ -943,56 +941,6 @@ impl From<VariantDecimal16> for Variant<'_, '_> {
}
}

/// Converts a JSON number into a [`Variant`], trying representations in this order:
///
/// 1. If the number is representable as `i64`, the narrowest of `i8`/`i16`/`i32`/`i64`
///    that holds it.
/// 2. Otherwise, if its text parses as an exact decimal, the smallest decimal conversion
///    (`i32`-, `i64`- or `i128`-backed unscaled value) whose limits it satisfies.
/// 3. Otherwise, an `f64`.
///
/// # Errors
///
/// Returns whatever error the chosen decimal conversion reports, or
/// `ArrowError::InvalidArgumentError` when the number cannot be read as an `f64` either.
impl TryFrom<&Number> for Variant<'_, '_> {
    type Error = ArrowError;

    fn try_from(n: &Number) -> Result<Self, Self::Error> {
        // Integer path: pick the minimum integer width that holds the value.
        if let Some(int) = n.as_i64() {
            let variant: Self = if let Ok(narrow) = i8::try_from(int) {
                narrow.into()
            } else if let Ok(narrow) = i16::try_from(int) {
                narrow.into()
            } else if let Ok(narrow) = i32::try_from(int) {
                narrow.into()
            } else {
                int.into()
            };
            return Ok(variant);
        }

        // Decimal path.
        // TODO: Replace with custom decimal parsing as the rust_decimal library only supports
        // a max unscaled value of 2^96.
        if let Ok(decimal) = Decimal::from_str_exact(n.as_str()) {
            let mantissa: i128 = decimal.mantissa();
            let scale = decimal.scale() as u8;
            let magnitude = mantissa.abs();

            return if magnitude <= VariantDecimal4::MAX_UNSCALED_VALUE as i128
                && scale <= VariantDecimal4::MAX_PRECISION as u8
            {
                (mantissa as i32, scale).try_into()
            } else if magnitude <= VariantDecimal8::MAX_UNSCALED_VALUE as i128
                && scale <= VariantDecimal8::MAX_PRECISION as u8
            {
                (mantissa as i64, scale).try_into()
            } else {
                (mantissa, scale).try_into()
            };
        }

        // Last resort: double.
        match n.as_f64() {
            Some(float) => Ok(float.into()),
            None => Err(ArrowError::InvalidArgumentError(format!(
                "Failed to parse {} as number",
                n.as_str()
            ))),
        }
    }
}

impl From<f32> for Variant<'_, '_> {
fn from(value: f32) -> Self {
Variant::Float(value)
Expand Down
Loading