From 93ed81292259173c693a748ee879171a57d670b3 Mon Sep 17 00:00:00 2001 From: Sreeram Garlapati Date: Fri, 29 May 2026 20:29:01 -0700 Subject: [PATCH] fix: support Java HadoopCatalog's v<version>.metadata.json naming in MetadataLocation Java's HadoopCatalog writes metadata files as v1.metadata.json, v2.metadata.json etc., while iceberg-rust expects <version>-<uuid>.metadata.json. This causes MetadataLocation::from_str to fail when reading tables originally created by Java's HadoopCatalog that were later registered into a proper catalog (REST, Glue, DynamoDB). The fix extends parse_file_name to try the standard format first, then fall back to the v<version> format. The id field becomes Option<Uuid> (None for Hadoop format). Display faithfully reproduces the original format, but after with_next_version() the output always uses iceberg-rust's <version>-<uuid> convention since it generates a new UUID. Closes #2533 Co-authored-by: rawataaryan9 --- .../iceberg/src/catalog/metadata_location.rs | 132 ++++++++++++++---- 1 file changed, 103 insertions(+), 29 deletions(-) diff --git a/crates/iceberg/src/catalog/metadata_location.rs b/crates/iceberg/src/catalog/metadata_location.rs index acd041d5e1..36cf8f2398 100644 --- a/crates/iceberg/src/catalog/metadata_location.rs +++ b/crates/iceberg/src/catalog/metadata_location.rs @@ -26,12 +26,15 @@ use crate::spec::{TableMetadata, parse_metadata_file_compression}; use crate::{Error, ErrorKind, Result}; /// Helper for parsing a location of the format: `<location>/metadata/<version>-<uuid>.metadata.json` -/// or with compression: `<location>/metadata/<version>-<uuid>.gz.metadata.json` +/// or with compression: `<location>/metadata/<version>-<uuid>.gz.metadata.json`. +/// +/// Also supports Java HadoopCatalog's naming convention: `<location>/metadata/v<version>.metadata.json`. #[derive(Clone, Debug, PartialEq)] pub struct MetadataLocation { table_location: String, version: i32, - id: Uuid, + /// `None` for Java HadoopCatalog's `v<version>.metadata.json` format. 
+ id: Option<Uuid>, compression_codec: CompressionCodec, } @@ -52,7 +55,7 @@ impl MetadataLocation { Self { table_location: table_location.to_string(), version: 0, - id: Uuid::new_v4(), + id: Some(Uuid::new_v4()), compression_codec: CompressionCodec::None, } } @@ -64,7 +67,7 @@ impl MetadataLocation { Self { table_location: table_location.to_string(), version: 0, - id: Uuid::new_v4(), + id: Some(Uuid::new_v4()), compression_codec: Self::compression_from_properties(metadata.properties()), } } @@ -75,7 +78,7 @@ impl MetadataLocation { Self { table_location: self.table_location.clone(), version: self.version + 1, - id: Uuid::new_v4(), + id: Some(Uuid::new_v4()), compression_codec: self.compression_codec, } } @@ -104,10 +107,9 @@ impl MetadataLocation { Ok(prefix.to_string()) } - /// Parses a file name of the format `<version>-<uuid>.metadata.json` - /// or with compression: `<version>-<uuid>.gz.metadata.json`. - /// Parse errors for compression codec result in CompressionCodec::None. - fn parse_file_name(file_name: &str) -> Result<(i32, Uuid, CompressionCodec)> { + /// Parses a file name of the format `<version>-<uuid>.metadata.json`, + /// `<version>-<uuid>.gz.metadata.json`, or Java HadoopCatalog's `v<version>.metadata.json`. 
+ fn parse_file_name(file_name: &str) -> Result<(i32, Option<Uuid>, CompressionCodec)> { let stripped = file_name.strip_suffix(".metadata.json").ok_or(Error::new( ErrorKind::Unexpected, format!("Invalid metadata file ending: {file_name}"), @@ -121,15 +123,23 @@ impl MetadataLocation { (stripped, CompressionCodec::None) }; - let (version, id) = stripped.split_once('-').ok_or(Error::new( + // Try standard iceberg-rust format: <version>-<uuid> + if let Some((version_str, id_str)) = stripped.split_once('-') + && let (Ok(version), Ok(id)) = (version_str.parse::<i32>(), Uuid::parse_str(id_str)) + { + return Ok((version, Some(id), compression_codec)); + } + + // Try Java HadoopCatalog format: v<version> + if let Some(version_str) = stripped.strip_prefix('v') + && let Ok(version) = version_str.parse::<i32>() + { + return Ok((version, None, compression_codec)); + } + + Err(Error::new( ErrorKind::Unexpected, format!("Invalid metadata file name format: {file_name}"), - ))?; - - Ok(( - version.parse::<i32>()?, - Uuid::parse_str(id)?, - compression_codec, )) } } @@ -137,11 +147,18 @@ impl MetadataLocation { impl Display for MetadataLocation { fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result { let suffix = self.compression_codec.suffix().unwrap_or(""); - write!( - f, - "{}/metadata/{:0>5}-{}{}.metadata.json", - self.table_location, self.version, self.id, suffix - ) + match self.id { + Some(id) => write!( + f, + "{}/metadata/{:0>5}-{}{}.metadata.json", + self.table_location, self.version, id, suffix + ), + None => write!( + f, + "{}/metadata/v{}{}.metadata.json", + self.table_location, self.version, suffix + ), + } } } @@ -157,7 +174,7 @@ impl FromStr for MetadataLocation { let prefix = Self::parse_metadata_path_prefix(path)?; let (version, id, compression_codec) = Self::parse_file_name(file_name)?; - Ok(MetadataLocation { + Ok(Self { table_location: prefix, version, id, @@ -200,7 +217,7 @@ mod test { Ok(MetadataLocation { table_location: "".to_string(), version: 1234567, - id: 
Uuid::from_str("2cd22b57-5127-4198-92ba-e4e67c79821b").unwrap(), + id: Some(Uuid::from_str("2cd22b57-5127-4198-92ba-e4e67c79821b").unwrap()), compression_codec: CompressionCodec::None, }), ), @@ -210,7 +227,7 @@ mod test { Ok(MetadataLocation { table_location: "/abc".to_string(), version: 1234567, - id: Uuid::from_str("2cd22b57-5127-4198-92ba-e4e67c79821b").unwrap(), + id: Some(Uuid::from_str("2cd22b57-5127-4198-92ba-e4e67c79821b").unwrap()), compression_codec: CompressionCodec::None, }), ), @@ -220,7 +237,7 @@ mod test { Ok(MetadataLocation { table_location: "/abc/def".to_string(), version: 1234567, - id: Uuid::from_str("2cd22b57-5127-4198-92ba-e4e67c79821b").unwrap(), + id: Some(Uuid::from_str("2cd22b57-5127-4198-92ba-e4e67c79821b").unwrap()), compression_codec: CompressionCodec::None, }), ), @@ -230,7 +247,7 @@ mod test { Ok(MetadataLocation { table_location: "https://127.0.0.1".to_string(), version: 1234567, - id: Uuid::from_str("2cd22b57-5127-4198-92ba-e4e67c79821b").unwrap(), + id: Some(Uuid::from_str("2cd22b57-5127-4198-92ba-e4e67c79821b").unwrap()), compression_codec: CompressionCodec::None, }), ), @@ -240,7 +257,7 @@ mod test { Ok(MetadataLocation { table_location: "/abc".to_string(), version: 1234567, - id: Uuid::from_str("81056704-ce5b-41c4-bb83-eb6408081af6").unwrap(), + id: Some(Uuid::from_str("81056704-ce5b-41c4-bb83-eb6408081af6").unwrap()), compression_codec: CompressionCodec::None, }), ), @@ -250,7 +267,7 @@ mod test { Ok(MetadataLocation { table_location: "/abc".to_string(), version: 0, - id: Uuid::from_str("2cd22b57-5127-4198-92ba-e4e67c79821b").unwrap(), + id: Some(Uuid::from_str("2cd22b57-5127-4198-92ba-e4e67c79821b").unwrap()), compression_codec: CompressionCodec::None, }), ), @@ -260,7 +277,37 @@ mod test { Ok(MetadataLocation { table_location: "/abc".to_string(), version: 1234567, - id: Uuid::from_str("2cd22b57-5127-4198-92ba-e4e67c79821b").unwrap(), + id: Some(Uuid::from_str("2cd22b57-5127-4198-92ba-e4e67c79821b").unwrap()), + 
compression_codec: CompressionCodec::gzip_default(), + }), + ), + // Java HadoopCatalog format: v<version>.metadata.json + ( + "/abc/metadata/v1.metadata.json", + Ok(MetadataLocation { + table_location: "/abc".to_string(), + version: 1, + id: None, + compression_codec: CompressionCodec::None, + }), + ), + // Java HadoopCatalog with higher version + ( + "s3://bucket/warehouse/db/table/metadata/v123.metadata.json", + Ok(MetadataLocation { + table_location: "s3://bucket/warehouse/db/table".to_string(), + version: 123, + id: None, + compression_codec: CompressionCodec::None, + }), + ), + // Java HadoopCatalog with gzip + ( + "/abc/metadata/v5.gz.metadata.json", + Ok(MetadataLocation { + table_location: "/abc".to_string(), + version: 5, + id: None, compression_codec: CompressionCodec::gzip_default(), }), ), @@ -327,6 +374,33 @@ mod test { } } + #[test] + fn test_java_hadoop_format_with_next_version() { + let location = + MetadataLocation::from_str("/warehouse/db/t/metadata/v3.metadata.json").unwrap(); + assert_eq!(location.version, 3); + assert_eq!(location.id, None); + + let next = location.with_next_version(); + assert_eq!(next.version, 4); + assert!(next.id.is_some()); + + let next_str = next.to_string(); + assert!(next_str.starts_with("/warehouse/db/t/metadata/00004-")); + assert!(next_str.ends_with(".metadata.json")); + // Round-trip the new location + let reparsed = MetadataLocation::from_str(&next_str).unwrap(); + assert_eq!(reparsed.version, 4); + assert!(reparsed.id.is_some()); + } + + #[test] + fn test_java_hadoop_format_round_trip() { + let input = "s3://bucket/table/metadata/v1.metadata.json"; + let location = MetadataLocation::from_str(input).unwrap(); + assert_eq!(location.to_string(), input); + } + #[test] fn test_with_next_version_preserves_compression() { // Start from a parsed location with no compression