From 158146a2f08496155f8eca798bbd299dacf0d893 Mon Sep 17 00:00:00 2001 From: Jia Yu Date: Thu, 2 Apr 2026 00:29:53 -0700 Subject: [PATCH 01/10] [GH-2824] Add sedonainfo data source for GeoTIFF metadata Add a new Spark DataSourceV2 that returns GeoTIFF file metadata without decoding pixel data, similar to gdalinfo. Usage: spark.read.format("sedonainfo").load("/path/to/*.tif") Returns one row per file with: path, driver, fileSize, width, height, numBands, srid, crs, geoTransform, cornerCoordinates, bands (array with dataType, noData, blockSize, colorInterpretation), overviews, metadata, isTiled, and compression. Supports glob patterns, directory recursion, LIMIT pushdown, and column pruning. --- ...pache.spark.sql.sources.DataSourceRegister | 1 + .../GeoTiffMetadataDataSource.scala | 91 ++++++ .../GeoTiffMetadataPartitionReader.scala | 306 ++++++++++++++++++ ...eoTiffMetadataPartitionReaderFactory.scala | 55 ++++ .../GeoTiffMetadataScanBuilder.scala | 135 ++++++++ .../GeoTiffMetadataTable.scala | 114 +++++++ .../sedona/sql/geotiffMetadataTest.scala | 184 +++++++++++ 7 files changed, 886 insertions(+) create mode 100644 spark/common/src/main/scala/org/apache/spark/sql/sedona_sql/io/geotiffmetadata/GeoTiffMetadataDataSource.scala create mode 100644 spark/common/src/main/scala/org/apache/spark/sql/sedona_sql/io/geotiffmetadata/GeoTiffMetadataPartitionReader.scala create mode 100644 spark/common/src/main/scala/org/apache/spark/sql/sedona_sql/io/geotiffmetadata/GeoTiffMetadataPartitionReaderFactory.scala create mode 100644 spark/common/src/main/scala/org/apache/spark/sql/sedona_sql/io/geotiffmetadata/GeoTiffMetadataScanBuilder.scala create mode 100644 spark/common/src/main/scala/org/apache/spark/sql/sedona_sql/io/geotiffmetadata/GeoTiffMetadataTable.scala create mode 100644 spark/common/src/test/scala/org/apache/sedona/sql/geotiffMetadataTest.scala diff --git a/spark/common/src/main/resources/META-INF/services/org.apache.spark.sql.sources.DataSourceRegister 
b/spark/common/src/main/resources/META-INF/services/org.apache.spark.sql.sources.DataSourceRegister index 10c362405c7..cb55d4b024f 100644 --- a/spark/common/src/main/resources/META-INF/services/org.apache.spark.sql.sources.DataSourceRegister +++ b/spark/common/src/main/resources/META-INF/services/org.apache.spark.sql.sources.DataSourceRegister @@ -4,3 +4,4 @@ org.apache.sedona.sql.datasources.spider.SpiderDataSource org.apache.spark.sql.sedona_sql.io.stac.StacDataSource org.apache.sedona.sql.datasources.osm.OsmPbfFormat org.apache.spark.sql.execution.datasources.geoparquet.GeoParquetFileFormat +org.apache.spark.sql.sedona_sql.io.geotiffmetadata.GeoTiffMetadataDataSource diff --git a/spark/common/src/main/scala/org/apache/spark/sql/sedona_sql/io/geotiffmetadata/GeoTiffMetadataDataSource.scala b/spark/common/src/main/scala/org/apache/spark/sql/sedona_sql/io/geotiffmetadata/GeoTiffMetadataDataSource.scala new file mode 100644 index 00000000000..d113dcb43bf --- /dev/null +++ b/spark/common/src/main/scala/org/apache/spark/sql/sedona_sql/io/geotiffmetadata/GeoTiffMetadataDataSource.scala @@ -0,0 +1,91 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, + * software distributed under the License is distributed on an + * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY + * KIND, either express or implied. See the License for the + * specific language governing permissions and limitations + * under the License. 
+ */ +package org.apache.spark.sql.sedona_sql.io.geotiffmetadata + +import org.apache.spark.sql.connector.catalog.Table +import org.apache.spark.sql.connector.catalog.TableProvider +import org.apache.spark.sql.execution.datasources.FileFormat +import org.apache.spark.sql.execution.datasources.v2.FileDataSourceV2 +import org.apache.spark.sql.sedona_sql.io.raster.RasterFileFormat +import org.apache.spark.sql.sources.DataSourceRegister +import org.apache.spark.sql.types.StructType +import org.apache.spark.sql.util.CaseInsensitiveStringMap + +import scala.collection.JavaConverters._ + +class GeoTiffMetadataDataSource + extends FileDataSourceV2 + with TableProvider + with DataSourceRegister { + + override def shortName(): String = "sedonainfo" + + private def createTable( + options: CaseInsensitiveStringMap, + userSchema: Option[StructType] = None): Table = { + var paths = getPaths(options) + var optionsWithoutPaths = getOptionsWithoutPaths(options) + val tableName = getTableName(options, paths) + + if (paths.size == 1) { + if (paths.head.endsWith("/")) { + val newOptions = + new java.util.HashMap[String, String](optionsWithoutPaths.asCaseSensitiveMap()) + newOptions.put("recursiveFileLookup", "true") + if (!newOptions.containsKey("pathGlobFilter")) { + newOptions.put("pathGlobFilter", "*.{tif,tiff,TIF,TIFF}") + } + optionsWithoutPaths = new CaseInsensitiveStringMap(newOptions) + } else { + val loadTifPattern = "(.*)/([^/]*\\*[^/]*\\.(?i:tif|tiff))$".r + paths.head match { + case loadTifPattern(prefix, glob) => + paths = Seq(prefix) + val newOptions = + new java.util.HashMap[String, String](optionsWithoutPaths.asCaseSensitiveMap()) + newOptions.put("pathGlobFilter", glob) + optionsWithoutPaths = new CaseInsensitiveStringMap(newOptions) + case _ => + } + } + } + + new GeoTiffMetadataTable( + tableName, + sparkSession, + optionsWithoutPaths, + paths, + userSchema, + fallbackFileFormat) + } + + override def getTable(options: CaseInsensitiveStringMap): Table = { + 
createTable(options) + } + + override def getTable(options: CaseInsensitiveStringMap, schema: StructType): Table = { + createTable(options, Some(schema)) + } + + override def inferSchema(options: CaseInsensitiveStringMap): StructType = { + GeoTiffMetadataTable.SCHEMA + } + + override def fallbackFileFormat: Class[_ <: FileFormat] = classOf[RasterFileFormat] +} diff --git a/spark/common/src/main/scala/org/apache/spark/sql/sedona_sql/io/geotiffmetadata/GeoTiffMetadataPartitionReader.scala b/spark/common/src/main/scala/org/apache/spark/sql/sedona_sql/io/geotiffmetadata/GeoTiffMetadataPartitionReader.scala new file mode 100644 index 00000000000..3674dd9f72f --- /dev/null +++ b/spark/common/src/main/scala/org/apache/spark/sql/sedona_sql/io/geotiffmetadata/GeoTiffMetadataPartitionReader.scala @@ -0,0 +1,306 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, + * software distributed under the License is distributed on an + * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY + * KIND, either express or implied. See the License for the + * specific language governing permissions and limitations + * under the License. 
+ */ +package org.apache.spark.sql.sedona_sql.io.geotiffmetadata + +import org.apache.hadoop.conf.Configuration +import org.apache.hadoop.fs.Path +import org.apache.sedona.common.raster.RasterAccessors +import org.apache.sedona.common.raster.RasterBandAccessors +import org.apache.sedona.common.raster.inputstream.HadoopImageInputStream +import org.apache.sedona.common.utils.RasterUtils +import org.apache.spark.sql.catalyst.InternalRow +import org.apache.spark.sql.catalyst.expressions.GenericInternalRow +import org.apache.spark.sql.catalyst.util.ArrayData +import org.apache.spark.sql.catalyst.util.GenericArrayData +import org.apache.spark.sql.connector.read.PartitionReader +import org.apache.spark.sql.execution.datasources.PartitionedFile +import org.apache.spark.sql.types.StructType +import org.apache.spark.unsafe.types.UTF8String +import org.geotools.coverage.grid.GridCoverage2D +import org.geotools.gce.geotiff.GeoTiffReader +import org.geotools.referencing.crs.DefaultEngineeringCRS + +import java.net.URI +import scala.collection.mutable +import scala.util.Try + +class GeoTiffMetadataPartitionReader( + configuration: Configuration, + partitionedFiles: Array[PartitionedFile], + readDataSchema: StructType) + extends PartitionReader[InternalRow] { + + private var currentFileIndex = 0 + private var currentRow: InternalRow = _ + private var hasNext_ = false + + override def next(): Boolean = { + if (currentFileIndex < partitionedFiles.length) { + currentRow = readFileMetadata(partitionedFiles(currentFileIndex)) + currentFileIndex += 1 + hasNext_ = true + true + } else { + hasNext_ = false + false + } + } + + override def get(): InternalRow = currentRow + + override def close(): Unit = {} + + private def readFileMetadata(partition: PartitionedFile): InternalRow = { + val path = new Path(new URI(partition.filePath.toString())) + val imageStream = new HadoopImageInputStream(path, configuration) + var reader: GeoTiffReader = null + var raster: GridCoverage2D = null + try { 
+ reader = new GeoTiffReader( + imageStream, + new org.geotools.util.factory.Hints( + org.geotools.util.factory.Hints.FORCE_LONGITUDE_FIRST_AXIS_ORDER, + java.lang.Boolean.TRUE)) + raster = reader.read(null) + + // Lazily compute metadata values only when requested + lazy val filePath = path.toString + lazy val fileSize = partition.fileSize + lazy val width = RasterAccessors.getWidth(raster) + lazy val height = RasterAccessors.getHeight(raster) + lazy val numBands = RasterAccessors.numBands(raster) + lazy val srid = RasterAccessors.srid(raster) + + lazy val crsStr = Try { + val crs = raster.getCoordinateReferenceSystem + if (crs == null || crs.isInstanceOf[DefaultEngineeringCRS]) null + else crs.toWKT + }.getOrElse(null) + + lazy val affine = RasterUtils.getGDALAffineTransform(raster) + + lazy val geoTransformRow = new GenericInternalRow( + Array[Any]( + affine.getTranslateX, + affine.getTranslateY, + affine.getScaleX, + affine.getScaleY, + affine.getShearX, + affine.getShearY)) + + lazy val env = raster.getEnvelope2D + lazy val cornerCoordinatesRow = new GenericInternalRow( + Array[Any](env.getMinX, env.getMinY, env.getMaxX, env.getMaxY)) + + lazy val image = raster.getRenderedImage + lazy val tileWidth = image.getTileWidth + lazy val tileHeight = image.getTileHeight + lazy val isTiled = tileWidth < width || tileHeight < height + + lazy val bandsArray = buildBandsArray(raster, numBands, tileWidth, tileHeight) + lazy val overviewsArray = buildOverviewsArray(reader, width, height) + lazy val metadataMap = buildMetadataMap(reader) + lazy val compression = extractCompression(reader) + + // Build row matching readDataSchema field order + val fields = readDataSchema.fieldNames.map { + case "path" => UTF8String.fromString(filePath) + case "driver" => UTF8String.fromString("GTiff") + case "fileSize" => fileSize: Any + case "width" => width: Any + case "height" => height: Any + case "numBands" => numBands: Any + case "srid" => srid: Any + case "crs" => + if (crsStr != 
null) UTF8String.fromString(crsStr) else null + case "geoTransform" => geoTransformRow + case "cornerCoordinates" => cornerCoordinatesRow + case "bands" => bandsArray + case "overviews" => overviewsArray + case "metadata" => metadataMap + case "isTiled" => isTiled: Any + case "compression" => + if (compression != null) UTF8String.fromString(compression) else null + case other => + throw new IllegalArgumentException(s"Unsupported field name: $other") + } + + new GenericInternalRow(fields) + } finally { + if (raster != null) raster.dispose(true) + if (reader != null) reader.dispose() + imageStream.close() + } + } + + private def buildBandsArray( + raster: GridCoverage2D, + numBands: Int, + tileWidth: Int, + tileHeight: Int): ArrayData = { + val bands = (1 to numBands).map { i => + val dataType = Try(RasterBandAccessors.getBandType(raster, i)).getOrElse(null) + val noDataValue = Try(RasterBandAccessors.getBandNoDataValue(raster, i)).getOrElse(null) + val description = Try { + val desc = raster.getSampleDimension(i - 1).getDescription + if (desc != null) desc.toString(java.util.Locale.ROOT) else null + }.getOrElse(null) + + // Color interpretation is typically stored in the band description + val colorInterp = description + + // Unit type from sample dimension + val unit = Try { + val units = raster.getSampleDimension(i - 1).getUnits + if (units != null) units.toString else null + }.getOrElse(null) + + new GenericInternalRow( + Array[Any]( + i, + if (dataType != null) UTF8String.fromString(dataType) else null, + if (colorInterp != null) UTF8String.fromString(colorInterp) else null, + if (noDataValue != null) noDataValue.doubleValue() else null, + tileWidth, + tileHeight, + if (description != null) UTF8String.fromString(description) else null, + if (unit != null) UTF8String.fromString(unit) else null)) + }.toArray + + new GenericArrayData(bands) + } + + private def buildOverviewsArray( + reader: GeoTiffReader, + fullWidth: Int, + fullHeight: Int): ArrayData = { + try { 
+ val resolutionLevels = reader.getResolutionLevels + if (resolutionLevels == null || resolutionLevels.length <= 1) { + new GenericArrayData(Array.empty[InternalRow]) + } else { + // Level 0 is full resolution; levels 1+ are overviews + val fullResX = resolutionLevels(0)(0) + val fullResY = resolutionLevels(0)(1) + val overviews = (1 until resolutionLevels.length).map { level => + val overviewResX = resolutionLevels(level)(0) + val overviewResY = resolutionLevels(level)(1) + val overviewWidth = Math.round(fullWidth.toDouble * fullResX / overviewResX).toInt + val overviewHeight = Math.round(fullHeight.toDouble * fullResY / overviewResY).toInt + new GenericInternalRow(Array[Any](level, overviewWidth, overviewHeight)) + }.toArray + new GenericArrayData(overviews) + } + } catch { + case _: Exception => new GenericArrayData(Array.empty[InternalRow]) + } + } + + private def buildMetadataMap( + reader: GeoTiffReader): org.apache.spark.sql.catalyst.util.MapData = { + try { + val metadata = reader.getMetadata + if (metadata == null) return null + + val rootNode = metadata.getRootNode + if (rootNode == null) return null + + val map = new mutable.LinkedHashMap[UTF8String, UTF8String]() + extractMetadataFromNode(rootNode, "", map) + + if (map.isEmpty) return null + + org.apache.spark.sql.catalyst.util.ArrayBasedMapData(map.keys.toArray, map.values.toArray) + } catch { + case _: Exception => null + } + } + + private def extractMetadataFromNode( + node: org.w3c.dom.Node, + prefix: String, + map: mutable.LinkedHashMap[UTF8String, UTF8String]): Unit = { + if (node == null) return + + // Extract attributes + val attrs = node.getAttributes + if (attrs != null) { + val nameAttr = attrs.getNamedItem("name") + val valueAttr = attrs.getNamedItem("value") + if (nameAttr != null && valueAttr != null) { + val key = + if (prefix.nonEmpty) s"$prefix.${nameAttr.getNodeValue}" + else nameAttr.getNodeValue + map.put(UTF8String.fromString(key), UTF8String.fromString(valueAttr.getNodeValue)) + } 
+ } + + // Recurse into children + val children = node.getChildNodes + if (children != null) { + val childPrefix = if (prefix.nonEmpty && node.getNodeName != "#document") { + s"$prefix.${node.getNodeName}" + } else if (node.getNodeName != "#document") { + node.getNodeName + } else { + prefix + } + for (i <- 0 until children.getLength) { + extractMetadataFromNode(children.item(i), childPrefix, map) + } + } + } + + private def extractCompression(reader: GeoTiffReader): String = { + try { + val metadata = reader.getMetadata + if (metadata == null) return null + + val rootNode = metadata.getRootNode + if (rootNode == null) return null + + findCompressionInNode(rootNode) + } catch { + case _: Exception => null + } + } + + private def findCompressionInNode(node: org.w3c.dom.Node): String = { + if (node == null) return null + + val attrs = node.getAttributes + if (attrs != null) { + val nameAttr = attrs.getNamedItem("name") + val valueAttr = attrs.getNamedItem("value") + if (nameAttr != null && valueAttr != null && + nameAttr.getNodeValue.equalsIgnoreCase("Compression")) { + return valueAttr.getNodeValue + } + } + + val children = node.getChildNodes + if (children != null) { + for (i <- 0 until children.getLength) { + val result = findCompressionInNode(children.item(i)) + if (result != null) return result + } + } + null + } +} diff --git a/spark/common/src/main/scala/org/apache/spark/sql/sedona_sql/io/geotiffmetadata/GeoTiffMetadataPartitionReaderFactory.scala b/spark/common/src/main/scala/org/apache/spark/sql/sedona_sql/io/geotiffmetadata/GeoTiffMetadataPartitionReaderFactory.scala new file mode 100644 index 00000000000..508e8c3cf33 --- /dev/null +++ b/spark/common/src/main/scala/org/apache/spark/sql/sedona_sql/io/geotiffmetadata/GeoTiffMetadataPartitionReaderFactory.scala @@ -0,0 +1,55 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. 
See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, + * software distributed under the License is distributed on an + * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY + * KIND, either express or implied. See the License for the + * specific language governing permissions and limitations + * under the License. + */ +package org.apache.spark.sql.sedona_sql.io.geotiffmetadata + +import org.apache.spark.broadcast.Broadcast +import org.apache.spark.sql.catalyst.InternalRow +import org.apache.spark.sql.connector.read.InputPartition +import org.apache.spark.sql.connector.read.PartitionReader +import org.apache.spark.sql.connector.read.PartitionReaderFactory +import org.apache.spark.sql.execution.datasources.v2.PartitionReaderWithPartitionValues +import org.apache.spark.sql.sedona_sql.io.raster.RasterInputPartition +import org.apache.spark.sql.types.StructType +import org.apache.spark.util.SerializableConfiguration + +case class GeoTiffMetadataPartitionReaderFactory( + broadcastedConf: Broadcast[SerializableConfiguration], + dataSchema: StructType, + readDataSchema: StructType, + partitionSchema: StructType) + extends PartitionReaderFactory { + + override def createReader(partition: InputPartition): PartitionReader[InternalRow] = { + partition match { + case filePartition: RasterInputPartition => + val fileReader = new GeoTiffMetadataPartitionReader( + broadcastedConf.value.value, + filePartition.files, + readDataSchema) + new PartitionReaderWithPartitionValues( + fileReader, + readDataSchema, + partitionSchema, + filePartition.files.head.partitionValues) + case _ => + throw new 
IllegalArgumentException( + s"Unexpected partition type: ${partition.getClass.getCanonicalName}") + } + } +} diff --git a/spark/common/src/main/scala/org/apache/spark/sql/sedona_sql/io/geotiffmetadata/GeoTiffMetadataScanBuilder.scala b/spark/common/src/main/scala/org/apache/spark/sql/sedona_sql/io/geotiffmetadata/GeoTiffMetadataScanBuilder.scala new file mode 100644 index 00000000000..0f422aee3a7 --- /dev/null +++ b/spark/common/src/main/scala/org/apache/spark/sql/sedona_sql/io/geotiffmetadata/GeoTiffMetadataScanBuilder.scala @@ -0,0 +1,135 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, + * software distributed under the License is distributed on an + * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY + * KIND, either express or implied. See the License for the + * specific language governing permissions and limitations + * under the License. 
+ */ +package org.apache.spark.sql.sedona_sql.io.geotiffmetadata + +import org.apache.spark.sql.SparkSession +import org.apache.spark.sql.catalyst.expressions.Expression +import org.apache.spark.sql.connector.read.Batch +import org.apache.spark.sql.connector.read.InputPartition +import org.apache.spark.sql.connector.read.PartitionReaderFactory +import org.apache.spark.sql.connector.read.Scan +import org.apache.spark.sql.connector.read.SupportsPushDownLimit +import org.apache.spark.sql.execution.datasources.PartitioningAwareFileIndex +import org.apache.spark.sql.execution.datasources.v2.FileScanBuilder +import org.apache.spark.sql.execution.datasources.FilePartition +import org.apache.spark.sql.execution.datasources.v2.FileScan +import org.apache.spark.sql.sedona_sql.io.raster.RasterInputPartition +import org.apache.spark.sql.sources.Filter +import org.apache.spark.sql.types.StructType +import org.apache.spark.sql.util.CaseInsensitiveStringMap +import org.apache.spark.util.SerializableConfiguration + +import scala.collection.JavaConverters._ + +case class GeoTiffMetadataScanBuilder( + sparkSession: SparkSession, + fileIndex: PartitioningAwareFileIndex, + schema: StructType, + dataSchema: StructType, + options: CaseInsensitiveStringMap) + extends FileScanBuilder(sparkSession, fileIndex, dataSchema) + with SupportsPushDownLimit { + + private var pushedLimit: Option[Int] = None + + override def build(): Scan = { + GeoTiffMetadataScan( + sparkSession, + fileIndex, + dataSchema, + readDataSchema(), + readPartitionSchema(), + options, + pushedDataFilters, + partitionFilters, + dataFilters, + pushedLimit) + } + + override def pushLimit(limit: Int): Boolean = { + pushedLimit = Some(limit) + true + } + + override def isPartiallyPushed: Boolean = false +} + +case class GeoTiffMetadataScan( + sparkSession: SparkSession, + fileIndex: PartitioningAwareFileIndex, + dataSchema: StructType, + readDataSchema: StructType, + readPartitionSchema: StructType, + options: 
CaseInsensitiveStringMap, + pushedFilters: Array[Filter], + partitionFilters: Seq[Expression] = Seq.empty, + dataFilters: Seq[Expression] = Seq.empty, + pushedLimit: Option[Int] = None) + extends FileScan + with Batch { + + private lazy val inputPartitions = { + var partitions = super.planInputPartitions() + + pushedLimit.foreach { limit => + var remaining = limit + partitions = partitions.iterator + .takeWhile(_ => remaining > 0) + .map { partition => + val filePartition = partition.asInstanceOf[FilePartition] + val files = filePartition.files + if (files.length <= remaining) { + remaining -= files.length + filePartition + } else { + val selectedFiles = files.take(remaining) + remaining = 0 + FilePartition(filePartition.index, selectedFiles) + } + } + .toArray + } + + partitions + } + + override def planInputPartitions(): Array[InputPartition] = { + inputPartitions.map { + case filePartition: FilePartition => + RasterInputPartition(filePartition.index, filePartition.files) + case partition => + throw new IllegalArgumentException( + s"Unexpected partition type: ${partition.getClass.getCanonicalName}") + } + } + + override def createReaderFactory(): PartitionReaderFactory = { + val hadoopConf = sparkSession.sessionState.newHadoopConfWithOptions(options.asScala.toMap) + val broadcastedConf = + sparkSession.sparkContext.broadcast(new SerializableConfiguration(hadoopConf)) + + GeoTiffMetadataPartitionReaderFactory( + broadcastedConf, + dataSchema, + readDataSchema, + readPartitionSchema) + } + + override def isSplitable(path: org.apache.hadoop.fs.Path): Boolean = false +} diff --git a/spark/common/src/main/scala/org/apache/spark/sql/sedona_sql/io/geotiffmetadata/GeoTiffMetadataTable.scala b/spark/common/src/main/scala/org/apache/spark/sql/sedona_sql/io/geotiffmetadata/GeoTiffMetadataTable.scala new file mode 100644 index 00000000000..a1c5e125d7f --- /dev/null +++ b/spark/common/src/main/scala/org/apache/spark/sql/sedona_sql/io/geotiffmetadata/GeoTiffMetadataTable.scala 
@@ -0,0 +1,114 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, + * software distributed under the License is distributed on an + * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY + * KIND, either express or implied. See the License for the + * specific language governing permissions and limitations + * under the License. + */ +package org.apache.spark.sql.sedona_sql.io.geotiffmetadata + +import org.apache.hadoop.fs.FileStatus +import org.apache.spark.sql.SparkSession +import org.apache.spark.sql.connector.catalog.SupportsRead +import org.apache.spark.sql.connector.catalog.SupportsWrite +import org.apache.spark.sql.connector.catalog.TableCapability +import org.apache.spark.sql.connector.read.ScanBuilder +import org.apache.spark.sql.connector.write.LogicalWriteInfo +import org.apache.spark.sql.connector.write.WriteBuilder +import org.apache.spark.sql.execution.datasources.FileFormat +import org.apache.spark.sql.execution.datasources.v2.FileTable +import org.apache.spark.sql.types._ +import org.apache.spark.sql.util.CaseInsensitiveStringMap + +import java.util.{Set => JSet} + +case class GeoTiffMetadataTable( + name: String, + sparkSession: SparkSession, + options: CaseInsensitiveStringMap, + paths: Seq[String], + userSpecifiedSchema: Option[StructType], + fallbackFileFormat: Class[_ <: FileFormat]) + extends FileTable(sparkSession, options, paths, userSpecifiedSchema) + with SupportsRead + with SupportsWrite { + + override def inferSchema(files: Seq[FileStatus]): 
Option[StructType] = + Some(GeoTiffMetadataTable.SCHEMA) + + override def formatName: String = "GeoTiffMetadata" + + override def capabilities(): JSet[TableCapability] = + java.util.EnumSet.of(TableCapability.BATCH_READ) + + override def newScanBuilder(options: CaseInsensitiveStringMap): ScanBuilder = { + GeoTiffMetadataScanBuilder(sparkSession, fileIndex, schema, dataSchema, options) + } + + override def newWriteBuilder(info: LogicalWriteInfo): WriteBuilder = null +} + +object GeoTiffMetadataTable { + + val GEO_TRANSFORM_TYPE: StructType = StructType( + Seq( + StructField("upperLeftX", DoubleType, nullable = false), + StructField("upperLeftY", DoubleType, nullable = false), + StructField("scaleX", DoubleType, nullable = false), + StructField("scaleY", DoubleType, nullable = false), + StructField("skewX", DoubleType, nullable = false), + StructField("skewY", DoubleType, nullable = false))) + + val CORNER_COORDINATES_TYPE: StructType = StructType( + Seq( + StructField("minX", DoubleType, nullable = false), + StructField("minY", DoubleType, nullable = false), + StructField("maxX", DoubleType, nullable = false), + StructField("maxY", DoubleType, nullable = false))) + + val BAND_TYPE: StructType = StructType( + Seq( + StructField("band", IntegerType, nullable = false), + StructField("dataType", StringType, nullable = true), + StructField("colorInterpretation", StringType, nullable = true), + StructField("noDataValue", DoubleType, nullable = true), + StructField("blockWidth", IntegerType, nullable = false), + StructField("blockHeight", IntegerType, nullable = false), + StructField("description", StringType, nullable = true), + StructField("unit", StringType, nullable = true))) + + val OVERVIEW_TYPE: StructType = StructType( + Seq( + StructField("level", IntegerType, nullable = false), + StructField("width", IntegerType, nullable = false), + StructField("height", IntegerType, nullable = false))) + + val SCHEMA: StructType = StructType( + Seq( + StructField("path", 
StringType, nullable = false), + StructField("driver", StringType, nullable = false), + StructField("fileSize", LongType, nullable = true), + StructField("width", IntegerType, nullable = false), + StructField("height", IntegerType, nullable = false), + StructField("numBands", IntegerType, nullable = false), + StructField("srid", IntegerType, nullable = false), + StructField("crs", StringType, nullable = true), + StructField("geoTransform", GEO_TRANSFORM_TYPE, nullable = false), + StructField("cornerCoordinates", CORNER_COORDINATES_TYPE, nullable = false), + StructField("bands", ArrayType(BAND_TYPE), nullable = true), + StructField("overviews", ArrayType(OVERVIEW_TYPE), nullable = true), + StructField("metadata", MapType(StringType, StringType), nullable = true), + StructField("isTiled", BooleanType, nullable = false), + StructField("compression", StringType, nullable = true))) +} diff --git a/spark/common/src/test/scala/org/apache/sedona/sql/geotiffMetadataTest.scala b/spark/common/src/test/scala/org/apache/sedona/sql/geotiffMetadataTest.scala new file mode 100644 index 00000000000..fc513239095 --- /dev/null +++ b/spark/common/src/test/scala/org/apache/sedona/sql/geotiffMetadataTest.scala @@ -0,0 +1,184 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, + * software distributed under the License is distributed on an + * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY + * KIND, either express or implied. 
See the License for the + * specific language governing permissions and limitations + * under the License. + */ +package org.apache.sedona.sql + +import org.scalatest.BeforeAndAfter + +class geotiffMetadataTest extends TestBaseScala with BeforeAndAfter { + + val rasterDir: String = resourceFolder + "raster/" + val singleFileLocation: String = resourceFolder + "raster/test1.tiff" + + describe("GeoTiff Metadata (sedonainfo) data source") { + + it("should read a single GeoTIFF file and return one row") { + val df = sparkSession.read.format("sedonainfo").load(singleFileLocation) + assert(df.count() == 1) + val row = df.first() + assert(row.getAs[String]("path").contains("test1.tiff")) + assert(row.getAs[String]("driver") == "GTiff") + assert(row.getAs[Int]("width") > 0) + assert(row.getAs[Int]("height") > 0) + assert(row.getAs[Int]("numBands") > 0) + } + + it("should read multiple GeoTIFF files via glob pattern") { + val df = sparkSession.read.format("sedonainfo").load(rasterDir + "*.tiff") + assert(df.count() > 1) + } + + it("should read GeoTIFF files from directory with trailing slash") { + val df = sparkSession.read.format("sedonainfo").load(rasterDir) + assert(df.count() > 1) + } + + it("should return correct schema") { + val df = sparkSession.read.format("sedonainfo").load(singleFileLocation) + val schema = df.schema + assert(schema.fieldNames.contains("path")) + assert(schema.fieldNames.contains("driver")) + assert(schema.fieldNames.contains("fileSize")) + assert(schema.fieldNames.contains("width")) + assert(schema.fieldNames.contains("height")) + assert(schema.fieldNames.contains("numBands")) + assert(schema.fieldNames.contains("srid")) + assert(schema.fieldNames.contains("crs")) + assert(schema.fieldNames.contains("geoTransform")) + assert(schema.fieldNames.contains("cornerCoordinates")) + assert(schema.fieldNames.contains("bands")) + assert(schema.fieldNames.contains("overviews")) + assert(schema.fieldNames.contains("metadata")) + 
assert(schema.fieldNames.contains("isTiled")) + assert(schema.fieldNames.contains("compression")) + } + + it("should return correct geoTransform struct") { + val df = sparkSession.read.format("sedonainfo").load(singleFileLocation) + val row = df + .selectExpr( + "geoTransform.upperLeftX", + "geoTransform.upperLeftY", + "geoTransform.scaleX", + "geoTransform.scaleY", + "geoTransform.skewX", + "geoTransform.skewY") + .first() + // scaleX should be positive, scaleY should be negative (north-up convention) + assert(row.getAs[Double]("scaleX") != 0.0) + assert(row.getAs[Double]("scaleY") != 0.0) + } + + it("should return correct cornerCoordinates struct") { + val df = sparkSession.read.format("sedonainfo").load(singleFileLocation) + val row = df + .selectExpr( + "cornerCoordinates.minX", + "cornerCoordinates.minY", + "cornerCoordinates.maxX", + "cornerCoordinates.maxY") + .first() + assert(row.getAs[Double]("maxX") > row.getAs[Double]("minX")) + assert(row.getAs[Double]("maxY") > row.getAs[Double]("minY")) + } + + it("should return correct bands array") { + val df = sparkSession.read.format("sedonainfo").load(singleFileLocation) + val row = df.first() + val numBands = row.getAs[Int]("numBands") + val bands = row.getAs[Seq[Any]]("bands") + assert(bands != null) + assert(bands.size == numBands) + } + + it("should return band metadata with correct fields") { + val df = sparkSession.read + .format("sedonainfo") + .load(singleFileLocation) + .selectExpr("explode(bands) as band") + .selectExpr("band.band", "band.dataType", "band.blockWidth", "band.blockHeight") + val row = df.first() + assert(row.getAs[Int]("band") == 1) + assert(row.getAs[String]("dataType") != null) + assert(row.getAs[Int]("blockWidth") > 0) + assert(row.getAs[Int]("blockHeight") > 0) + } + + it("should cross-validate metadata against raster data source") { + val metaDf = sparkSession.read.format("sedonainfo").load(singleFileLocation) + val rasterDf = sparkSession.read + .format("raster") + 
.option("retile", "false") + .load(singleFileLocation) + .selectExpr( + "RS_Width(rast) as width", + "RS_Height(rast) as height", + "RS_NumBands(rast) as numBands", + "RS_SRID(rast) as srid") + + val metaRow = metaDf.first() + val rasterRow = rasterDf.first() + assert(metaRow.getAs[Int]("width") == rasterRow.getAs[Int]("width")) + assert(metaRow.getAs[Int]("height") == rasterRow.getAs[Int]("height")) + assert(metaRow.getAs[Int]("numBands") == rasterRow.getAs[Int]("numBands")) + assert(metaRow.getAs[Int]("srid") == rasterRow.getAs[Int]("srid")) + } + + it("should support LIMIT pushdown") { + val df = sparkSession.read.format("sedonainfo").load(rasterDir) + val totalCount = df.count() + assert(totalCount > 2, "Need at least 3 files for this test") + val limitedDf = df.limit(2) + assert(limitedDf.count() == 2) + } + + it("should support column selection") { + val df = sparkSession.read + .format("sedonainfo") + .load(singleFileLocation) + .select("path", "width", "height") + assert(df.schema.fieldNames.length == 3) + assert(df.count() == 1) + } + + it("should report isTiled correctly") { + val df = sparkSession.read.format("sedonainfo").load(singleFileLocation) + val row = df.first() + // isTiled should be a boolean + val isTiled = row.getAs[Boolean]("isTiled") + val width = row.getAs[Int]("width") + // If the raster is small, it's likely not tiled (strip-based) + assert(isTiled == true || isTiled == false) + } + + it("should return fileSize") { + val df = sparkSession.read.format("sedonainfo").load(singleFileLocation) + val row = df.first() + val fileSize = row.getAs[Long]("fileSize") + assert(fileSize > 0) + } + + it("should return overviews array") { + val df = sparkSession.read.format("sedonainfo").load(singleFileLocation) + val row = df.first() + // Test rasters likely don't have overviews, so empty array is expected + val overviews = row.getAs[Seq[Any]]("overviews") + assert(overviews != null) + } + } +} From a3666a37e7e021f9ecd325061d553fcdcf3eb7e5 Mon Sep 17 
00:00:00 2001 From: Jia Yu Date: Thu, 2 Apr 2026 01:10:59 -0700 Subject: [PATCH 02/10] [GH-2824] Refactor sedonainfo: rename package, decouple GeoTIFF logic, add docs - Rename package from io.geotiffmetadata to io.sedonainfo - Extract RasterFileMetadataExtractor trait for format-agnostic design - Move GeoTIFF-specific logic into GeoTiffMetadataExtractor - SedonaInfoPartitionReader delegates to format extractors via canHandle() dispatch, making it easy to add new formats - Add documentation page for the sedonainfo data source - Register in mkdocs.yml navigation --- .../tutorial/files/sedonainfo-sedona-spark.md | 194 +++++++++++ mkdocs.yml | 1 + ...pache.spark.sql.sources.DataSourceRegister | 2 +- .../GeoTiffMetadataPartitionReader.scala | 306 ------------------ .../sedonainfo/GeoTiffMetadataExtractor.scala | 256 +++++++++++++++ .../RasterFileMetadataExtractor.scala | 81 +++++ .../SedonaInfoDataSource.scala} | 16 +- .../SedonaInfoPartitionReader.scala | 153 +++++++++ .../SedonaInfoPartitionReaderFactory.scala} | 6 +- .../SedonaInfoScanBuilder.scala} | 10 +- .../SedonaInfoTable.scala} | 12 +- 11 files changed, 709 insertions(+), 328 deletions(-) create mode 100644 docs/tutorial/files/sedonainfo-sedona-spark.md delete mode 100644 spark/common/src/main/scala/org/apache/spark/sql/sedona_sql/io/geotiffmetadata/GeoTiffMetadataPartitionReader.scala create mode 100644 spark/common/src/main/scala/org/apache/spark/sql/sedona_sql/io/sedonainfo/GeoTiffMetadataExtractor.scala create mode 100644 spark/common/src/main/scala/org/apache/spark/sql/sedona_sql/io/sedonainfo/RasterFileMetadataExtractor.scala rename spark/common/src/main/scala/org/apache/spark/sql/sedona_sql/io/{geotiffmetadata/GeoTiffMetadataDataSource.scala => sedonainfo/SedonaInfoDataSource.scala} (87%) create mode 100644 spark/common/src/main/scala/org/apache/spark/sql/sedona_sql/io/sedonainfo/SedonaInfoPartitionReader.scala rename 
spark/common/src/main/scala/org/apache/spark/sql/sedona_sql/io/{geotiffmetadata/GeoTiffMetadataPartitionReaderFactory.scala => sedonainfo/SedonaInfoPartitionReaderFactory.scala} (92%) rename spark/common/src/main/scala/org/apache/spark/sql/sedona_sql/io/{geotiffmetadata/GeoTiffMetadataScanBuilder.scala => sedonainfo/SedonaInfoScanBuilder.scala} (95%) rename spark/common/src/main/scala/org/apache/spark/sql/sedona_sql/io/{geotiffmetadata/GeoTiffMetadataTable.scala => sedonainfo/SedonaInfoTable.scala} (94%) diff --git a/docs/tutorial/files/sedonainfo-sedona-spark.md b/docs/tutorial/files/sedonainfo-sedona-spark.md new file mode 100644 index 00000000000..13e7a827f17 --- /dev/null +++ b/docs/tutorial/files/sedonainfo-sedona-spark.md @@ -0,0 +1,194 @@ + + +# SedonaInfo - Raster File Metadata + +SedonaInfo is a Spark data source that reads raster file metadata without decoding pixel data, similar to [gdalinfo](https://gdal.org/en/stable/programs/gdalinfo.html). It returns one row per file with metadata including dimensions, coordinate system, band information, tiling, overviews, and compression. + +This is useful for: + +* Cataloging and inventorying large collections of raster files +* Detecting Cloud Optimized GeoTIFFs (COGs) by checking tiling and overview status +* Inspecting file properties before loading full raster data +* Building spatial indexes over raster file collections + +Currently supports **GeoTIFF** files. Additional formats can be added in the future. 
+ +## Read GeoTIFF metadata + +=== "Scala" + + ```scala + val df = sedona.read.format("sedonainfo").load("/path/to/rasters/") + df.show() + ``` + +=== "Java" + + ```java + Dataset<Row> df = sedona.read().format("sedonainfo").load("/path/to/rasters/"); + df.show(); + ``` + +=== "Python" + + ```python + df = sedona.read.format("sedonainfo").load("/path/to/rasters/") + df.show() + ``` + +You can also use glob patterns: + +```python +df = sedona.read.format("sedonainfo").load("/path/to/rasters/*.tif") +``` + +Or load a single file: + +```python +df = sedona.read.format("sedonainfo").load("/path/to/image.tiff") +``` + +## Output schema + +Each row represents one raster file with the following columns: + +| Column | Type | Description | +|--------|------|-------------| +| `path` | String | File path | +| `driver` | String | Format driver (e.g., "GTiff") | +| `fileSize` | Long | File size in bytes | +| `width` | Int | Image width in pixels | +| `height` | Int | Image height in pixels | +| `numBands` | Int | Number of bands | +| `srid` | Int | EPSG code (0 if unknown) | +| `crs` | String | Coordinate Reference System as WKT | +| `geoTransform` | Struct | Affine transform parameters | +| `cornerCoordinates` | Struct | Bounding box | +| `bands` | Array[Struct] | Per-band metadata | +| `overviews` | Array[Struct] | Overview (pyramid) levels | +| `metadata` | Map[String, String] | File-wide TIFF metadata tags | +| `isTiled` | Boolean | Whether the file uses internal tiling | +| `compression` | String | Compression type (e.g., "Deflate") | + +### geoTransform struct + +| Field | Type | Description | +|-------|------|-------------| +| `upperLeftX` | Double | Origin X in world coordinates | +| `upperLeftY` | Double | Origin Y in world coordinates | +| `scaleX` | Double | Pixel size in X direction | +| `scaleY` | Double | Pixel size in Y direction | +| `skewX` | Double | Rotation/shear in X | +| `skewY` | Double | Rotation/shear in Y | + +### cornerCoordinates struct + +| Field | Type
| Description | +|-------|------|-------------| +| `minX` | Double | Minimum X (west) | +| `minY` | Double | Minimum Y (south) | +| `maxX` | Double | Maximum X (east) | +| `maxY` | Double | Maximum Y (north) | + +### bands array element + +| Field | Type | Description | +|-------|------|-------------| +| `band` | Int | Band number (1-indexed) | +| `dataType` | String | Data type (e.g., "REAL_32BITS") | +| `colorInterpretation` | String | Color interpretation (e.g., "Gray") | +| `noDataValue` | Double | NoData value (null if not set) | +| `blockWidth` | Int | Internal tile/block width | +| `blockHeight` | Int | Internal tile/block height | +| `description` | String | Band description | +| `unit` | String | Unit type (e.g., "meters") | + +### overviews array element + +| Field | Type | Description | +|-------|------|-------------| +| `level` | Int | Overview level (1, 2, 3, ...) | +| `width` | Int | Overview width in pixels | +| `height` | Int | Overview height in pixels | + +## Examples + +### Detect Cloud Optimized GeoTIFFs (COGs) + +A COG is a GeoTIFF that is internally tiled and has overview levels: + +```python +df = sedona.read.format("sedonainfo").load("/path/to/rasters/") +cogs = df.filter("isTiled AND size(overviews) > 0") +cogs.select("path", "compression", "overviews").show(truncate=False) +``` + +### Inspect band information + +```python +df = sedona.read.format("sedonainfo").load("/path/to/image.tif") +df.selectExpr("path", "explode(bands) as band").selectExpr( + "path", + "band.band", + "band.dataType", + "band.noDataValue", + "band.blockWidth", + "band.blockHeight", +).show() +``` + +### Filter by spatial extent + +```python +df = sedona.read.format("sedonainfo").load("/path/to/rasters/") +df.filter("cornerCoordinates.minX > -120 AND cornerCoordinates.maxX < -100").select( + "path", "width", "height", "srid" +).show() +``` + +### Get overview details + +```python +df = sedona.read.format("sedonainfo").load("/path/to/image.tif") +df.selectExpr("path", 
"explode(overviews) as ovr").selectExpr( + "path", "ovr.level", "ovr.width", "ovr.height" +).show() +``` + +### Column pruning for performance + +Select only the columns you need. SedonaInfo uses column pruning to skip extracting unused metadata: + +```python +df = ( + sedona.read.format("sedonainfo") + .load("/path/to/rasters/") + .select("path", "width", "height", "numBands") +) +df.show() +``` + +## Supported formats + +| Format | Driver | Extensions | +|--------|--------|------------| +| GeoTIFF | GTiff | `.tif`, `.tiff` | + +Additional formats may be added in future releases. diff --git a/mkdocs.yml b/mkdocs.yml index 27f3e621061..c2d03698738 100644 --- a/mkdocs.yml +++ b/mkdocs.yml @@ -56,6 +56,7 @@ nav: - GeoParquet: tutorial/files/geoparquet-sedona-spark.md - GeoJSON: tutorial/files/geojson-sedona-spark.md - Shapefiles: tutorial/files/shapefiles-sedona-spark.md + - Raster metadata (SedonaInfo): tutorial/files/sedonainfo-sedona-spark.md - STAC catalog: tutorial/files/stac-sedona-spark.md - Concepts: - Spatial Joins: tutorial/concepts/spatial-joins.md diff --git a/spark/common/src/main/resources/META-INF/services/org.apache.spark.sql.sources.DataSourceRegister b/spark/common/src/main/resources/META-INF/services/org.apache.spark.sql.sources.DataSourceRegister index cb55d4b024f..f2465ab4562 100644 --- a/spark/common/src/main/resources/META-INF/services/org.apache.spark.sql.sources.DataSourceRegister +++ b/spark/common/src/main/resources/META-INF/services/org.apache.spark.sql.sources.DataSourceRegister @@ -4,4 +4,4 @@ org.apache.sedona.sql.datasources.spider.SpiderDataSource org.apache.spark.sql.sedona_sql.io.stac.StacDataSource org.apache.sedona.sql.datasources.osm.OsmPbfFormat org.apache.spark.sql.execution.datasources.geoparquet.GeoParquetFileFormat -org.apache.spark.sql.sedona_sql.io.geotiffmetadata.GeoTiffMetadataDataSource +org.apache.spark.sql.sedona_sql.io.sedonainfo.SedonaInfoDataSource diff --git 
a/spark/common/src/main/scala/org/apache/spark/sql/sedona_sql/io/geotiffmetadata/GeoTiffMetadataPartitionReader.scala b/spark/common/src/main/scala/org/apache/spark/sql/sedona_sql/io/geotiffmetadata/GeoTiffMetadataPartitionReader.scala deleted file mode 100644 index 3674dd9f72f..00000000000 --- a/spark/common/src/main/scala/org/apache/spark/sql/sedona_sql/io/geotiffmetadata/GeoTiffMetadataPartitionReader.scala +++ /dev/null @@ -1,306 +0,0 @@ -/* - * Licensed to the Apache Software Foundation (ASF) under one - * or more contributor license agreements. See the NOTICE file - * distributed with this work for additional information - * regarding copyright ownership. The ASF licenses this file - * to you under the Apache License, Version 2.0 (the - * "License"); you may not use this file except in compliance - * with the License. You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, - * software distributed under the License is distributed on an - * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY - * KIND, either express or implied. See the License for the - * specific language governing permissions and limitations - * under the License. 
- */ -package org.apache.spark.sql.sedona_sql.io.geotiffmetadata - -import org.apache.hadoop.conf.Configuration -import org.apache.hadoop.fs.Path -import org.apache.sedona.common.raster.RasterAccessors -import org.apache.sedona.common.raster.RasterBandAccessors -import org.apache.sedona.common.raster.inputstream.HadoopImageInputStream -import org.apache.sedona.common.utils.RasterUtils -import org.apache.spark.sql.catalyst.InternalRow -import org.apache.spark.sql.catalyst.expressions.GenericInternalRow -import org.apache.spark.sql.catalyst.util.ArrayData -import org.apache.spark.sql.catalyst.util.GenericArrayData -import org.apache.spark.sql.connector.read.PartitionReader -import org.apache.spark.sql.execution.datasources.PartitionedFile -import org.apache.spark.sql.types.StructType -import org.apache.spark.unsafe.types.UTF8String -import org.geotools.coverage.grid.GridCoverage2D -import org.geotools.gce.geotiff.GeoTiffReader -import org.geotools.referencing.crs.DefaultEngineeringCRS - -import java.net.URI -import scala.collection.mutable -import scala.util.Try - -class GeoTiffMetadataPartitionReader( - configuration: Configuration, - partitionedFiles: Array[PartitionedFile], - readDataSchema: StructType) - extends PartitionReader[InternalRow] { - - private var currentFileIndex = 0 - private var currentRow: InternalRow = _ - private var hasNext_ = false - - override def next(): Boolean = { - if (currentFileIndex < partitionedFiles.length) { - currentRow = readFileMetadata(partitionedFiles(currentFileIndex)) - currentFileIndex += 1 - hasNext_ = true - true - } else { - hasNext_ = false - false - } - } - - override def get(): InternalRow = currentRow - - override def close(): Unit = {} - - private def readFileMetadata(partition: PartitionedFile): InternalRow = { - val path = new Path(new URI(partition.filePath.toString())) - val imageStream = new HadoopImageInputStream(path, configuration) - var reader: GeoTiffReader = null - var raster: GridCoverage2D = null - try { 
- reader = new GeoTiffReader( - imageStream, - new org.geotools.util.factory.Hints( - org.geotools.util.factory.Hints.FORCE_LONGITUDE_FIRST_AXIS_ORDER, - java.lang.Boolean.TRUE)) - raster = reader.read(null) - - // Lazily compute metadata values only when requested - lazy val filePath = path.toString - lazy val fileSize = partition.fileSize - lazy val width = RasterAccessors.getWidth(raster) - lazy val height = RasterAccessors.getHeight(raster) - lazy val numBands = RasterAccessors.numBands(raster) - lazy val srid = RasterAccessors.srid(raster) - - lazy val crsStr = Try { - val crs = raster.getCoordinateReferenceSystem - if (crs == null || crs.isInstanceOf[DefaultEngineeringCRS]) null - else crs.toWKT - }.getOrElse(null) - - lazy val affine = RasterUtils.getGDALAffineTransform(raster) - - lazy val geoTransformRow = new GenericInternalRow( - Array[Any]( - affine.getTranslateX, - affine.getTranslateY, - affine.getScaleX, - affine.getScaleY, - affine.getShearX, - affine.getShearY)) - - lazy val env = raster.getEnvelope2D - lazy val cornerCoordinatesRow = new GenericInternalRow( - Array[Any](env.getMinX, env.getMinY, env.getMaxX, env.getMaxY)) - - lazy val image = raster.getRenderedImage - lazy val tileWidth = image.getTileWidth - lazy val tileHeight = image.getTileHeight - lazy val isTiled = tileWidth < width || tileHeight < height - - lazy val bandsArray = buildBandsArray(raster, numBands, tileWidth, tileHeight) - lazy val overviewsArray = buildOverviewsArray(reader, width, height) - lazy val metadataMap = buildMetadataMap(reader) - lazy val compression = extractCompression(reader) - - // Build row matching readDataSchema field order - val fields = readDataSchema.fieldNames.map { - case "path" => UTF8String.fromString(filePath) - case "driver" => UTF8String.fromString("GTiff") - case "fileSize" => fileSize: Any - case "width" => width: Any - case "height" => height: Any - case "numBands" => numBands: Any - case "srid" => srid: Any - case "crs" => - if (crsStr != 
null) UTF8String.fromString(crsStr) else null - case "geoTransform" => geoTransformRow - case "cornerCoordinates" => cornerCoordinatesRow - case "bands" => bandsArray - case "overviews" => overviewsArray - case "metadata" => metadataMap - case "isTiled" => isTiled: Any - case "compression" => - if (compression != null) UTF8String.fromString(compression) else null - case other => - throw new IllegalArgumentException(s"Unsupported field name: $other") - } - - new GenericInternalRow(fields) - } finally { - if (raster != null) raster.dispose(true) - if (reader != null) reader.dispose() - imageStream.close() - } - } - - private def buildBandsArray( - raster: GridCoverage2D, - numBands: Int, - tileWidth: Int, - tileHeight: Int): ArrayData = { - val bands = (1 to numBands).map { i => - val dataType = Try(RasterBandAccessors.getBandType(raster, i)).getOrElse(null) - val noDataValue = Try(RasterBandAccessors.getBandNoDataValue(raster, i)).getOrElse(null) - val description = Try { - val desc = raster.getSampleDimension(i - 1).getDescription - if (desc != null) desc.toString(java.util.Locale.ROOT) else null - }.getOrElse(null) - - // Color interpretation is typically stored in the band description - val colorInterp = description - - // Unit type from sample dimension - val unit = Try { - val units = raster.getSampleDimension(i - 1).getUnits - if (units != null) units.toString else null - }.getOrElse(null) - - new GenericInternalRow( - Array[Any]( - i, - if (dataType != null) UTF8String.fromString(dataType) else null, - if (colorInterp != null) UTF8String.fromString(colorInterp) else null, - if (noDataValue != null) noDataValue.doubleValue() else null, - tileWidth, - tileHeight, - if (description != null) UTF8String.fromString(description) else null, - if (unit != null) UTF8String.fromString(unit) else null)) - }.toArray - - new GenericArrayData(bands) - } - - private def buildOverviewsArray( - reader: GeoTiffReader, - fullWidth: Int, - fullHeight: Int): ArrayData = { - try { 
- val resolutionLevels = reader.getResolutionLevels - if (resolutionLevels == null || resolutionLevels.length <= 1) { - new GenericArrayData(Array.empty[InternalRow]) - } else { - // Level 0 is full resolution; levels 1+ are overviews - val fullResX = resolutionLevels(0)(0) - val fullResY = resolutionLevels(0)(1) - val overviews = (1 until resolutionLevels.length).map { level => - val overviewResX = resolutionLevels(level)(0) - val overviewResY = resolutionLevels(level)(1) - val overviewWidth = Math.round(fullWidth.toDouble * fullResX / overviewResX).toInt - val overviewHeight = Math.round(fullHeight.toDouble * fullResY / overviewResY).toInt - new GenericInternalRow(Array[Any](level, overviewWidth, overviewHeight)) - }.toArray - new GenericArrayData(overviews) - } - } catch { - case _: Exception => new GenericArrayData(Array.empty[InternalRow]) - } - } - - private def buildMetadataMap( - reader: GeoTiffReader): org.apache.spark.sql.catalyst.util.MapData = { - try { - val metadata = reader.getMetadata - if (metadata == null) return null - - val rootNode = metadata.getRootNode - if (rootNode == null) return null - - val map = new mutable.LinkedHashMap[UTF8String, UTF8String]() - extractMetadataFromNode(rootNode, "", map) - - if (map.isEmpty) return null - - org.apache.spark.sql.catalyst.util.ArrayBasedMapData(map.keys.toArray, map.values.toArray) - } catch { - case _: Exception => null - } - } - - private def extractMetadataFromNode( - node: org.w3c.dom.Node, - prefix: String, - map: mutable.LinkedHashMap[UTF8String, UTF8String]): Unit = { - if (node == null) return - - // Extract attributes - val attrs = node.getAttributes - if (attrs != null) { - val nameAttr = attrs.getNamedItem("name") - val valueAttr = attrs.getNamedItem("value") - if (nameAttr != null && valueAttr != null) { - val key = - if (prefix.nonEmpty) s"$prefix.${nameAttr.getNodeValue}" - else nameAttr.getNodeValue - map.put(UTF8String.fromString(key), UTF8String.fromString(valueAttr.getNodeValue)) - } 
- } - - // Recurse into children - val children = node.getChildNodes - if (children != null) { - val childPrefix = if (prefix.nonEmpty && node.getNodeName != "#document") { - s"$prefix.${node.getNodeName}" - } else if (node.getNodeName != "#document") { - node.getNodeName - } else { - prefix - } - for (i <- 0 until children.getLength) { - extractMetadataFromNode(children.item(i), childPrefix, map) - } - } - } - - private def extractCompression(reader: GeoTiffReader): String = { - try { - val metadata = reader.getMetadata - if (metadata == null) return null - - val rootNode = metadata.getRootNode - if (rootNode == null) return null - - findCompressionInNode(rootNode) - } catch { - case _: Exception => null - } - } - - private def findCompressionInNode(node: org.w3c.dom.Node): String = { - if (node == null) return null - - val attrs = node.getAttributes - if (attrs != null) { - val nameAttr = attrs.getNamedItem("name") - val valueAttr = attrs.getNamedItem("value") - if (nameAttr != null && valueAttr != null && - nameAttr.getNodeValue.equalsIgnoreCase("Compression")) { - return valueAttr.getNodeValue - } - } - - val children = node.getChildNodes - if (children != null) { - for (i <- 0 until children.getLength) { - val result = findCompressionInNode(children.item(i)) - if (result != null) return result - } - } - null - } -} diff --git a/spark/common/src/main/scala/org/apache/spark/sql/sedona_sql/io/sedonainfo/GeoTiffMetadataExtractor.scala b/spark/common/src/main/scala/org/apache/spark/sql/sedona_sql/io/sedonainfo/GeoTiffMetadataExtractor.scala new file mode 100644 index 00000000000..b60c0b605a3 --- /dev/null +++ b/spark/common/src/main/scala/org/apache/spark/sql/sedona_sql/io/sedonainfo/GeoTiffMetadataExtractor.scala @@ -0,0 +1,256 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. 
The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, + * software distributed under the License is distributed on an + * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY + * KIND, either express or implied. See the License for the + * specific language governing permissions and limitations + * under the License. + */ +package org.apache.spark.sql.sedona_sql.io.sedonainfo + +import org.apache.hadoop.conf.Configuration +import org.apache.hadoop.fs.Path +import org.apache.sedona.common.raster.RasterAccessors +import org.apache.sedona.common.raster.RasterBandAccessors +import org.apache.sedona.common.raster.inputstream.HadoopImageInputStream +import org.apache.sedona.common.utils.RasterUtils +import org.geotools.coverage.grid.GridCoverage2D +import org.geotools.gce.geotiff.GeoTiffReader +import org.geotools.referencing.crs.DefaultEngineeringCRS + +import scala.collection.mutable +import scala.util.Try + +/** + * Extracts metadata from GeoTIFF files without decoding pixel data. Uses GeoTools GeoTiffReader + * which lazily decodes the RenderedImage. 
+ */ +object GeoTiffMetadataExtractor extends RasterFileMetadataExtractor { + + override def driver: String = "GTiff" + + override def canHandle(path: Path): Boolean = { + val name = path.getName.toLowerCase + name.endsWith(".tif") || name.endsWith(".tiff") + } + + override def extract( + path: Path, + fileSize: Long, + configuration: Configuration): RasterFileMetadata = { + val imageStream = new HadoopImageInputStream(path, configuration) + var reader: GeoTiffReader = null + var raster: GridCoverage2D = null + try { + reader = new GeoTiffReader( + imageStream, + new org.geotools.util.factory.Hints( + org.geotools.util.factory.Hints.FORCE_LONGITUDE_FIRST_AXIS_ORDER, + java.lang.Boolean.TRUE)) + raster = reader.read(null) + + val width = RasterAccessors.getWidth(raster) + val height = RasterAccessors.getHeight(raster) + val numBands = RasterAccessors.numBands(raster) + val srid = RasterAccessors.srid(raster) + + val crsStr = Try { + val crs = raster.getCoordinateReferenceSystem + if (crs == null || crs.isInstanceOf[DefaultEngineeringCRS]) null + else crs.toWKT + }.getOrElse(null) + + val affine = RasterUtils.getGDALAffineTransform(raster) + val env = raster.getEnvelope2D + + val image = raster.getRenderedImage + val tileWidth = image.getTileWidth + val tileHeight = image.getTileHeight + val isTiled = tileWidth < width || tileHeight < height + + val bands = extractBands(raster, numBands, tileWidth, tileHeight) + val overviews = extractOverviews(reader, width, height) + val metadata = extractMetadata(reader) + val compression = extractCompression(reader) + + RasterFileMetadata( + path = path.toString, + driver = driver, + fileSize = fileSize, + width = width, + height = height, + numBands = numBands, + srid = srid, + crs = crsStr, + upperLeftX = affine.getTranslateX, + upperLeftY = affine.getTranslateY, + scaleX = affine.getScaleX, + scaleY = affine.getScaleY, + skewX = affine.getShearX, + skewY = affine.getShearY, + envelopeMinX = env.getMinX, + envelopeMinY = 
env.getMinY, + envelopeMaxX = env.getMaxX, + envelopeMaxY = env.getMaxY, + bands = bands, + overviews = overviews, + metadata = metadata, + isTiled = isTiled, + compression = compression) + } finally { + if (raster != null) raster.dispose(true) + if (reader != null) reader.dispose() + imageStream.close() + } + } + + private def extractBands( + raster: GridCoverage2D, + numBands: Int, + tileWidth: Int, + tileHeight: Int): Seq[BandMetadata] = { + (1 to numBands).map { i => + val dataType = Try(RasterBandAccessors.getBandType(raster, i)).getOrElse(null) + val noDataValue = Try(RasterBandAccessors.getBandNoDataValue(raster, i)).getOrElse(null) + val description = Try { + val desc = raster.getSampleDimension(i - 1).getDescription + if (desc != null) desc.toString(java.util.Locale.ROOT) else null + }.getOrElse(null) + val unit = Try { + val units = raster.getSampleDimension(i - 1).getUnits + if (units != null) units.toString else null + }.getOrElse(null) + + BandMetadata( + band = i, + dataType = dataType, + colorInterpretation = description, + noDataValue = noDataValue, + blockWidth = tileWidth, + blockHeight = tileHeight, + description = description, + unit = unit) + } + } + + private def extractOverviews( + reader: GeoTiffReader, + fullWidth: Int, + fullHeight: Int): Seq[OverviewMetadata] = { + try { + val resolutionLevels = reader.getResolutionLevels + if (resolutionLevels == null || resolutionLevels.length <= 1) { + Seq.empty + } else { + val fullResX = resolutionLevels(0)(0) + val fullResY = resolutionLevels(0)(1) + (1 until resolutionLevels.length).map { level => + val overviewResX = resolutionLevels(level)(0) + val overviewResY = resolutionLevels(level)(1) + OverviewMetadata( + level = level, + width = Math.round(fullWidth.toDouble * fullResX / overviewResX).toInt, + height = Math.round(fullHeight.toDouble * fullResY / overviewResY).toInt) + } + } + } catch { + case _: Exception => Seq.empty + } + } + + private def extractMetadata(reader: GeoTiffReader): 
Map[String, String] = { + try { + val metadata = reader.getMetadata + if (metadata == null) return Map.empty + + val rootNode = metadata.getRootNode + if (rootNode == null) return Map.empty + + val map = new mutable.LinkedHashMap[String, String]() + extractMetadataFromNode(rootNode, "", map) + map.toMap + } catch { + case _: Exception => Map.empty + } + } + + private def extractMetadataFromNode( + node: org.w3c.dom.Node, + prefix: String, + map: mutable.LinkedHashMap[String, String]): Unit = { + if (node == null) return + + val attrs = node.getAttributes + if (attrs != null) { + val nameAttr = attrs.getNamedItem("name") + val valueAttr = attrs.getNamedItem("value") + if (nameAttr != null && valueAttr != null) { + val key = + if (prefix.nonEmpty) s"$prefix.${nameAttr.getNodeValue}" + else nameAttr.getNodeValue + map.put(key, valueAttr.getNodeValue) + } + } + + val children = node.getChildNodes + if (children != null) { + val childPrefix = if (prefix.nonEmpty && node.getNodeName != "#document") { + s"$prefix.${node.getNodeName}" + } else if (node.getNodeName != "#document") { + node.getNodeName + } else { + prefix + } + for (i <- 0 until children.getLength) { + extractMetadataFromNode(children.item(i), childPrefix, map) + } + } + } + + private def extractCompression(reader: GeoTiffReader): String = { + try { + val metadata = reader.getMetadata + if (metadata == null) return null + + val rootNode = metadata.getRootNode + if (rootNode == null) return null + + findCompressionInNode(rootNode) + } catch { + case _: Exception => null + } + } + + private def findCompressionInNode(node: org.w3c.dom.Node): String = { + if (node == null) return null + + val attrs = node.getAttributes + if (attrs != null) { + val nameAttr = attrs.getNamedItem("name") + val valueAttr = attrs.getNamedItem("value") + if (nameAttr != null && valueAttr != null && + nameAttr.getNodeValue.equalsIgnoreCase("Compression")) { + return valueAttr.getNodeValue + } + } + + val children = node.getChildNodes + 
if (children != null) { + for (i <- 0 until children.getLength) { + val result = findCompressionInNode(children.item(i)) + if (result != null) return result + } + } + null + } +} diff --git a/spark/common/src/main/scala/org/apache/spark/sql/sedona_sql/io/sedonainfo/RasterFileMetadataExtractor.scala b/spark/common/src/main/scala/org/apache/spark/sql/sedona_sql/io/sedonainfo/RasterFileMetadataExtractor.scala new file mode 100644 index 00000000000..9d4182ef485 --- /dev/null +++ b/spark/common/src/main/scala/org/apache/spark/sql/sedona_sql/io/sedonainfo/RasterFileMetadataExtractor.scala @@ -0,0 +1,81 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, + * software distributed under the License is distributed on an + * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY + * KIND, either express or implied. See the License for the + * specific language governing permissions and limitations + * under the License. + */ +package org.apache.spark.sql.sedona_sql.io.sedonainfo + +import org.apache.hadoop.conf.Configuration +import org.apache.hadoop.fs.Path + +/** + * Trait for extracting metadata from raster files. Implementations handle format-specific logic + * (e.g., GeoTIFF, NetCDF, etc.) and return a common [[RasterFileMetadata]] structure. + */ +trait RasterFileMetadataExtractor { + + /** Short driver name (e.g., "GTiff", "NetCDF"). */ + def driver: String + + /** + * Extract metadata from the file at the given path. 
Implementations must not decode pixel data + * — only headers/metadata should be read. + */ + def extract(path: Path, fileSize: Long, configuration: Configuration): RasterFileMetadata + + /** Returns true if this extractor can handle the given file path (by extension). */ + def canHandle(path: Path): Boolean +} + +/** + * Common metadata structure returned by all format-specific extractors. + */ +case class RasterFileMetadata( + path: String, + driver: String, + fileSize: Long, + width: Int, + height: Int, + numBands: Int, + srid: Int, + crs: String, + upperLeftX: Double, + upperLeftY: Double, + scaleX: Double, + scaleY: Double, + skewX: Double, + skewY: Double, + envelopeMinX: Double, + envelopeMinY: Double, + envelopeMaxX: Double, + envelopeMaxY: Double, + bands: Seq[BandMetadata], + overviews: Seq[OverviewMetadata], + metadata: Map[String, String], + isTiled: Boolean, + compression: String) + +case class BandMetadata( + band: Int, + dataType: String, + colorInterpretation: String, + noDataValue: java.lang.Double, + blockWidth: Int, + blockHeight: Int, + description: String, + unit: String) + +case class OverviewMetadata(level: Int, width: Int, height: Int) diff --git a/spark/common/src/main/scala/org/apache/spark/sql/sedona_sql/io/geotiffmetadata/GeoTiffMetadataDataSource.scala b/spark/common/src/main/scala/org/apache/spark/sql/sedona_sql/io/sedonainfo/SedonaInfoDataSource.scala similarity index 87% rename from spark/common/src/main/scala/org/apache/spark/sql/sedona_sql/io/geotiffmetadata/GeoTiffMetadataDataSource.scala rename to spark/common/src/main/scala/org/apache/spark/sql/sedona_sql/io/sedonainfo/SedonaInfoDataSource.scala index d113dcb43bf..d8db5322836 100644 --- a/spark/common/src/main/scala/org/apache/spark/sql/sedona_sql/io/geotiffmetadata/GeoTiffMetadataDataSource.scala +++ b/spark/common/src/main/scala/org/apache/spark/sql/sedona_sql/io/sedonainfo/SedonaInfoDataSource.scala @@ -16,7 +16,7 @@ * specific language governing permissions and limitations * 
under the License. */ -package org.apache.spark.sql.sedona_sql.io.geotiffmetadata +package org.apache.spark.sql.sedona_sql.io.sedonainfo import org.apache.spark.sql.connector.catalog.Table import org.apache.spark.sql.connector.catalog.TableProvider @@ -29,10 +29,12 @@ import org.apache.spark.sql.util.CaseInsensitiveStringMap import scala.collection.JavaConverters._ -class GeoTiffMetadataDataSource - extends FileDataSourceV2 - with TableProvider - with DataSourceRegister { +/** + * A Spark SQL data source that reads raster file metadata without decoding pixel data, similar to + * gdalinfo. Currently supports GeoTIFF files. Additional formats can be added by implementing + * [[RasterFileMetadataExtractor]]. + */ +class SedonaInfoDataSource extends FileDataSourceV2 with TableProvider with DataSourceRegister { override def shortName(): String = "sedonainfo" @@ -66,7 +68,7 @@ class GeoTiffMetadataDataSource } } - new GeoTiffMetadataTable( + new SedonaInfoTable( tableName, sparkSession, optionsWithoutPaths, @@ -84,7 +86,7 @@ class GeoTiffMetadataDataSource } override def inferSchema(options: CaseInsensitiveStringMap): StructType = { - GeoTiffMetadataTable.SCHEMA + SedonaInfoTable.SCHEMA } override def fallbackFileFormat: Class[_ <: FileFormat] = classOf[RasterFileFormat] diff --git a/spark/common/src/main/scala/org/apache/spark/sql/sedona_sql/io/sedonainfo/SedonaInfoPartitionReader.scala b/spark/common/src/main/scala/org/apache/spark/sql/sedona_sql/io/sedonainfo/SedonaInfoPartitionReader.scala new file mode 100644 index 00000000000..5c3e4c3d499 --- /dev/null +++ b/spark/common/src/main/scala/org/apache/spark/sql/sedona_sql/io/sedonainfo/SedonaInfoPartitionReader.scala @@ -0,0 +1,153 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. 
The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, + * software distributed under the License is distributed on an + * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY + * KIND, either express or implied. See the License for the + * specific language governing permissions and limitations + * under the License. + */ +package org.apache.spark.sql.sedona_sql.io.sedonainfo + +import org.apache.hadoop.conf.Configuration +import org.apache.hadoop.fs.Path +import org.apache.spark.sql.catalyst.InternalRow +import org.apache.spark.sql.catalyst.expressions.GenericInternalRow +import org.apache.spark.sql.catalyst.util.ArrayData +import org.apache.spark.sql.catalyst.util.GenericArrayData +import org.apache.spark.sql.connector.read.PartitionReader +import org.apache.spark.sql.execution.datasources.PartitionedFile +import org.apache.spark.sql.types.StructType +import org.apache.spark.unsafe.types.UTF8String + +import java.net.URI + +/** + * Reads raster file metadata by delegating to format-specific [[RasterFileMetadataExtractor]] + * implementations. Produces one [[InternalRow]] per file matching the readDataSchema. 
+ */ +class SedonaInfoPartitionReader( + configuration: Configuration, + partitionedFiles: Array[PartitionedFile], + readDataSchema: StructType) + extends PartitionReader[InternalRow] { + + private var currentFileIndex = 0 + private var currentRow: InternalRow = _ + + override def next(): Boolean = { + if (currentFileIndex < partitionedFiles.length) { + currentRow = readFileMetadata(partitionedFiles(currentFileIndex)) + currentFileIndex += 1 + true + } else { + false + } + } + + override def get(): InternalRow = currentRow + + override def close(): Unit = {} + + private def readFileMetadata(partition: PartitionedFile): InternalRow = { + val path = new Path(new URI(partition.filePath.toString())) + val extractor = SedonaInfoPartitionReader.findExtractor(path) + val meta = extractor.extract(path, partition.fileSize, configuration) + SedonaInfoPartitionReader.toInternalRow(meta, readDataSchema) + } +} + +object SedonaInfoPartitionReader { + + /** Registered metadata extractors. Add new format extractors here. */ + private val extractors: Seq[RasterFileMetadataExtractor] = Seq(GeoTiffMetadataExtractor) + + def findExtractor(path: Path): RasterFileMetadataExtractor = { + extractors + .find(_.canHandle(path)) + .getOrElse( + throw new UnsupportedOperationException( + s"No metadata extractor found for file: ${path.getName}. 
" + + s"Supported formats: ${extractors.map(_.driver).mkString(", ")}")) + } + + def toInternalRow(meta: RasterFileMetadata, readDataSchema: StructType): InternalRow = { + val geoTransformRow = new GenericInternalRow( + Array[Any]( + meta.upperLeftX, + meta.upperLeftY, + meta.scaleX, + meta.scaleY, + meta.skewX, + meta.skewY)) + + val cornerCoordinatesRow = new GenericInternalRow( + Array[Any](meta.envelopeMinX, meta.envelopeMinY, meta.envelopeMaxX, meta.envelopeMaxY)) + + lazy val bandsArray: ArrayData = { + val bands = meta.bands.map { b => + new GenericInternalRow( + Array[Any]( + b.band, + if (b.dataType != null) UTF8String.fromString(b.dataType) else null, + if (b.colorInterpretation != null) UTF8String.fromString(b.colorInterpretation) + else null, + if (b.noDataValue != null) b.noDataValue.doubleValue() else null, + b.blockWidth, + b.blockHeight, + if (b.description != null) UTF8String.fromString(b.description) else null, + if (b.unit != null) UTF8String.fromString(b.unit) else null)) + }.toArray + new GenericArrayData(bands) + } + + lazy val overviewsArray: ArrayData = { + val overviews = meta.overviews.map { o => + new GenericInternalRow(Array[Any](o.level, o.width, o.height)) + }.toArray + new GenericArrayData(overviews) + } + + lazy val metadataMap: Any = { + if (meta.metadata.isEmpty) null + else { + org.apache.spark.sql.catalyst.util.ArrayBasedMapData( + meta.metadata.keys.map(UTF8String.fromString).toArray, + meta.metadata.values.map(UTF8String.fromString).toArray) + } + } + + val fields = readDataSchema.fieldNames.map { + case "path" => UTF8String.fromString(meta.path) + case "driver" => UTF8String.fromString(meta.driver) + case "fileSize" => meta.fileSize: Any + case "width" => meta.width: Any + case "height" => meta.height: Any + case "numBands" => meta.numBands: Any + case "srid" => meta.srid: Any + case "crs" => + if (meta.crs != null) UTF8String.fromString(meta.crs) else null + case "geoTransform" => geoTransformRow + case "cornerCoordinates" => 
cornerCoordinatesRow + case "bands" => bandsArray + case "overviews" => overviewsArray + case "metadata" => metadataMap + case "isTiled" => meta.isTiled: Any + case "compression" => + if (meta.compression != null) UTF8String.fromString(meta.compression) else null + case other => + throw new IllegalArgumentException(s"Unsupported field name: $other") + } + + new GenericInternalRow(fields) + } +} diff --git a/spark/common/src/main/scala/org/apache/spark/sql/sedona_sql/io/geotiffmetadata/GeoTiffMetadataPartitionReaderFactory.scala b/spark/common/src/main/scala/org/apache/spark/sql/sedona_sql/io/sedonainfo/SedonaInfoPartitionReaderFactory.scala similarity index 92% rename from spark/common/src/main/scala/org/apache/spark/sql/sedona_sql/io/geotiffmetadata/GeoTiffMetadataPartitionReaderFactory.scala rename to spark/common/src/main/scala/org/apache/spark/sql/sedona_sql/io/sedonainfo/SedonaInfoPartitionReaderFactory.scala index 508e8c3cf33..bbebae3bae9 100644 --- a/spark/common/src/main/scala/org/apache/spark/sql/sedona_sql/io/geotiffmetadata/GeoTiffMetadataPartitionReaderFactory.scala +++ b/spark/common/src/main/scala/org/apache/spark/sql/sedona_sql/io/sedonainfo/SedonaInfoPartitionReaderFactory.scala @@ -16,7 +16,7 @@ * specific language governing permissions and limitations * under the License. 
*/ -package org.apache.spark.sql.sedona_sql.io.geotiffmetadata +package org.apache.spark.sql.sedona_sql.io.sedonainfo import org.apache.spark.broadcast.Broadcast import org.apache.spark.sql.catalyst.InternalRow @@ -28,7 +28,7 @@ import org.apache.spark.sql.sedona_sql.io.raster.RasterInputPartition import org.apache.spark.sql.types.StructType import org.apache.spark.util.SerializableConfiguration -case class GeoTiffMetadataPartitionReaderFactory( +case class SedonaInfoPartitionReaderFactory( broadcastedConf: Broadcast[SerializableConfiguration], dataSchema: StructType, readDataSchema: StructType, @@ -38,7 +38,7 @@ case class GeoTiffMetadataPartitionReaderFactory( override def createReader(partition: InputPartition): PartitionReader[InternalRow] = { partition match { case filePartition: RasterInputPartition => - val fileReader = new GeoTiffMetadataPartitionReader( + val fileReader = new SedonaInfoPartitionReader( broadcastedConf.value.value, filePartition.files, readDataSchema) diff --git a/spark/common/src/main/scala/org/apache/spark/sql/sedona_sql/io/geotiffmetadata/GeoTiffMetadataScanBuilder.scala b/spark/common/src/main/scala/org/apache/spark/sql/sedona_sql/io/sedonainfo/SedonaInfoScanBuilder.scala similarity index 95% rename from spark/common/src/main/scala/org/apache/spark/sql/sedona_sql/io/geotiffmetadata/GeoTiffMetadataScanBuilder.scala rename to spark/common/src/main/scala/org/apache/spark/sql/sedona_sql/io/sedonainfo/SedonaInfoScanBuilder.scala index 0f422aee3a7..e1a406d9bfa 100644 --- a/spark/common/src/main/scala/org/apache/spark/sql/sedona_sql/io/geotiffmetadata/GeoTiffMetadataScanBuilder.scala +++ b/spark/common/src/main/scala/org/apache/spark/sql/sedona_sql/io/sedonainfo/SedonaInfoScanBuilder.scala @@ -16,7 +16,7 @@ * specific language governing permissions and limitations * under the License. 
*/ -package org.apache.spark.sql.sedona_sql.io.geotiffmetadata +package org.apache.spark.sql.sedona_sql.io.sedonainfo import org.apache.spark.sql.SparkSession import org.apache.spark.sql.catalyst.expressions.Expression @@ -37,7 +37,7 @@ import org.apache.spark.util.SerializableConfiguration import scala.collection.JavaConverters._ -case class GeoTiffMetadataScanBuilder( +case class SedonaInfoScanBuilder( sparkSession: SparkSession, fileIndex: PartitioningAwareFileIndex, schema: StructType, @@ -49,7 +49,7 @@ case class GeoTiffMetadataScanBuilder( private var pushedLimit: Option[Int] = None override def build(): Scan = { - GeoTiffMetadataScan( + SedonaInfoScan( sparkSession, fileIndex, dataSchema, @@ -70,7 +70,7 @@ case class GeoTiffMetadataScanBuilder( override def isPartiallyPushed: Boolean = false } -case class GeoTiffMetadataScan( +case class SedonaInfoScan( sparkSession: SparkSession, fileIndex: PartitioningAwareFileIndex, dataSchema: StructType, @@ -124,7 +124,7 @@ case class GeoTiffMetadataScan( val broadcastedConf = sparkSession.sparkContext.broadcast(new SerializableConfiguration(hadoopConf)) - GeoTiffMetadataPartitionReaderFactory( + SedonaInfoPartitionReaderFactory( broadcastedConf, dataSchema, readDataSchema, diff --git a/spark/common/src/main/scala/org/apache/spark/sql/sedona_sql/io/geotiffmetadata/GeoTiffMetadataTable.scala b/spark/common/src/main/scala/org/apache/spark/sql/sedona_sql/io/sedonainfo/SedonaInfoTable.scala similarity index 94% rename from spark/common/src/main/scala/org/apache/spark/sql/sedona_sql/io/geotiffmetadata/GeoTiffMetadataTable.scala rename to spark/common/src/main/scala/org/apache/spark/sql/sedona_sql/io/sedonainfo/SedonaInfoTable.scala index a1c5e125d7f..34ba34764ae 100644 --- a/spark/common/src/main/scala/org/apache/spark/sql/sedona_sql/io/geotiffmetadata/GeoTiffMetadataTable.scala +++ b/spark/common/src/main/scala/org/apache/spark/sql/sedona_sql/io/sedonainfo/SedonaInfoTable.scala @@ -16,7 +16,7 @@ * specific language 
governing permissions and limitations * under the License. */ -package org.apache.spark.sql.sedona_sql.io.geotiffmetadata +package org.apache.spark.sql.sedona_sql.io.sedonainfo import org.apache.hadoop.fs.FileStatus import org.apache.spark.sql.SparkSession @@ -33,7 +33,7 @@ import org.apache.spark.sql.util.CaseInsensitiveStringMap import java.util.{Set => JSet} -case class GeoTiffMetadataTable( +case class SedonaInfoTable( name: String, sparkSession: SparkSession, options: CaseInsensitiveStringMap, @@ -45,21 +45,21 @@ case class GeoTiffMetadataTable( with SupportsWrite { override def inferSchema(files: Seq[FileStatus]): Option[StructType] = - Some(GeoTiffMetadataTable.SCHEMA) + Some(SedonaInfoTable.SCHEMA) - override def formatName: String = "GeoTiffMetadata" + override def formatName: String = "SedonaInfo" override def capabilities(): JSet[TableCapability] = java.util.EnumSet.of(TableCapability.BATCH_READ) override def newScanBuilder(options: CaseInsensitiveStringMap): ScanBuilder = { - GeoTiffMetadataScanBuilder(sparkSession, fileIndex, schema, dataSchema, options) + SedonaInfoScanBuilder(sparkSession, fileIndex, schema, dataSchema, options) } override def newWriteBuilder(info: LogicalWriteInfo): WriteBuilder = null } -object GeoTiffMetadataTable { +object SedonaInfoTable { val GEO_TRANSFORM_TYPE: StructType = StructType( Seq( From 4d6849b8e58f7946d13b7dd55ddd6a6a36f41af1 Mon Sep 17 00:00:00 2001 From: Jia Yu Date: Thu, 2 Apr 2026 01:25:13 -0700 Subject: [PATCH 03/10] [GH-2824] Add COG detection tests for sedonainfo data source Generate COG files on-the-fly using RS_AsCOG and verify that sedonainfo correctly reports isTiled=true, non-empty overviews with proper level/width/height, and blockSize matching the requested tile size. 
--- .../sedona/sql/geotiffMetadataTest.scala | 100 +++++++++++++++++- 1 file changed, 95 insertions(+), 5 deletions(-) diff --git a/spark/common/src/test/scala/org/apache/sedona/sql/geotiffMetadataTest.scala b/spark/common/src/test/scala/org/apache/sedona/sql/geotiffMetadataTest.scala index fc513239095..83de357bdbb 100644 --- a/spark/common/src/test/scala/org/apache/sedona/sql/geotiffMetadataTest.scala +++ b/spark/common/src/test/scala/org/apache/sedona/sql/geotiffMetadataTest.scala @@ -18,12 +18,23 @@ */ package org.apache.sedona.sql +import org.apache.commons.io.FileUtils import org.scalatest.BeforeAndAfter +import java.io.File +import java.nio.file.Files + class geotiffMetadataTest extends TestBaseScala with BeforeAndAfter { val rasterDir: String = resourceFolder + "raster/" val singleFileLocation: String = resourceFolder + "raster/test1.tiff" + val tempDir: String = + Files.createTempDirectory("sedona_sedonainfo_test_").toFile.getAbsolutePath + + override def afterAll(): Unit = { + FileUtils.deleteDirectory(new File(tempDir)) + super.afterAll() + } describe("GeoTiff Metadata (sedonainfo) data source") { @@ -79,7 +90,6 @@ class geotiffMetadataTest extends TestBaseScala with BeforeAndAfter { "geoTransform.skewX", "geoTransform.skewY") .first() - // scaleX should be positive, scaleY should be negative (north-up convention) assert(row.getAs[Double]("scaleX") != 0.0) assert(row.getAs[Double]("scaleY") != 0.0) } @@ -159,10 +169,7 @@ class geotiffMetadataTest extends TestBaseScala with BeforeAndAfter { it("should report isTiled correctly") { val df = sparkSession.read.format("sedonainfo").load(singleFileLocation) val row = df.first() - // isTiled should be a boolean val isTiled = row.getAs[Boolean]("isTiled") - val width = row.getAs[Int]("width") - // If the raster is small, it's likely not tiled (strip-based) assert(isTiled == true || isTiled == false) } @@ -176,9 +183,92 @@ class geotiffMetadataTest extends TestBaseScala with BeforeAndAfter { it("should return 
overviews array") { val df = sparkSession.read.format("sedonainfo").load(singleFileLocation) val row = df.first() - // Test rasters likely don't have overviews, so empty array is expected val overviews = row.getAs[Seq[Any]]("overviews") assert(overviews != null) } + + it("should detect COG properties from a generated COG file") { + // Generate a COG from test1.tiff using RS_AsCOG and write to disk + val cogBytes = sparkSession.read + .format("binaryFile") + .load(singleFileLocation) + .selectExpr("RS_FromGeoTiff(content) as raster") + .selectExpr("RS_AsCOG(raster, 'LZW', 256, 0.5, 'Nearest', 2) as cog") + .first() + .getAs[Array[Byte]]("cog") + + val cogFile = new File(tempDir, "test_cog.tiff") + val fos = new java.io.FileOutputStream(cogFile) + try { fos.write(cogBytes) } + finally { fos.close() } + + // Read the COG with sedonainfo + val df = sparkSession.read.format("sedonainfo").load(cogFile.getAbsolutePath) + assert(df.count() == 1) + + val row = df.first() + + // COGs should be tiled + assert(row.getAs[Boolean]("isTiled") == true) + + // COGs should have overviews + val overviews = row.getAs[Seq[Any]]("overviews") + assert(overviews != null) + assert(overviews.nonEmpty, "COG should have at least one overview level") + + // Verify overview struct fields are accessible + val overviewDf = df + .selectExpr("explode(overviews) as ovr") + .selectExpr("ovr.level", "ovr.width", "ovr.height") + val ovrRow = overviewDf.first() + assert(ovrRow.getAs[Int]("level") >= 1) + assert(ovrRow.getAs[Int]("width") > 0) + assert(ovrRow.getAs[Int]("height") > 0) + + // Overview dimensions should be smaller than full resolution + val fullWidth = row.getAs[Int]("width") + val fullHeight = row.getAs[Int]("height") + assert(ovrRow.getAs[Int]("width") < fullWidth) + assert(ovrRow.getAs[Int]("height") < fullHeight) + + // Band block size should match the COG tile size (256) + val bandDf = df + .selectExpr("explode(bands) as band") + .selectExpr("band.blockWidth", "band.blockHeight") + 
val bandRow = bandDf.first() + assert(bandRow.getAs[Int]("blockWidth") == 256) + assert(bandRow.getAs[Int]("blockHeight") == 256) + } + + it("should correctly report non-COG vs COG differences") { + // Read the original non-COG test file + val nonCogDf = + sparkSession.read.format("sedonainfo").load(singleFileLocation).select("isTiled") + val nonCogTiled = nonCogDf.first().getAs[Boolean]("isTiled") + + // Generate a COG and write directly to file + val cogBytes = sparkSession.read + .format("binaryFile") + .load(singleFileLocation) + .selectExpr("RS_FromGeoTiff(content) as raster") + .selectExpr("RS_AsCOG(raster, 'Deflate', 256) as cog") + .first() + .getAs[Array[Byte]]("cog") + + val cogFile = new File(tempDir, "test_cog_compare.tiff") + val fos = new java.io.FileOutputStream(cogFile) + try { fos.write(cogBytes) } + finally { fos.close() } + + val cogDf = sparkSession.read + .format("sedonainfo") + .load(cogFile.getAbsolutePath) + .select("isTiled", "overviews") + val cogRow = cogDf.first() + + // COG should be tiled with overviews + assert(cogRow.getAs[Boolean]("isTiled") == true) + assert(cogRow.getAs[Seq[Any]]("overviews").nonEmpty) + } } } From ce987a5fd8518188fa2c66ff8a7d5e2e079f2ea6 Mon Sep 17 00:00:00 2001 From: Jia Yu Date: Thu, 2 Apr 2026 01:37:48 -0700 Subject: [PATCH 04/10] [GH-2824] Rewrite tests with exact assertions, fix overview detection - Replace all inexact assertions (>0, !=0) with exact value matches for test1.tiff: width=512, height=517, srid=3857, fileSize=174803, band type=UNSIGNED_8BITS, blockSize=256x256, etc. 
- Fix overview detection to use DatasetLayout.getNumInternalOverviews() instead of getResolutionLevels() which returns synthetic tile-based levels even for non-COG files - Add COG test that generates a COG on-the-fly via RS_AsCOG and verifies isTiled=true, 2 overviews, blockSize=256x256 --- .../sedonainfo/GeoTiffMetadataExtractor.scala | 33 +- .../sedona/sql/geotiffMetadataTest.scala | 289 +++++++----------- 2 files changed, 138 insertions(+), 184 deletions(-) diff --git a/spark/common/src/main/scala/org/apache/spark/sql/sedona_sql/io/sedonainfo/GeoTiffMetadataExtractor.scala b/spark/common/src/main/scala/org/apache/spark/sql/sedona_sql/io/sedonainfo/GeoTiffMetadataExtractor.scala index b60c0b605a3..6161eaab125 100644 --- a/spark/common/src/main/scala/org/apache/spark/sql/sedona_sql/io/sedonainfo/GeoTiffMetadataExtractor.scala +++ b/spark/common/src/main/scala/org/apache/spark/sql/sedona_sql/io/sedonainfo/GeoTiffMetadataExtractor.scala @@ -148,20 +148,27 @@ object GeoTiffMetadataExtractor extends RasterFileMetadataExtractor { fullWidth: Int, fullHeight: Int): Seq[OverviewMetadata] = { try { + // Use DatasetLayout to get actual internal overview count (not synthetic tile-based levels) + val layout = reader.getDatasetLayout + if (layout == null) return Seq.empty + + val numOverviews = layout.getNumInternalOverviews + if (numOverviews <= 0) return Seq.empty + val resolutionLevels = reader.getResolutionLevels - if (resolutionLevels == null || resolutionLevels.length <= 1) { - Seq.empty - } else { - val fullResX = resolutionLevels(0)(0) - val fullResY = resolutionLevels(0)(1) - (1 until resolutionLevels.length).map { level => - val overviewResX = resolutionLevels(level)(0) - val overviewResY = resolutionLevels(level)(1) - OverviewMetadata( - level = level, - width = Math.round(fullWidth.toDouble * fullResX / overviewResX).toInt, - height = Math.round(fullHeight.toDouble * fullResY / overviewResY).toInt) - } + if (resolutionLevels == null || resolutionLevels.length <= 1) 
return Seq.empty + + // Only report the actual internal overviews, not synthetic resolution levels + val count = Math.min(numOverviews, resolutionLevels.length - 1) + val fullResX = resolutionLevels(0)(0) + val fullResY = resolutionLevels(0)(1) + (1 to count).map { level => + val overviewResX = resolutionLevels(level)(0) + val overviewResY = resolutionLevels(level)(1) + OverviewMetadata( + level = level, + width = Math.round(fullWidth.toDouble * fullResX / overviewResX).toInt, + height = Math.round(fullHeight.toDouble * fullResY / overviewResY).toInt) } } catch { case _: Exception => Seq.empty diff --git a/spark/common/src/test/scala/org/apache/sedona/sql/geotiffMetadataTest.scala b/spark/common/src/test/scala/org/apache/sedona/sql/geotiffMetadataTest.scala index 83de357bdbb..900ebd0e8c0 100644 --- a/spark/common/src/test/scala/org/apache/sedona/sql/geotiffMetadataTest.scala +++ b/spark/common/src/test/scala/org/apache/sedona/sql/geotiffMetadataTest.scala @@ -19,12 +19,13 @@ package org.apache.sedona.sql import org.apache.commons.io.FileUtils -import org.scalatest.BeforeAndAfter +import org.junit.Assert.assertEquals +import org.scalatest.BeforeAndAfterAll import java.io.File import java.nio.file.Files -class geotiffMetadataTest extends TestBaseScala with BeforeAndAfter { +class geotiffMetadataTest extends TestBaseScala with BeforeAndAfterAll { val rasterDir: String = resourceFolder + "raster/" val singleFileLocation: String = resourceFolder + "raster/test1.tiff" @@ -36,52 +37,28 @@ class geotiffMetadataTest extends TestBaseScala with BeforeAndAfter { super.afterAll() } - describe("GeoTiff Metadata (sedonainfo) data source") { + describe("SedonaInfo data source") { - it("should read a single GeoTIFF file and return one row") { + it("should read test1.tiff with exact metadata values") { val df = sparkSession.read.format("sedonainfo").load(singleFileLocation) assert(df.count() == 1) - val row = df.first() - assert(row.getAs[String]("path").contains("test1.tiff")) - 
assert(row.getAs[String]("driver") == "GTiff") - assert(row.getAs[Int]("width") > 0) - assert(row.getAs[Int]("height") > 0) - assert(row.getAs[Int]("numBands") > 0) - } - - it("should read multiple GeoTIFF files via glob pattern") { - val df = sparkSession.read.format("sedonainfo").load(rasterDir + "*.tiff") - assert(df.count() > 1) - } - - it("should read GeoTIFF files from directory with trailing slash") { - val df = sparkSession.read.format("sedonainfo").load(rasterDir) - assert(df.count() > 1) - } - - it("should return correct schema") { - val df = sparkSession.read.format("sedonainfo").load(singleFileLocation) - val schema = df.schema - assert(schema.fieldNames.contains("path")) - assert(schema.fieldNames.contains("driver")) - assert(schema.fieldNames.contains("fileSize")) - assert(schema.fieldNames.contains("width")) - assert(schema.fieldNames.contains("height")) - assert(schema.fieldNames.contains("numBands")) - assert(schema.fieldNames.contains("srid")) - assert(schema.fieldNames.contains("crs")) - assert(schema.fieldNames.contains("geoTransform")) - assert(schema.fieldNames.contains("cornerCoordinates")) - assert(schema.fieldNames.contains("bands")) - assert(schema.fieldNames.contains("overviews")) - assert(schema.fieldNames.contains("metadata")) - assert(schema.fieldNames.contains("isTiled")) - assert(schema.fieldNames.contains("compression")) - } - it("should return correct geoTransform struct") { - val df = sparkSession.read.format("sedonainfo").load(singleFileLocation) - val row = df + val row = df.first() + assert(row.getAs[String]("path").endsWith("test1.tiff")) + assertEquals("GTiff", row.getAs[String]("driver")) + assertEquals(174803L, row.getAs[Long]("fileSize")) + assertEquals(512, row.getAs[Int]("width")) + assertEquals(517, row.getAs[Int]("height")) + assertEquals(1, row.getAs[Int]("numBands")) + assertEquals(3857, row.getAs[Int]("srid")) + assert(row.getAs[String]("crs").contains("EPSG")) + assertEquals(true, row.getAs[Boolean]("isTiled")) + } 
+ + it("should return exact geoTransform for test1.tiff") { + val row = sparkSession.read + .format("sedonainfo") + .load(singleFileLocation) .selectExpr( "geoTransform.upperLeftX", "geoTransform.upperLeftY", @@ -90,48 +67,68 @@ class geotiffMetadataTest extends TestBaseScala with BeforeAndAfter { "geoTransform.skewX", "geoTransform.skewY") .first() - assert(row.getAs[Double]("scaleX") != 0.0) - assert(row.getAs[Double]("scaleY") != 0.0) + assertEquals(-1.3095817809482181e7, row.getAs[Double]("upperLeftX"), 0.01) + assertEquals(4021262.7487925636, row.getAs[Double]("upperLeftY"), 0.01) + assertEquals(72.32861272132695, row.getAs[Double]("scaleX"), 1e-10) + assertEquals(-72.32861272132695, row.getAs[Double]("scaleY"), 1e-10) + assertEquals(0.0, row.getAs[Double]("skewX"), 1e-15) + assertEquals(0.0, row.getAs[Double]("skewY"), 1e-15) } - it("should return correct cornerCoordinates struct") { - val df = sparkSession.read.format("sedonainfo").load(singleFileLocation) - val row = df + it("should return exact cornerCoordinates for test1.tiff") { + val row = sparkSession.read + .format("sedonainfo") + .load(singleFileLocation) .selectExpr( "cornerCoordinates.minX", "cornerCoordinates.minY", "cornerCoordinates.maxX", "cornerCoordinates.maxY") .first() - assert(row.getAs[Double]("maxX") > row.getAs[Double]("minX")) - assert(row.getAs[Double]("maxY") > row.getAs[Double]("minY")) + assertEquals(-1.3095817809482181e7, row.getAs[Double]("minX"), 0.01) + assertEquals(3983868.8560156375, row.getAs[Double]("minY"), 0.01) + assertEquals(-1.3058785559768861e7, row.getAs[Double]("maxX"), 0.01) + assertEquals(4021262.7487925636, row.getAs[Double]("maxY"), 0.01) } - it("should return correct bands array") { - val df = sparkSession.read.format("sedonainfo").load(singleFileLocation) - val row = df.first() - val numBands = row.getAs[Int]("numBands") - val bands = row.getAs[Seq[Any]]("bands") - assert(bands != null) - assert(bands.size == numBands) - } - - it("should return band metadata 
with correct fields") { - val df = sparkSession.read + it("should return exact band metadata for test1.tiff") { + val row = sparkSession.read .format("sedonainfo") .load(singleFileLocation) - .selectExpr("explode(bands) as band") - .selectExpr("band.band", "band.dataType", "band.blockWidth", "band.blockHeight") - val row = df.first() - assert(row.getAs[Int]("band") == 1) - assert(row.getAs[String]("dataType") != null) - assert(row.getAs[Int]("blockWidth") > 0) - assert(row.getAs[Int]("blockHeight") > 0) + .selectExpr("explode(bands) as b") + .selectExpr( + "b.band", + "b.dataType", + "b.colorInterpretation", + "b.noDataValue", + "b.blockWidth", + "b.blockHeight", + "b.description", + "b.unit") + .first() + assertEquals(1, row.getAs[Int]("band")) + assertEquals("UNSIGNED_8BITS", row.getAs[String]("dataType")) + assertEquals("GRAY_INDEX", row.getAs[String]("colorInterpretation")) + assert(row.isNullAt(row.fieldIndex("noDataValue"))) + assertEquals(256, row.getAs[Int]("blockWidth")) + assertEquals(256, row.getAs[Int]("blockHeight")) + assertEquals("GRAY_INDEX", row.getAs[String]("description")) + assert(row.isNullAt(row.fieldIndex("unit"))) + } + + it("should return empty overviews for non-COG test1.tiff") { + // test1.tiff has only 1 IFD (no internal overviews) + val row = sparkSession.read + .format("sedonainfo") + .load(singleFileLocation) + .selectExpr("size(overviews) as overviewCount") + .first() + assertEquals(0, row.getAs[Int]("overviewCount")) } - it("should cross-validate metadata against raster data source") { - val metaDf = sparkSession.read.format("sedonainfo").load(singleFileLocation) - val rasterDf = sparkSession.read + it("should cross-validate against raster data source") { + val metaRow = sparkSession.read.format("sedonainfo").load(singleFileLocation).first() + val rasterRow = sparkSession.read .format("raster") .option("retile", "false") .load(singleFileLocation) @@ -140,55 +137,43 @@ class geotiffMetadataTest extends TestBaseScala with 
BeforeAndAfter { "RS_Height(rast) as height", "RS_NumBands(rast) as numBands", "RS_SRID(rast) as srid") + .first() + assertEquals(metaRow.getAs[Int]("width"), rasterRow.getAs[Int]("width")) + assertEquals(metaRow.getAs[Int]("height"), rasterRow.getAs[Int]("height")) + assertEquals(metaRow.getAs[Int]("numBands"), rasterRow.getAs[Int]("numBands")) + assertEquals(metaRow.getAs[Int]("srid"), rasterRow.getAs[Int]("srid")) + } - val metaRow = metaDf.first() - val rasterRow = rasterDf.first() - assert(metaRow.getAs[Int]("width") == rasterRow.getAs[Int]("width")) - assert(metaRow.getAs[Int]("height") == rasterRow.getAs[Int]("height")) - assert(metaRow.getAs[Int]("numBands") == rasterRow.getAs[Int]("numBands")) - assert(metaRow.getAs[Int]("srid") == rasterRow.getAs[Int]("srid")) + it("should read multiple files via glob") { + val df = sparkSession.read.format("sedonainfo").load(rasterDir + "*.tiff") + // 7 .tiff files in the raster directory (excludes test3.tif) + assertEquals(7L, df.count()) } - it("should support LIMIT pushdown") { + it("should read files from directory with trailing slash") { val df = sparkSession.read.format("sedonainfo").load(rasterDir) - val totalCount = df.count() - assert(totalCount > 2, "Need at least 3 files for this test") - val limitedDf = df.limit(2) - assert(limitedDf.count() == 2) + // Recursive lookup finds all .tif/.tiff files including subdirectories + assertEquals(9L, df.count()) } - it("should support column selection") { + it("should support LIMIT pushdown") { + val df = sparkSession.read.format("sedonainfo").load(rasterDir).limit(2) + assertEquals(2L, df.count()) + } + + it("should support column pruning") { val df = sparkSession.read .format("sedonainfo") .load(singleFileLocation) .select("path", "width", "height") - assert(df.schema.fieldNames.length == 3) - assert(df.count() == 1) - } - - it("should report isTiled correctly") { - val df = sparkSession.read.format("sedonainfo").load(singleFileLocation) - val row = df.first() - val 
isTiled = row.getAs[Boolean]("isTiled") - assert(isTiled == true || isTiled == false) - } - - it("should return fileSize") { - val df = sparkSession.read.format("sedonainfo").load(singleFileLocation) + assertEquals(3, df.schema.fieldNames.length) val row = df.first() - val fileSize = row.getAs[Long]("fileSize") - assert(fileSize > 0) + assertEquals(512, row.getAs[Int]("width")) + assertEquals(517, row.getAs[Int]("height")) } - it("should return overviews array") { - val df = sparkSession.read.format("sedonainfo").load(singleFileLocation) - val row = df.first() - val overviews = row.getAs[Seq[Any]]("overviews") - assert(overviews != null) - } - - it("should detect COG properties from a generated COG file") { - // Generate a COG from test1.tiff using RS_AsCOG and write to disk + it("should detect COG properties from a generated COG") { + // Generate a COG with known parameters: LZW compression, 256 tile size, 2 overviews val cogBytes = sparkSession.read .format("binaryFile") .load(singleFileLocation) @@ -202,73 +187,35 @@ class geotiffMetadataTest extends TestBaseScala with BeforeAndAfter { try { fos.write(cogBytes) } finally { fos.close() } - // Read the COG with sedonainfo val df = sparkSession.read.format("sedonainfo").load(cogFile.getAbsolutePath) - assert(df.count() == 1) - val row = df.first() - // COGs should be tiled - assert(row.getAs[Boolean]("isTiled") == true) - - // COGs should have overviews - val overviews = row.getAs[Seq[Any]]("overviews") - assert(overviews != null) - assert(overviews.nonEmpty, "COG should have at least one overview level") - - // Verify overview struct fields are accessible - val overviewDf = df - .selectExpr("explode(overviews) as ovr") - .selectExpr("ovr.level", "ovr.width", "ovr.height") - val ovrRow = overviewDf.first() - assert(ovrRow.getAs[Int]("level") >= 1) - assert(ovrRow.getAs[Int]("width") > 0) - assert(ovrRow.getAs[Int]("height") > 0) - - // Overview dimensions should be smaller than full resolution - val fullWidth = 
row.getAs[Int]("width") - val fullHeight = row.getAs[Int]("height") - assert(ovrRow.getAs[Int]("width") < fullWidth) - assert(ovrRow.getAs[Int]("height") < fullHeight) - - // Band block size should match the COG tile size (256) - val bandDf = df - .selectExpr("explode(bands) as band") - .selectExpr("band.blockWidth", "band.blockHeight") - val bandRow = bandDf.first() - assert(bandRow.getAs[Int]("blockWidth") == 256) - assert(bandRow.getAs[Int]("blockHeight") == 256) - } - - it("should correctly report non-COG vs COG differences") { - // Read the original non-COG test file - val nonCogDf = - sparkSession.read.format("sedonainfo").load(singleFileLocation).select("isTiled") - val nonCogTiled = nonCogDf.first().getAs[Boolean]("isTiled") - - // Generate a COG and write directly to file - val cogBytes = sparkSession.read - .format("binaryFile") - .load(singleFileLocation) - .selectExpr("RS_FromGeoTiff(content) as raster") - .selectExpr("RS_AsCOG(raster, 'Deflate', 256) as cog") + // COG preserves original dimensions and CRS + assertEquals(512, row.getAs[Int]("width")) + assertEquals(517, row.getAs[Int]("height")) + assertEquals(1, row.getAs[Int]("numBands")) + assertEquals(3857, row.getAs[Int]("srid")) + + // COG must be tiled + assertEquals(true, row.getAs[Boolean]("isTiled")) + + // COG must have overviews (requested 2) + val overviews = df + .selectExpr("explode(overviews) as o") + .selectExpr("o.level", "o.width", "o.height") + .collect() + assertEquals(2, overviews.length) + // Each overview is progressively smaller + assert(overviews(0).getAs[Int]("width") < 512) + assert(overviews(1).getAs[Int]("width") < overviews(0).getAs[Int]("width")) + + // Block size should match the requested 256 + val bandRow = df + .selectExpr("explode(bands) as b") + .selectExpr("b.blockWidth", "b.blockHeight") .first() - .getAs[Array[Byte]]("cog") - - val cogFile = new File(tempDir, "test_cog_compare.tiff") - val fos = new java.io.FileOutputStream(cogFile) - try { fos.write(cogBytes) } 
- finally { fos.close() } - - val cogDf = sparkSession.read - .format("sedonainfo") - .load(cogFile.getAbsolutePath) - .select("isTiled", "overviews") - val cogRow = cogDf.first() - - // COG should be tiled with overviews - assert(cogRow.getAs[Boolean]("isTiled") == true) - assert(cogRow.getAs[Seq[Any]]("overviews").nonEmpty) + assertEquals(256, bandRow.getAs[Int]("blockWidth")) + assertEquals(256, bandRow.getAs[Int]("blockHeight")) } } } From 1c94ccb7e004fa8d858c2103f455474d7033cfe4 Mon Sep 17 00:00:00 2001 From: Jia Yu Date: Fri, 3 Apr 2026 01:53:48 -0700 Subject: [PATCH 05/10] [GH-2824] Address review: fix isTiled, colorInterpretation, SupportsWrite, docs - Fix isTiled: read TIFF TileWidth tag (322) from IIO metadata instead of RenderedImage tile size which reports strips as tiles - Fix colorInterpretation: derive from TIFF Photometric Interpretation tag (262) instead of copying band description. Maps to gdalinfo values: Gray, Red, Green, Blue, Alpha, Palette, Undefined - Fix SupportsWrite: remove mixin, throw UnsupportedOperationException in newWriteBuilder since sedonainfo is read-only - Fix docs: remove false claim about column pruning skipping extraction - Fix compression: read from TIFF tag 259 description attribute for human-readable names (e.g., "LZW", "Deflate") - Extract TIFF IIO metadata before reader.read() to avoid stream state issues --- .../tutorial/files/sedonainfo-sedona-spark.md | 4 +- .../sedonainfo/GeoTiffMetadataExtractor.scala | 209 ++++++++++++++++-- .../io/sedonainfo/SedonaInfoTable.scala | 7 +- .../sedona/sql/geotiffMetadataTest.scala | 3 +- 4 files changed, 198 insertions(+), 25 deletions(-) diff --git a/docs/tutorial/files/sedonainfo-sedona-spark.md b/docs/tutorial/files/sedonainfo-sedona-spark.md index 13e7a827f17..cdddd69f4f6 100644 --- a/docs/tutorial/files/sedonainfo-sedona-spark.md +++ b/docs/tutorial/files/sedonainfo-sedona-spark.md @@ -172,9 +172,9 @@ df.selectExpr("path", "explode(overviews) as ovr").selectExpr( ).show() ``` 
-### Column pruning for performance +### Select specific columns -Select only the columns you need. SedonaInfo uses column pruning to skip extracting unused metadata: +Select only the columns you need: ```python df = ( diff --git a/spark/common/src/main/scala/org/apache/spark/sql/sedona_sql/io/sedonainfo/GeoTiffMetadataExtractor.scala b/spark/common/src/main/scala/org/apache/spark/sql/sedona_sql/io/sedonainfo/GeoTiffMetadataExtractor.scala index 6161eaab125..66e591f1a13 100644 --- a/spark/common/src/main/scala/org/apache/spark/sql/sedona_sql/io/sedonainfo/GeoTiffMetadataExtractor.scala +++ b/spark/common/src/main/scala/org/apache/spark/sql/sedona_sql/io/sedonainfo/GeoTiffMetadataExtractor.scala @@ -57,6 +57,13 @@ object GeoTiffMetadataExtractor extends RasterFileMetadataExtractor { new org.geotools.util.factory.Hints( org.geotools.util.factory.Hints.FORCE_LONGITUDE_FIRST_AXIS_ORDER, java.lang.Boolean.TRUE)) + + // Extract TIFF IIO metadata BEFORE read() which may alter stream state + val isTiled = hasTiffTag(reader, TAG_TILE_WIDTH) + val photometric = extractPhotometricInterpretation(reader) + val tiffMetadata = extractMetadata(reader) + val compression = extractCompression(reader) + raster = reader.read(null) val width = RasterAccessors.getWidth(raster) @@ -76,12 +83,9 @@ object GeoTiffMetadataExtractor extends RasterFileMetadataExtractor { val image = raster.getRenderedImage val tileWidth = image.getTileWidth val tileHeight = image.getTileHeight - val isTiled = tileWidth < width || tileHeight < height - val bands = extractBands(raster, numBands, tileWidth, tileHeight) + val bands = extractBands(raster, numBands, tileWidth, tileHeight, photometric) val overviews = extractOverviews(reader, width, height) - val metadata = extractMetadata(reader) - val compression = extractCompression(reader) RasterFileMetadata( path = path.toString, @@ -104,7 +108,7 @@ object GeoTiffMetadataExtractor extends RasterFileMetadataExtractor { envelopeMaxY = env.getMaxY, bands = bands, 
overviews = overviews, - metadata = metadata, + metadata = tiffMetadata, isTiled = isTiled, compression = compression) } finally { @@ -114,11 +118,21 @@ object GeoTiffMetadataExtractor extends RasterFileMetadataExtractor { } } + // TIFF tag constants + private val TAG_TILE_WIDTH = 322 + + // TIFF Photometric Interpretation values + private val PHOTOMETRIC_MIN_IS_WHITE = 0 + private val PHOTOMETRIC_MIN_IS_BLACK = 1 + private val PHOTOMETRIC_RGB = 2 + private val PHOTOMETRIC_PALETTE = 3 + private def extractBands( raster: GridCoverage2D, numBands: Int, tileWidth: Int, - tileHeight: Int): Seq[BandMetadata] = { + tileHeight: Int, + photometric: Int): Seq[BandMetadata] = { (1 to numBands).map { i => val dataType = Try(RasterBandAccessors.getBandType(raster, i)).getOrElse(null) val noDataValue = Try(RasterBandAccessors.getBandNoDataValue(raster, i)).getOrElse(null) @@ -130,11 +144,12 @@ object GeoTiffMetadataExtractor extends RasterFileMetadataExtractor { val units = raster.getSampleDimension(i - 1).getUnits if (units != null) units.toString else null }.getOrElse(null) + val colorInterp = resolveColorInterpretation(photometric, i, numBands) BandMetadata( band = i, dataType = dataType, - colorInterpretation = description, + colorInterpretation = colorInterp, noDataValue = noDataValue, blockWidth = tileWidth, blockHeight = tileHeight, @@ -143,6 +158,28 @@ object GeoTiffMetadataExtractor extends RasterFileMetadataExtractor { } } + /** + * Derive per-band color interpretation from the TIFF Photometric Interpretation tag. 
+ */ + private def resolveColorInterpretation(photometric: Int, band: Int, numBands: Int): String = { + photometric match { + case PHOTOMETRIC_MIN_IS_WHITE | PHOTOMETRIC_MIN_IS_BLACK => + if (numBands == 1) "Gray" + else if (band <= numBands) s"Gray${band}" // multi-band grayscale + else "Undefined" + case PHOTOMETRIC_RGB => + band match { + case 1 => "Red" + case 2 => "Green" + case 3 => "Blue" + case 4 => "Alpha" + case _ => "Undefined" + } + case PHOTOMETRIC_PALETTE => "Palette" + case _ => "Undefined" + } + } + private def extractOverviews( reader: GeoTiffReader, fullWidth: Int, @@ -224,40 +261,176 @@ object GeoTiffMetadataExtractor extends RasterFileMetadataExtractor { } } + /** + * Extract compression type from TIFF tag 259. Returns the human-readable description (e.g., + * "LZW", "Deflate") from the TIFFShort description attribute if available, otherwise returns + * the raw numeric value. + */ private def extractCompression(reader: GeoTiffReader): String = { try { val metadata = reader.getMetadata if (metadata == null) return null - val rootNode = metadata.getRootNode if (rootNode == null) return null - findCompressionInNode(rootNode) + // Tag 259 = Compression + val desc = findTiffFieldDescription(rootNode, 259) + if (desc != null) return desc + findTiffFieldValue(rootNode, 259) } catch { case _: Exception => null } } - private def findCompressionInNode(node: org.w3c.dom.Node): String = { + /** + * Check if a TIFF tag exists in the IIO metadata tree by its tag number. Uses the "number" + * attribute of TIFFField elements. 
+ */ + private def hasTiffTag(reader: GeoTiffReader, tagNumber: Int): Boolean = { + try { + val metadata = reader.getMetadata + if (metadata == null) return false + val rootNode = metadata.getRootNode + if (rootNode == null) return false + findTiffFieldByNumber(rootNode, tagNumber) + } catch { + case _: Exception => false + } + } + + private def findTiffFieldByNumber(node: org.w3c.dom.Node, tagNumber: Int): Boolean = { + if (node == null) return false + + if (node.getNodeName == "TIFFField") { + val attrs = node.getAttributes + if (attrs != null) { + val numAttr = attrs.getNamedItem("number") + if (numAttr != null && Try(numAttr.getNodeValue.toInt).getOrElse(-1) == tagNumber) { + return true + } + } + } + + val children = node.getChildNodes + if (children != null) { + for (i <- 0 until children.getLength) { + if (findTiffFieldByNumber(children.item(i), tagNumber)) return true + } + } + false + } + + /** + * Extract the TIFF Photometric Interpretation tag value (tag 262). Returns -1 if not found. + */ + private def extractPhotometricInterpretation(reader: GeoTiffReader): Int = { + try { + val metadata = reader.getMetadata + if (metadata == null) return -1 + val rootNode = metadata.getRootNode + if (rootNode == null) return -1 + val value = findTiffFieldValue(rootNode, 262) // PhotometricInterpretation + if (value != null) value.toInt else -1 + } catch { + case _: Exception => -1 + } + } + + /** + * Find the value of a TIFF field by tag number in the IIO metadata tree. Looks for TIFFField + * elements with matching "number" attribute and extracts the value. 
+ */ + private def findTiffFieldValue(node: org.w3c.dom.Node, tagNumber: Int): String = { if (node == null) return null - val attrs = node.getAttributes - if (attrs != null) { - val nameAttr = attrs.getNamedItem("name") - val valueAttr = attrs.getNamedItem("value") - if (nameAttr != null && valueAttr != null && - nameAttr.getNodeValue.equalsIgnoreCase("Compression")) { - return valueAttr.getNodeValue + if (node.getNodeName == "TIFFField") { + val attrs = node.getAttributes + if (attrs != null) { + val numAttr = attrs.getNamedItem("number") + if (numAttr != null && Try(numAttr.getNodeValue.toInt).getOrElse(-1) == tagNumber) { + return extractValueFromTiffField(node) + } } } val children = node.getChildNodes if (children != null) { for (i <- 0 until children.getLength) { - val result = findCompressionInNode(children.item(i)) + val result = findTiffFieldValue(children.item(i), tagNumber) if (result != null) return result } } null } + + /** + * Extract the value from a TIFFField node. Handles TIFFShorts, TIFFLongs, TIFFAscii, etc. + * Returns the "value" attribute from the first leaf element (e.g., TIFFShort, TIFFLong). + */ + private def extractValueFromTiffField(fieldNode: org.w3c.dom.Node): String = { + val children = fieldNode.getChildNodes + if (children == null) return null + for (i <- 0 until children.getLength) { + val child = children.item(i) + val grandchildren = child.getChildNodes + if (grandchildren != null) { + for (j <- 0 until grandchildren.getLength) { + val gc = grandchildren.item(j) + val attrs = gc.getAttributes + if (attrs != null) { + val valueAttr = attrs.getNamedItem("value") + if (valueAttr != null) return valueAttr.getNodeValue + } + } + } + } + null + } + + /** + * Find the human-readable "description" attribute of a TIFF field by tag number. For example, + * tag 259 (Compression) has description="LZW" on the TIFFShort element. 
+ */ + private def findTiffFieldDescription(node: org.w3c.dom.Node, tagNumber: Int): String = { + if (node == null) return null + + if (node.getNodeName == "TIFFField") { + val attrs = node.getAttributes + if (attrs != null) { + val numAttr = attrs.getNamedItem("number") + if (numAttr != null && Try(numAttr.getNodeValue.toInt).getOrElse(-1) == tagNumber) { + return extractDescriptionFromTiffField(node) + } + } + } + + val children = node.getChildNodes + if (children != null) { + for (i <- 0 until children.getLength) { + val result = findTiffFieldDescription(children.item(i), tagNumber) + if (result != null) return result + } + } + null + } + + private def extractDescriptionFromTiffField(fieldNode: org.w3c.dom.Node): String = { + val children = fieldNode.getChildNodes + if (children == null) return null + for (i <- 0 until children.getLength) { + val child = children.item(i) + val grandchildren = child.getChildNodes + if (grandchildren != null) { + for (j <- 0 until grandchildren.getLength) { + val gc = grandchildren.item(j) + val attrs = gc.getAttributes + if (attrs != null) { + val descAttr = attrs.getNamedItem("description") + if (descAttr != null) return descAttr.getNodeValue + } + } + } + } + null + } } diff --git a/spark/common/src/main/scala/org/apache/spark/sql/sedona_sql/io/sedonainfo/SedonaInfoTable.scala b/spark/common/src/main/scala/org/apache/spark/sql/sedona_sql/io/sedonainfo/SedonaInfoTable.scala index 34ba34764ae..2d49dfb6662 100644 --- a/spark/common/src/main/scala/org/apache/spark/sql/sedona_sql/io/sedonainfo/SedonaInfoTable.scala +++ b/spark/common/src/main/scala/org/apache/spark/sql/sedona_sql/io/sedonainfo/SedonaInfoTable.scala @@ -21,7 +21,6 @@ package org.apache.spark.sql.sedona_sql.io.sedonainfo import org.apache.hadoop.fs.FileStatus import org.apache.spark.sql.SparkSession import org.apache.spark.sql.connector.catalog.SupportsRead -import org.apache.spark.sql.connector.catalog.SupportsWrite import 
org.apache.spark.sql.connector.catalog.TableCapability import org.apache.spark.sql.connector.read.ScanBuilder import org.apache.spark.sql.connector.write.LogicalWriteInfo @@ -41,8 +40,7 @@ case class SedonaInfoTable( userSpecifiedSchema: Option[StructType], fallbackFileFormat: Class[_ <: FileFormat]) extends FileTable(sparkSession, options, paths, userSpecifiedSchema) - with SupportsRead - with SupportsWrite { + with SupportsRead { override def inferSchema(files: Seq[FileStatus]): Option[StructType] = Some(SedonaInfoTable.SCHEMA) @@ -56,7 +54,8 @@ case class SedonaInfoTable( SedonaInfoScanBuilder(sparkSession, fileIndex, schema, dataSchema, options) } - override def newWriteBuilder(info: LogicalWriteInfo): WriteBuilder = null + override def newWriteBuilder(info: LogicalWriteInfo): WriteBuilder = + throw new UnsupportedOperationException("SedonaInfo is a read-only data source") } object SedonaInfoTable { diff --git a/spark/common/src/test/scala/org/apache/sedona/sql/geotiffMetadataTest.scala b/spark/common/src/test/scala/org/apache/sedona/sql/geotiffMetadataTest.scala index 900ebd0e8c0..cb986d8c95c 100644 --- a/spark/common/src/test/scala/org/apache/sedona/sql/geotiffMetadataTest.scala +++ b/spark/common/src/test/scala/org/apache/sedona/sql/geotiffMetadataTest.scala @@ -52,6 +52,7 @@ class geotiffMetadataTest extends TestBaseScala with BeforeAndAfterAll { assertEquals(1, row.getAs[Int]("numBands")) assertEquals(3857, row.getAs[Int]("srid")) assert(row.getAs[String]("crs").contains("EPSG")) + // test1.tiff has TileWidth/TileLength TIFF tags (internally tiled) assertEquals(true, row.getAs[Boolean]("isTiled")) } @@ -108,7 +109,7 @@ class geotiffMetadataTest extends TestBaseScala with BeforeAndAfterAll { .first() assertEquals(1, row.getAs[Int]("band")) assertEquals("UNSIGNED_8BITS", row.getAs[String]("dataType")) - assertEquals("GRAY_INDEX", row.getAs[String]("colorInterpretation")) + assertEquals("Gray", row.getAs[String]("colorInterpretation")) 
assert(row.isNullAt(row.fieldIndex("noDataValue"))) assertEquals(256, row.getAs[Int]("blockWidth")) assertEquals(256, row.getAs[Int]("blockHeight")) From 32476c099312fed3e9fe6984ab55eb4193c26002 Mon Sep 17 00:00:00 2001 From: Jia Yu Date: Fri, 3 Apr 2026 12:39:01 -0700 Subject: [PATCH 06/10] [GH-2824] Extract GeoTransformMetadata and CornerCoordinatesMetadata case classes Make RasterFileMetadata consistent: all nested structures (bands, overviews, geoTransform, cornerCoordinates) use dedicated case classes. --- .../sedonainfo/GeoTiffMetadataExtractor.scala | 22 ++++++++++--------- .../RasterFileMetadataExtractor.scala | 22 ++++++++++--------- .../SedonaInfoPartitionReader.scala | 16 +++++--------- 3 files changed, 30 insertions(+), 30 deletions(-) diff --git a/spark/common/src/main/scala/org/apache/spark/sql/sedona_sql/io/sedonainfo/GeoTiffMetadataExtractor.scala b/spark/common/src/main/scala/org/apache/spark/sql/sedona_sql/io/sedonainfo/GeoTiffMetadataExtractor.scala index 66e591f1a13..2edf6e2b584 100644 --- a/spark/common/src/main/scala/org/apache/spark/sql/sedona_sql/io/sedonainfo/GeoTiffMetadataExtractor.scala +++ b/spark/common/src/main/scala/org/apache/spark/sql/sedona_sql/io/sedonainfo/GeoTiffMetadataExtractor.scala @@ -96,16 +96,18 @@ object GeoTiffMetadataExtractor extends RasterFileMetadataExtractor { numBands = numBands, srid = srid, crs = crsStr, - upperLeftX = affine.getTranslateX, - upperLeftY = affine.getTranslateY, - scaleX = affine.getScaleX, - scaleY = affine.getScaleY, - skewX = affine.getShearX, - skewY = affine.getShearY, - envelopeMinX = env.getMinX, - envelopeMinY = env.getMinY, - envelopeMaxX = env.getMaxX, - envelopeMaxY = env.getMaxY, + geoTransform = GeoTransformMetadata( + upperLeftX = affine.getTranslateX, + upperLeftY = affine.getTranslateY, + scaleX = affine.getScaleX, + scaleY = affine.getScaleY, + skewX = affine.getShearX, + skewY = affine.getShearY), + cornerCoordinates = CornerCoordinatesMetadata( + minX = env.getMinX, + minY = 
env.getMinY, + maxX = env.getMaxX, + maxY = env.getMaxY), bands = bands, overviews = overviews, metadata = tiffMetadata, diff --git a/spark/common/src/main/scala/org/apache/spark/sql/sedona_sql/io/sedonainfo/RasterFileMetadataExtractor.scala b/spark/common/src/main/scala/org/apache/spark/sql/sedona_sql/io/sedonainfo/RasterFileMetadataExtractor.scala index 9d4182ef485..6319e351447 100644 --- a/spark/common/src/main/scala/org/apache/spark/sql/sedona_sql/io/sedonainfo/RasterFileMetadataExtractor.scala +++ b/spark/common/src/main/scala/org/apache/spark/sql/sedona_sql/io/sedonainfo/RasterFileMetadataExtractor.scala @@ -52,22 +52,24 @@ case class RasterFileMetadata( numBands: Int, srid: Int, crs: String, - upperLeftX: Double, - upperLeftY: Double, - scaleX: Double, - scaleY: Double, - skewX: Double, - skewY: Double, - envelopeMinX: Double, - envelopeMinY: Double, - envelopeMaxX: Double, - envelopeMaxY: Double, + geoTransform: GeoTransformMetadata, + cornerCoordinates: CornerCoordinatesMetadata, bands: Seq[BandMetadata], overviews: Seq[OverviewMetadata], metadata: Map[String, String], isTiled: Boolean, compression: String) +case class GeoTransformMetadata( + upperLeftX: Double, + upperLeftY: Double, + scaleX: Double, + scaleY: Double, + skewX: Double, + skewY: Double) + +case class CornerCoordinatesMetadata(minX: Double, minY: Double, maxX: Double, maxY: Double) + case class BandMetadata( band: Int, dataType: String, diff --git a/spark/common/src/main/scala/org/apache/spark/sql/sedona_sql/io/sedonainfo/SedonaInfoPartitionReader.scala b/spark/common/src/main/scala/org/apache/spark/sql/sedona_sql/io/sedonainfo/SedonaInfoPartitionReader.scala index 5c3e4c3d499..151f490f3dc 100644 --- a/spark/common/src/main/scala/org/apache/spark/sql/sedona_sql/io/sedonainfo/SedonaInfoPartitionReader.scala +++ b/spark/common/src/main/scala/org/apache/spark/sql/sedona_sql/io/sedonainfo/SedonaInfoPartitionReader.scala @@ -81,17 +81,13 @@ object SedonaInfoPartitionReader { } def 
toInternalRow(meta: RasterFileMetadata, readDataSchema: StructType): InternalRow = { + val gt = meta.geoTransform val geoTransformRow = new GenericInternalRow( - Array[Any]( - meta.upperLeftX, - meta.upperLeftY, - meta.scaleX, - meta.scaleY, - meta.skewX, - meta.skewY)) - - val cornerCoordinatesRow = new GenericInternalRow( - Array[Any](meta.envelopeMinX, meta.envelopeMinY, meta.envelopeMaxX, meta.envelopeMaxY)) + Array[Any](gt.upperLeftX, gt.upperLeftY, gt.scaleX, gt.scaleY, gt.skewX, gt.skewY)) + + val cc = meta.cornerCoordinates + val cornerCoordinatesRow = + new GenericInternalRow(Array[Any](cc.minX, cc.minY, cc.maxX, cc.maxY)) lazy val bandsArray: ArrayData = { val bands = meta.bands.map { b => From 16d5feed5f73054443dcbad8d323da6793ef13c0 Mon Sep 17 00:00:00 2001 From: Jia Yu Date: Sat, 4 Apr 2026 00:23:57 -0700 Subject: [PATCH 07/10] [GH-2824] Add NetCDF metadata extraction support to sedonainfo - Add NetCdfMetadataExtractor implementing RasterFileMetadataExtractor - Opens NetCDF files via UCAR cdm-core, extracts metadata without reading data arrays (only lat/lon coordinate arrays for spatial info) - Maps data variables to bands (numBands = number of record variables) - Reports dimensions and variables in metadata map - Supports .nc/.nc4/.netcdf extensions - Update glob patterns in SedonaInfoDataSource to include NetCDF files - Add 7 exact-match tests using test.nc (O3/NO2 variables, 80x48 grid) --- .../sedonainfo/NetCdfMetadataExtractor.scala | 235 ++++++++++++++++++ .../io/sedonainfo/SedonaInfoDataSource.scala | 4 +- .../SedonaInfoPartitionReader.scala | 3 +- .../sedona/sql/geotiffMetadataTest.scala | 137 +++++++++- 4 files changed, 374 insertions(+), 5 deletions(-) create mode 100644 spark/common/src/main/scala/org/apache/spark/sql/sedona_sql/io/sedonainfo/NetCdfMetadataExtractor.scala diff --git a/spark/common/src/main/scala/org/apache/spark/sql/sedona_sql/io/sedonainfo/NetCdfMetadataExtractor.scala 
b/spark/common/src/main/scala/org/apache/spark/sql/sedona_sql/io/sedonainfo/NetCdfMetadataExtractor.scala new file mode 100644 index 00000000000..b65106f939a --- /dev/null +++ b/spark/common/src/main/scala/org/apache/spark/sql/sedona_sql/io/sedonainfo/NetCdfMetadataExtractor.scala @@ -0,0 +1,235 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, + * software distributed under the License is distributed on an + * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY + * KIND, either express or implied. See the License for the + * specific language governing permissions and limitations + * under the License. + */ +package org.apache.spark.sql.sedona_sql.io.sedonainfo + +import org.apache.hadoop.conf.Configuration +import org.apache.hadoop.fs.Path +import ucar.nc2.NetcdfFiles + +import scala.collection.JavaConverters._ +import scala.util.Try + +/** + * Extracts metadata from NetCDF files without reading data arrays. Only coordinate variable + * arrays (lat/lon) are read to compute spatial extent. 
+ */ +object NetCdfMetadataExtractor extends RasterFileMetadataExtractor { + + override def driver: String = "NetCDF" + + override def canHandle(path: Path): Boolean = { + val name = path.getName.toLowerCase + name.endsWith(".nc") || name.endsWith(".nc4") || name.endsWith(".netcdf") + } + + override def extract( + path: Path, + fileSize: Long, + configuration: Configuration): RasterFileMetadata = { + // Read file bytes via Hadoop FS, then open in memory + val fs = path.getFileSystem(configuration) + val status = fs.getFileStatus(path) + val stream = fs.open(path) + val bytes = + try { + val buf = new Array[Byte](status.getLen.toInt) + org.apache.commons.io.IOUtils.readFully(stream, buf) + buf + } finally { + stream.close() + } + + val ncFile = NetcdfFiles.openInMemory("", bytes) + try { + extractFromNetcdf(path.toString, fileSize, ncFile) + } finally { + ncFile.close() + } + } + + private def extractFromNetcdf( + filePath: String, + fileSize: Long, + ncFile: ucar.nc2.NetcdfFile): RasterFileMetadata = { + + // Find record variables (variables with >= 2 dimensions) + val allVars = ncFile.getVariables.asScala.toSeq + val recordVars = allVars.filter(_.getDimensions.size() >= 2) + + // Find lat/lon coordinate variables from the first record variable + val (width, height, geoTransform, cornerCoords) = if (recordVars.nonEmpty) { + extractSpatialInfo(ncFile, recordVars.head) + } else { + (0, 0, GeoTransformMetadata(0, 0, 0, 0, 0, 0), CornerCoordinatesMetadata(0, 0, 0, 0)) + } + + // Each record variable is one band + val bands = recordVars.zipWithIndex.map { case (v, idx) => + val noData = Try { + val attr = v.findAttribute("missing_value") + if (attr != null) attr.getNumericValue.doubleValue() else null + }.getOrElse(null).asInstanceOf[java.lang.Double] + + val unit = Try { + val attr = v.findAttribute("units") + if (attr != null) attr.getStringValue else null + }.getOrElse(null) + + val longName = Try { + val attr = v.findAttribute("long_name") + if (attr != null) 
attr.getStringValue else null + }.getOrElse(null) + + val dims = + v.getDimensions.asScala.map(d => s"${d.getShortName}=${d.getLength}").mkString(",") + + BandMetadata( + band = idx + 1, + dataType = v.getDataType.toString, + colorInterpretation = "Undefined", + noDataValue = noData, + blockWidth = width, + blockHeight = height, + description = s"${v.getShortName}($dims)", + unit = unit) + } + + // Global attributes as metadata map + val globalAttrs = ncFile.getGlobalAttributes.asScala.flatMap { attr => + Try { + val value = + if (attr.isString) attr.getStringValue + else if (attr.getNumericValue != null) attr.getNumericValue.toString + else null + if (value != null) Some(attr.getShortName -> value) else None + }.getOrElse(None) + }.toMap + + // Add dimension info to metadata + val dimInfo = ncFile.getDimensions.asScala + .map(d => s"${d.getShortName}=${d.getLength}") + .mkString(",") + val metadata = globalAttrs + ("dimensions" -> dimInfo) + + // Add variable list to metadata + val varList = recordVars.map(_.getShortName).mkString(",") + val metadataWithVars = metadata + ("variables" -> varList) + + // CRS: check for crs_wkt or spatial_ref global attribute + val crs = Try { + val crsAttr = ncFile.findGlobalAttribute("crs_wkt") + if (crsAttr != null) crsAttr.getStringValue + else { + val spatialRef = ncFile.findGlobalAttribute("spatial_ref") + if (spatialRef != null) spatialRef.getStringValue else null + } + }.getOrElse(null) + + RasterFileMetadata( + path = filePath, + driver = driver, + fileSize = fileSize, + width = width, + height = height, + numBands = recordVars.size, + srid = 0, + crs = crs, + geoTransform = geoTransform, + cornerCoordinates = cornerCoords, + bands = bands, + overviews = Seq.empty, + metadata = metadataWithVars, + isTiled = false, + compression = null) + } + + /** + * Extract spatial extent from coordinate variables of a record variable. Assumes last 2 + * dimensions are Y and X (same convention as NetCdfReader). 
+ */ + private def extractSpatialInfo(ncFile: ucar.nc2.NetcdfFile, recordVar: ucar.nc2.Variable) + : (Int, Int, GeoTransformMetadata, CornerCoordinatesMetadata) = { + val dims = recordVar.getDimensions.asScala.toSeq + val numDims = dims.size + val latDimName = dims(numDims - 2).getShortName + val lonDimName = dims(numDims - 1).getShortName + + val lonVar = findVariable(ncFile, lonDimName) + val latVar = findVariable(ncFile, latDimName) + + if (lonVar == null || latVar == null) { + return (0, 0, GeoTransformMetadata(0, 0, 0, 0, 0, 0), CornerCoordinatesMetadata(0, 0, 0, 0)) + } + + val lonData = lonVar.read() + val latData = latVar.read() + val width = lonData.getShape()(0) + val height = latData.getShape()(0) + + val lonFirst = lonData.getDouble(0) + val lonLast = lonData.getDouble(width - 1) + val latFirst = latData.getDouble(0) + val latLast = latData.getDouble(height - 1) + + val lonIncreasing = lonFirst < lonLast + val latIncreasing = latFirst < latLast + + val minX = if (lonIncreasing) lonFirst else lonLast + val maxX = if (lonIncreasing) lonLast else lonFirst + val minY = if (latIncreasing) latFirst else latLast + val maxY = if (latIncreasing) latLast else latFirst + + val scaleX = Math.abs(lonLast - lonFirst) / (width - 1) + val scaleY = -(Math.abs(latLast - latFirst) / (height - 1)) + + val geoTransform = GeoTransformMetadata( + upperLeftX = minX, + upperLeftY = maxY, + scaleX = scaleX, + scaleY = scaleY, + skewX = 0.0, + skewY = 0.0) + + // Envelope covers the full extent including half-pixel borders + val halfPixelX = scaleX / 2 + val halfPixelY = Math.abs(scaleY) / 2 + val cornerCoords = CornerCoordinatesMetadata( + minX = minX - halfPixelX, + minY = minY - halfPixelY, + maxX = maxX + halfPixelX, + maxY = maxY + halfPixelY) + + (width, height, geoTransform, cornerCoords) + } + + private def findVariable(ncFile: ucar.nc2.NetcdfFile, name: String): ucar.nc2.Variable = { + // Search recursively through groups + findVariableInGroup(name, ncFile.getRootGroup) 
+ } + + private def findVariableInGroup(name: String, group: ucar.nc2.Group): ucar.nc2.Variable = { + val v = group.findVariableLocal(name) + if (v != null) return v + for (g <- group.getGroups.asScala) { + val found = findVariableInGroup(name, g) + if (found != null) return found + } + null + } +} diff --git a/spark/common/src/main/scala/org/apache/spark/sql/sedona_sql/io/sedonainfo/SedonaInfoDataSource.scala b/spark/common/src/main/scala/org/apache/spark/sql/sedona_sql/io/sedonainfo/SedonaInfoDataSource.scala index d8db5322836..32bc4452713 100644 --- a/spark/common/src/main/scala/org/apache/spark/sql/sedona_sql/io/sedonainfo/SedonaInfoDataSource.scala +++ b/spark/common/src/main/scala/org/apache/spark/sql/sedona_sql/io/sedonainfo/SedonaInfoDataSource.scala @@ -51,11 +51,11 @@ class SedonaInfoDataSource extends FileDataSourceV2 with TableProvider with Data new java.util.HashMap[String, String](optionsWithoutPaths.asCaseSensitiveMap()) newOptions.put("recursiveFileLookup", "true") if (!newOptions.containsKey("pathGlobFilter")) { - newOptions.put("pathGlobFilter", "*.{tif,tiff,TIF,TIFF}") + newOptions.put("pathGlobFilter", "*.{tif,tiff,TIF,TIFF,nc,nc4,NC,NC4}") } optionsWithoutPaths = new CaseInsensitiveStringMap(newOptions) } else { - val loadTifPattern = "(.*)/([^/]*\\*[^/]*\\.(?i:tif|tiff))$".r + val loadTifPattern = "(.*)/([^/]*\\*[^/]*\\.(?i:tif|tiff|nc|nc4))$".r paths.head match { case loadTifPattern(prefix, glob) => paths = Seq(prefix) diff --git a/spark/common/src/main/scala/org/apache/spark/sql/sedona_sql/io/sedonainfo/SedonaInfoPartitionReader.scala b/spark/common/src/main/scala/org/apache/spark/sql/sedona_sql/io/sedonainfo/SedonaInfoPartitionReader.scala index 151f490f3dc..68a59685ef8 100644 --- a/spark/common/src/main/scala/org/apache/spark/sql/sedona_sql/io/sedonainfo/SedonaInfoPartitionReader.scala +++ b/spark/common/src/main/scala/org/apache/spark/sql/sedona_sql/io/sedonainfo/SedonaInfoPartitionReader.scala @@ -69,7 +69,8 @@ class 
SedonaInfoPartitionReader( object SedonaInfoPartitionReader { /** Registered metadata extractors. Add new format extractors here. */ - private val extractors: Seq[RasterFileMetadataExtractor] = Seq(GeoTiffMetadataExtractor) + private val extractors: Seq[RasterFileMetadataExtractor] = + Seq(GeoTiffMetadataExtractor, NetCdfMetadataExtractor) def findExtractor(path: Path): RasterFileMetadataExtractor = { extractors diff --git a/spark/common/src/test/scala/org/apache/sedona/sql/geotiffMetadataTest.scala b/spark/common/src/test/scala/org/apache/sedona/sql/geotiffMetadataTest.scala index cb986d8c95c..e5881b368d1 100644 --- a/spark/common/src/test/scala/org/apache/sedona/sql/geotiffMetadataTest.scala +++ b/spark/common/src/test/scala/org/apache/sedona/sql/geotiffMetadataTest.scala @@ -153,8 +153,8 @@ class geotiffMetadataTest extends TestBaseScala with BeforeAndAfterAll { it("should read files from directory with trailing slash") { val df = sparkSession.read.format("sedonainfo").load(rasterDir) - // Recursive lookup finds all .tif/.tiff files including subdirectories - assertEquals(9L, df.count()) + // Recursive lookup finds all .tif/.tiff/.nc files including subdirectories + assertEquals(10L, df.count()) } it("should support LIMIT pushdown") { @@ -219,4 +219,137 @@ class geotiffMetadataTest extends TestBaseScala with BeforeAndAfterAll { assertEquals(256, bandRow.getAs[Int]("blockHeight")) } } + + describe("SedonaInfo NetCDF support") { + val netcdfFile: String = resourceFolder + "raster/netcdf/test.nc" + + it("should read test.nc with exact metadata values") { + val df = sparkSession.read.format("sedonainfo").load(netcdfFile) + assertEquals(1L, df.count()) + + val row = df.first() + assert(row.getAs[String]("path").endsWith("test.nc")) + assertEquals("NetCDF", row.getAs[String]("driver")) + assertEquals(80, row.getAs[Int]("width")) + assertEquals(48, row.getAs[Int]("height")) + // O3 and NO2 are the two data variables + assertEquals(2, row.getAs[Int]("numBands")) + 
assertEquals(0, row.getAs[Int]("srid")) + assert(row.isNullAt(row.fieldIndex("crs"))) + assertEquals(false, row.getAs[Boolean]("isTiled")) + assert(row.isNullAt(row.fieldIndex("compression"))) + } + + it("should return exact geoTransform for test.nc") { + // Values match RS_Metadata output for RS_FromNetCDF(content, 'O3') + val row = sparkSession.read + .format("sedonainfo") + .load(netcdfFile) + .selectExpr( + "geoTransform.upperLeftX", + "geoTransform.upperLeftY", + "geoTransform.scaleX", + "geoTransform.scaleY", + "geoTransform.skewX", + "geoTransform.skewY") + .first() + // Cell-center coordinates: lon=[5.0..14.875], lat=[50.875..44.9375] (decreasing) + assertEquals(5.0, row.getAs[Double]("upperLeftX"), 1e-6) + assertEquals(50.875, row.getAs[Double]("upperLeftY"), 1e-6) + assertEquals(0.125, row.getAs[Double]("scaleX"), 1e-6) + assertEquals(-0.125, row.getAs[Double]("scaleY"), 1e-6) + assertEquals(0.0, row.getAs[Double]("skewX"), 1e-15) + assertEquals(0.0, row.getAs[Double]("skewY"), 1e-15) + } + + it("should return exact cornerCoordinates for test.nc") { + // Envelope includes half-pixel borders around cell centers + val row = sparkSession.read + .format("sedonainfo") + .load(netcdfFile) + .selectExpr( + "cornerCoordinates.minX", + "cornerCoordinates.minY", + "cornerCoordinates.maxX", + "cornerCoordinates.maxY") + .first() + assertEquals(4.9375, row.getAs[Double]("minX"), 1e-6) // 5.0 - 0.0625 + assertEquals(44.9375, row.getAs[Double]("minY"), 1e-6) // 45.0 - 0.0625 + assertEquals(14.9375, row.getAs[Double]("maxX"), 1e-6) // 14.875 + 0.0625 + assertEquals(50.9375, row.getAs[Double]("maxY"), 1e-6) // 50.875 + 0.0625 + } + + it("should return exact band metadata for test.nc") { + val rows = sparkSession.read + .format("sedonainfo") + .load(netcdfFile) + .selectExpr("explode(bands) as b") + .selectExpr( + "b.band", + "b.dataType", + "b.colorInterpretation", + "b.noDataValue", + "b.blockWidth", + "b.blockHeight", + "b.description", + "b.unit") + .collect() + 
assertEquals(2, rows.length) + + // Band 1: O3 + assertEquals(1, rows(0).getAs[Int]("band")) + assertEquals("float", rows(0).getAs[String]("dataType")) + assertEquals("Undefined", rows(0).getAs[String]("colorInterpretation")) + assert(rows(0).getAs[String]("description").startsWith("O3(")) + assertEquals(80, rows(0).getAs[Int]("blockWidth")) + assertEquals(48, rows(0).getAs[Int]("blockHeight")) + + // Band 2: NO2 + assertEquals(2, rows(1).getAs[Int]("band")) + assert(rows(1).getAs[String]("description").startsWith("NO2(")) + } + + it("should return empty overviews for test.nc") { + val row = sparkSession.read + .format("sedonainfo") + .load(netcdfFile) + .selectExpr("size(overviews) as overviewCount") + .first() + assertEquals(0, row.getAs[Int]("overviewCount")) + } + + it("should include dimensions and variables in metadata map") { + val row = sparkSession.read.format("sedonainfo").load(netcdfFile).first() + val meta = row.getAs[Map[String, String]]("metadata") + assert(meta != null) + assert(meta.contains("dimensions")) + assert(meta("dimensions").contains("lat=48")) + assert(meta("dimensions").contains("lon=80")) + assert(meta.contains("variables")) + assert(meta("variables").contains("O3")) + assert(meta("variables").contains("NO2")) + } + + it("should cross-validate spatial extent against RS_FromNetCDF") { + // Load via RS_FromNetCDF and extract metadata + val rasterRow = sparkSession.read + .format("binaryFile") + .load(netcdfFile) + .selectExpr("RS_FromNetCDF(content, 'O3') as raster") + .selectExpr( + "RS_Width(raster) as width", + "RS_Height(raster) as height", + "RS_NumBands(raster) as numBands") + .first() + + // Load via sedonainfo + val metaRow = sparkSession.read.format("sedonainfo").load(netcdfFile).first() + assertEquals(metaRow.getAs[Int]("width"), rasterRow.getAs[Int]("width")) + assertEquals(metaRow.getAs[Int]("height"), rasterRow.getAs[Int]("height")) + // RS_FromNetCDF for O3 returns 4 bands (time=2 * z=2), sedonainfo returns 2 (O3, NO2) + // 
These represent different things: sedonainfo counts data variables, not flattened bands + assertEquals(2, metaRow.getAs[Int]("numBands")) + assertEquals(4, rasterRow.getAs[Int]("numBands")) + } + } } From 6539d1635b3b02e42a7504841d7f2de011ae8af2 Mon Sep 17 00:00:00 2001 From: Jia Yu Date: Tue, 7 Apr 2026 03:36:23 -0700 Subject: [PATCH 08/10] [GH-2824] Add schema-aware column pruning to sedonainfo extractors Pass requiredFields from Spark's readDataSchema to extractors so they can skip expensive work (bands, overviews, metadata, compression, CRS WKT) when those columns are not selected in the query. --- .../sedonainfo/GeoTiffMetadataExtractor.scala | 35 ++++++++++++------- .../sedonainfo/NetCdfMetadataExtractor.scala | 3 +- .../RasterFileMetadataExtractor.scala | 10 +++++- .../SedonaInfoPartitionReader.scala | 3 +- 4 files changed, 36 insertions(+), 15 deletions(-) diff --git a/spark/common/src/main/scala/org/apache/spark/sql/sedona_sql/io/sedonainfo/GeoTiffMetadataExtractor.scala b/spark/common/src/main/scala/org/apache/spark/sql/sedona_sql/io/sedonainfo/GeoTiffMetadataExtractor.scala index 2edf6e2b584..5eee0850232 100644 --- a/spark/common/src/main/scala/org/apache/spark/sql/sedona_sql/io/sedonainfo/GeoTiffMetadataExtractor.scala +++ b/spark/common/src/main/scala/org/apache/spark/sql/sedona_sql/io/sedonainfo/GeoTiffMetadataExtractor.scala @@ -47,7 +47,8 @@ object GeoTiffMetadataExtractor extends RasterFileMetadataExtractor { override def extract( path: Path, fileSize: Long, - configuration: Configuration): RasterFileMetadata = { + configuration: Configuration, + requiredFields: Set[String] = Set.empty): RasterFileMetadata = { val imageStream = new HadoopImageInputStream(path, configuration) var reader: GeoTiffReader = null var raster: GridCoverage2D = null @@ -58,11 +59,16 @@ object GeoTiffMetadataExtractor extends RasterFileMetadataExtractor { org.geotools.util.factory.Hints.FORCE_LONGITUDE_FIRST_AXIS_ORDER, java.lang.Boolean.TRUE)) + val needAll = 
requiredFields.isEmpty + def need(field: String): Boolean = needAll || requiredFields.contains(field) + // Extract TIFF IIO metadata BEFORE read() which may alter stream state - val isTiled = hasTiffTag(reader, TAG_TILE_WIDTH) - val photometric = extractPhotometricInterpretation(reader) - val tiffMetadata = extractMetadata(reader) - val compression = extractCompression(reader) + val isTiled = if (need("isTiled")) hasTiffTag(reader, TAG_TILE_WIDTH) else false + val photometric = + if (need("bands")) extractPhotometricInterpretation(reader) else -1 + val tiffMetadata = + if (need("metadata")) extractMetadata(reader) else Map.empty[String, String] + val compression = if (need("compression")) extractCompression(reader) else null raster = reader.read(null) @@ -71,11 +77,13 @@ object GeoTiffMetadataExtractor extends RasterFileMetadataExtractor { val numBands = RasterAccessors.numBands(raster) val srid = RasterAccessors.srid(raster) - val crsStr = Try { - val crs = raster.getCoordinateReferenceSystem - if (crs == null || crs.isInstanceOf[DefaultEngineeringCRS]) null - else crs.toWKT - }.getOrElse(null) + val crsStr = if (need("crs")) { + Try { + val crs = raster.getCoordinateReferenceSystem + if (crs == null || crs.isInstanceOf[DefaultEngineeringCRS]) null + else crs.toWKT + }.getOrElse(null) + } else null val affine = RasterUtils.getGDALAffineTransform(raster) val env = raster.getEnvelope2D @@ -84,8 +92,11 @@ object GeoTiffMetadataExtractor extends RasterFileMetadataExtractor { val tileWidth = image.getTileWidth val tileHeight = image.getTileHeight - val bands = extractBands(raster, numBands, tileWidth, tileHeight, photometric) - val overviews = extractOverviews(reader, width, height) + val bands = + if (need("bands")) extractBands(raster, numBands, tileWidth, tileHeight, photometric) + else Seq.empty + val overviews = + if (need("overviews")) extractOverviews(reader, width, height) else Seq.empty RasterFileMetadata( path = path.toString, diff --git 
a/spark/common/src/main/scala/org/apache/spark/sql/sedona_sql/io/sedonainfo/NetCdfMetadataExtractor.scala b/spark/common/src/main/scala/org/apache/spark/sql/sedona_sql/io/sedonainfo/NetCdfMetadataExtractor.scala index b65106f939a..e27618b07b6 100644 --- a/spark/common/src/main/scala/org/apache/spark/sql/sedona_sql/io/sedonainfo/NetCdfMetadataExtractor.scala +++ b/spark/common/src/main/scala/org/apache/spark/sql/sedona_sql/io/sedonainfo/NetCdfMetadataExtractor.scala @@ -41,7 +41,8 @@ object NetCdfMetadataExtractor extends RasterFileMetadataExtractor { override def extract( path: Path, fileSize: Long, - configuration: Configuration): RasterFileMetadata = { + configuration: Configuration, + requiredFields: Set[String] = Set.empty): RasterFileMetadata = { // Read file bytes via Hadoop FS, then open in memory val fs = path.getFileSystem(configuration) val status = fs.getFileStatus(path) diff --git a/spark/common/src/main/scala/org/apache/spark/sql/sedona_sql/io/sedonainfo/RasterFileMetadataExtractor.scala b/spark/common/src/main/scala/org/apache/spark/sql/sedona_sql/io/sedonainfo/RasterFileMetadataExtractor.scala index 6319e351447..a9a0ae2cb9f 100644 --- a/spark/common/src/main/scala/org/apache/spark/sql/sedona_sql/io/sedonainfo/RasterFileMetadataExtractor.scala +++ b/spark/common/src/main/scala/org/apache/spark/sql/sedona_sql/io/sedonainfo/RasterFileMetadataExtractor.scala @@ -33,8 +33,16 @@ trait RasterFileMetadataExtractor { /** * Extract metadata from the file at the given path. Implementations must not decode pixel data * — only headers/metadata should be read. + * + * @param requiredFields + * Column names requested by Spark's column pruning. Extractors may skip expensive work for + * fields not in this set. When empty, all fields are extracted. 
*/ - def extract(path: Path, fileSize: Long, configuration: Configuration): RasterFileMetadata + def extract( + path: Path, + fileSize: Long, + configuration: Configuration, + requiredFields: Set[String] = Set.empty): RasterFileMetadata /** Returns true if this extractor can handle the given file path (by extension). */ def canHandle(path: Path): Boolean diff --git a/spark/common/src/main/scala/org/apache/spark/sql/sedona_sql/io/sedonainfo/SedonaInfoPartitionReader.scala b/spark/common/src/main/scala/org/apache/spark/sql/sedona_sql/io/sedonainfo/SedonaInfoPartitionReader.scala index 68a59685ef8..7588623a549 100644 --- a/spark/common/src/main/scala/org/apache/spark/sql/sedona_sql/io/sedonainfo/SedonaInfoPartitionReader.scala +++ b/spark/common/src/main/scala/org/apache/spark/sql/sedona_sql/io/sedonainfo/SedonaInfoPartitionReader.scala @@ -61,7 +61,8 @@ class SedonaInfoPartitionReader( private def readFileMetadata(partition: PartitionedFile): InternalRow = { val path = new Path(new URI(partition.filePath.toString())) val extractor = SedonaInfoPartitionReader.findExtractor(path) - val meta = extractor.extract(path, partition.fileSize, configuration) + val requiredFields = readDataSchema.fieldNames.toSet + val meta = extractor.extract(path, partition.fileSize, configuration, requiredFields) SedonaInfoPartitionReader.toInternalRow(meta, readDataSchema) } } From b5d1dfb749e9954f08424d70cbe8170e273a13bc Mon Sep 17 00:00:00 2001 From: Jia Yu Date: Tue, 7 Apr 2026 04:34:29 -0700 Subject: [PATCH 09/10] [GH-2824] Remove NetCDF support to separate PR Move NetCDF metadata extraction to a follow-up PR. This PR focuses on GeoTIFF metadata extraction via the sedonainfo data source. 
--- .../sedonainfo/NetCdfMetadataExtractor.scala | 236 ------------------ .../io/sedonainfo/SedonaInfoDataSource.scala | 4 +- .../SedonaInfoPartitionReader.scala | 3 +- .../sedona/sql/geotiffMetadataTest.scala | 137 +--------- 4 files changed, 5 insertions(+), 375 deletions(-) delete mode 100644 spark/common/src/main/scala/org/apache/spark/sql/sedona_sql/io/sedonainfo/NetCdfMetadataExtractor.scala diff --git a/spark/common/src/main/scala/org/apache/spark/sql/sedona_sql/io/sedonainfo/NetCdfMetadataExtractor.scala b/spark/common/src/main/scala/org/apache/spark/sql/sedona_sql/io/sedonainfo/NetCdfMetadataExtractor.scala deleted file mode 100644 index e27618b07b6..00000000000 --- a/spark/common/src/main/scala/org/apache/spark/sql/sedona_sql/io/sedonainfo/NetCdfMetadataExtractor.scala +++ /dev/null @@ -1,236 +0,0 @@ -/* - * Licensed to the Apache Software Foundation (ASF) under one - * or more contributor license agreements. See the NOTICE file - * distributed with this work for additional information - * regarding copyright ownership. The ASF licenses this file - * to you under the Apache License, Version 2.0 (the - * "License"); you may not use this file except in compliance - * with the License. You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, - * software distributed under the License is distributed on an - * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY - * KIND, either express or implied. See the License for the - * specific language governing permissions and limitations - * under the License. - */ -package org.apache.spark.sql.sedona_sql.io.sedonainfo - -import org.apache.hadoop.conf.Configuration -import org.apache.hadoop.fs.Path -import ucar.nc2.NetcdfFiles - -import scala.collection.JavaConverters._ -import scala.util.Try - -/** - * Extracts metadata from NetCDF files without reading data arrays. 
Only coordinate variable - * arrays (lat/lon) are read to compute spatial extent. - */ -object NetCdfMetadataExtractor extends RasterFileMetadataExtractor { - - override def driver: String = "NetCDF" - - override def canHandle(path: Path): Boolean = { - val name = path.getName.toLowerCase - name.endsWith(".nc") || name.endsWith(".nc4") || name.endsWith(".netcdf") - } - - override def extract( - path: Path, - fileSize: Long, - configuration: Configuration, - requiredFields: Set[String] = Set.empty): RasterFileMetadata = { - // Read file bytes via Hadoop FS, then open in memory - val fs = path.getFileSystem(configuration) - val status = fs.getFileStatus(path) - val stream = fs.open(path) - val bytes = - try { - val buf = new Array[Byte](status.getLen.toInt) - org.apache.commons.io.IOUtils.readFully(stream, buf) - buf - } finally { - stream.close() - } - - val ncFile = NetcdfFiles.openInMemory("", bytes) - try { - extractFromNetcdf(path.toString, fileSize, ncFile) - } finally { - ncFile.close() - } - } - - private def extractFromNetcdf( - filePath: String, - fileSize: Long, - ncFile: ucar.nc2.NetcdfFile): RasterFileMetadata = { - - // Find record variables (variables with >= 2 dimensions) - val allVars = ncFile.getVariables.asScala.toSeq - val recordVars = allVars.filter(_.getDimensions.size() >= 2) - - // Find lat/lon coordinate variables from the first record variable - val (width, height, geoTransform, cornerCoords) = if (recordVars.nonEmpty) { - extractSpatialInfo(ncFile, recordVars.head) - } else { - (0, 0, GeoTransformMetadata(0, 0, 0, 0, 0, 0), CornerCoordinatesMetadata(0, 0, 0, 0)) - } - - // Each record variable is one band - val bands = recordVars.zipWithIndex.map { case (v, idx) => - val noData = Try { - val attr = v.findAttribute("missing_value") - if (attr != null) attr.getNumericValue.doubleValue() else null - }.getOrElse(null).asInstanceOf[java.lang.Double] - - val unit = Try { - val attr = v.findAttribute("units") - if (attr != null) 
attr.getStringValue else null - }.getOrElse(null) - - val longName = Try { - val attr = v.findAttribute("long_name") - if (attr != null) attr.getStringValue else null - }.getOrElse(null) - - val dims = - v.getDimensions.asScala.map(d => s"${d.getShortName}=${d.getLength}").mkString(",") - - BandMetadata( - band = idx + 1, - dataType = v.getDataType.toString, - colorInterpretation = "Undefined", - noDataValue = noData, - blockWidth = width, - blockHeight = height, - description = s"${v.getShortName}($dims)", - unit = unit) - } - - // Global attributes as metadata map - val globalAttrs = ncFile.getGlobalAttributes.asScala.flatMap { attr => - Try { - val value = - if (attr.isString) attr.getStringValue - else if (attr.getNumericValue != null) attr.getNumericValue.toString - else null - if (value != null) Some(attr.getShortName -> value) else None - }.getOrElse(None) - }.toMap - - // Add dimension info to metadata - val dimInfo = ncFile.getDimensions.asScala - .map(d => s"${d.getShortName}=${d.getLength}") - .mkString(",") - val metadata = globalAttrs + ("dimensions" -> dimInfo) - - // Add variable list to metadata - val varList = recordVars.map(_.getShortName).mkString(",") - val metadataWithVars = metadata + ("variables" -> varList) - - // CRS: check for crs_wkt or spatial_ref global attribute - val crs = Try { - val crsAttr = ncFile.findGlobalAttribute("crs_wkt") - if (crsAttr != null) crsAttr.getStringValue - else { - val spatialRef = ncFile.findGlobalAttribute("spatial_ref") - if (spatialRef != null) spatialRef.getStringValue else null - } - }.getOrElse(null) - - RasterFileMetadata( - path = filePath, - driver = driver, - fileSize = fileSize, - width = width, - height = height, - numBands = recordVars.size, - srid = 0, - crs = crs, - geoTransform = geoTransform, - cornerCoordinates = cornerCoords, - bands = bands, - overviews = Seq.empty, - metadata = metadataWithVars, - isTiled = false, - compression = null) - } - - /** - * Extract spatial extent from coordinate 
variables of a record variable. Assumes last 2 - * dimensions are Y and X (same convention as NetCdfReader). - */ - private def extractSpatialInfo(ncFile: ucar.nc2.NetcdfFile, recordVar: ucar.nc2.Variable) - : (Int, Int, GeoTransformMetadata, CornerCoordinatesMetadata) = { - val dims = recordVar.getDimensions.asScala.toSeq - val numDims = dims.size - val latDimName = dims(numDims - 2).getShortName - val lonDimName = dims(numDims - 1).getShortName - - val lonVar = findVariable(ncFile, lonDimName) - val latVar = findVariable(ncFile, latDimName) - - if (lonVar == null || latVar == null) { - return (0, 0, GeoTransformMetadata(0, 0, 0, 0, 0, 0), CornerCoordinatesMetadata(0, 0, 0, 0)) - } - - val lonData = lonVar.read() - val latData = latVar.read() - val width = lonData.getShape()(0) - val height = latData.getShape()(0) - - val lonFirst = lonData.getDouble(0) - val lonLast = lonData.getDouble(width - 1) - val latFirst = latData.getDouble(0) - val latLast = latData.getDouble(height - 1) - - val lonIncreasing = lonFirst < lonLast - val latIncreasing = latFirst < latLast - - val minX = if (lonIncreasing) lonFirst else lonLast - val maxX = if (lonIncreasing) lonLast else lonFirst - val minY = if (latIncreasing) latFirst else latLast - val maxY = if (latIncreasing) latLast else latFirst - - val scaleX = Math.abs(lonLast - lonFirst) / (width - 1) - val scaleY = -(Math.abs(latLast - latFirst) / (height - 1)) - - val geoTransform = GeoTransformMetadata( - upperLeftX = minX, - upperLeftY = maxY, - scaleX = scaleX, - scaleY = scaleY, - skewX = 0.0, - skewY = 0.0) - - // Envelope covers the full extent including half-pixel borders - val halfPixelX = scaleX / 2 - val halfPixelY = Math.abs(scaleY) / 2 - val cornerCoords = CornerCoordinatesMetadata( - minX = minX - halfPixelX, - minY = minY - halfPixelY, - maxX = maxX + halfPixelX, - maxY = maxY + halfPixelY) - - (width, height, geoTransform, cornerCoords) - } - - private def findVariable(ncFile: ucar.nc2.NetcdfFile, name: String): 
ucar.nc2.Variable = { - // Search recursively through groups - findVariableInGroup(name, ncFile.getRootGroup) - } - - private def findVariableInGroup(name: String, group: ucar.nc2.Group): ucar.nc2.Variable = { - val v = group.findVariableLocal(name) - if (v != null) return v - for (g <- group.getGroups.asScala) { - val found = findVariableInGroup(name, g) - if (found != null) return found - } - null - } -} diff --git a/spark/common/src/main/scala/org/apache/spark/sql/sedona_sql/io/sedonainfo/SedonaInfoDataSource.scala b/spark/common/src/main/scala/org/apache/spark/sql/sedona_sql/io/sedonainfo/SedonaInfoDataSource.scala index 32bc4452713..d8db5322836 100644 --- a/spark/common/src/main/scala/org/apache/spark/sql/sedona_sql/io/sedonainfo/SedonaInfoDataSource.scala +++ b/spark/common/src/main/scala/org/apache/spark/sql/sedona_sql/io/sedonainfo/SedonaInfoDataSource.scala @@ -51,11 +51,11 @@ class SedonaInfoDataSource extends FileDataSourceV2 with TableProvider with Data new java.util.HashMap[String, String](optionsWithoutPaths.asCaseSensitiveMap()) newOptions.put("recursiveFileLookup", "true") if (!newOptions.containsKey("pathGlobFilter")) { - newOptions.put("pathGlobFilter", "*.{tif,tiff,TIF,TIFF,nc,nc4,NC,NC4}") + newOptions.put("pathGlobFilter", "*.{tif,tiff,TIF,TIFF}") } optionsWithoutPaths = new CaseInsensitiveStringMap(newOptions) } else { - val loadTifPattern = "(.*)/([^/]*\\*[^/]*\\.(?i:tif|tiff|nc|nc4))$".r + val loadTifPattern = "(.*)/([^/]*\\*[^/]*\\.(?i:tif|tiff))$".r paths.head match { case loadTifPattern(prefix, glob) => paths = Seq(prefix) diff --git a/spark/common/src/main/scala/org/apache/spark/sql/sedona_sql/io/sedonainfo/SedonaInfoPartitionReader.scala b/spark/common/src/main/scala/org/apache/spark/sql/sedona_sql/io/sedonainfo/SedonaInfoPartitionReader.scala index 7588623a549..200b9c60864 100644 --- a/spark/common/src/main/scala/org/apache/spark/sql/sedona_sql/io/sedonainfo/SedonaInfoPartitionReader.scala +++ 
b/spark/common/src/main/scala/org/apache/spark/sql/sedona_sql/io/sedonainfo/SedonaInfoPartitionReader.scala @@ -70,8 +70,7 @@ class SedonaInfoPartitionReader( object SedonaInfoPartitionReader { /** Registered metadata extractors. Add new format extractors here. */ - private val extractors: Seq[RasterFileMetadataExtractor] = - Seq(GeoTiffMetadataExtractor, NetCdfMetadataExtractor) + private val extractors: Seq[RasterFileMetadataExtractor] = Seq(GeoTiffMetadataExtractor) def findExtractor(path: Path): RasterFileMetadataExtractor = { extractors diff --git a/spark/common/src/test/scala/org/apache/sedona/sql/geotiffMetadataTest.scala b/spark/common/src/test/scala/org/apache/sedona/sql/geotiffMetadataTest.scala index e5881b368d1..cb986d8c95c 100644 --- a/spark/common/src/test/scala/org/apache/sedona/sql/geotiffMetadataTest.scala +++ b/spark/common/src/test/scala/org/apache/sedona/sql/geotiffMetadataTest.scala @@ -153,8 +153,8 @@ class geotiffMetadataTest extends TestBaseScala with BeforeAndAfterAll { it("should read files from directory with trailing slash") { val df = sparkSession.read.format("sedonainfo").load(rasterDir) - // Recursive lookup finds all .tif/.tiff/.nc files including subdirectories - assertEquals(10L, df.count()) + // Recursive lookup finds all .tif/.tiff files including subdirectories + assertEquals(9L, df.count()) } it("should support LIMIT pushdown") { @@ -219,137 +219,4 @@ class geotiffMetadataTest extends TestBaseScala with BeforeAndAfterAll { assertEquals(256, bandRow.getAs[Int]("blockHeight")) } } - - describe("SedonaInfo NetCDF support") { - val netcdfFile: String = resourceFolder + "raster/netcdf/test.nc" - - it("should read test.nc with exact metadata values") { - val df = sparkSession.read.format("sedonainfo").load(netcdfFile) - assertEquals(1L, df.count()) - - val row = df.first() - assert(row.getAs[String]("path").endsWith("test.nc")) - assertEquals("NetCDF", row.getAs[String]("driver")) - assertEquals(80, row.getAs[Int]("width")) - 
assertEquals(48, row.getAs[Int]("height")) - // O3 and NO2 are the two data variables - assertEquals(2, row.getAs[Int]("numBands")) - assertEquals(0, row.getAs[Int]("srid")) - assert(row.isNullAt(row.fieldIndex("crs"))) - assertEquals(false, row.getAs[Boolean]("isTiled")) - assert(row.isNullAt(row.fieldIndex("compression"))) - } - - it("should return exact geoTransform for test.nc") { - // Values match RS_Metadata output for RS_FromNetCDF(content, 'O3') - val row = sparkSession.read - .format("sedonainfo") - .load(netcdfFile) - .selectExpr( - "geoTransform.upperLeftX", - "geoTransform.upperLeftY", - "geoTransform.scaleX", - "geoTransform.scaleY", - "geoTransform.skewX", - "geoTransform.skewY") - .first() - // Cell-center coordinates: lon=[5.0..14.875], lat=[50.875..44.9375] (decreasing) - assertEquals(5.0, row.getAs[Double]("upperLeftX"), 1e-6) - assertEquals(50.875, row.getAs[Double]("upperLeftY"), 1e-6) - assertEquals(0.125, row.getAs[Double]("scaleX"), 1e-6) - assertEquals(-0.125, row.getAs[Double]("scaleY"), 1e-6) - assertEquals(0.0, row.getAs[Double]("skewX"), 1e-15) - assertEquals(0.0, row.getAs[Double]("skewY"), 1e-15) - } - - it("should return exact cornerCoordinates for test.nc") { - // Envelope includes half-pixel borders around cell centers - val row = sparkSession.read - .format("sedonainfo") - .load(netcdfFile) - .selectExpr( - "cornerCoordinates.minX", - "cornerCoordinates.minY", - "cornerCoordinates.maxX", - "cornerCoordinates.maxY") - .first() - assertEquals(4.9375, row.getAs[Double]("minX"), 1e-6) // 5.0 - 0.0625 - assertEquals(44.9375, row.getAs[Double]("minY"), 1e-6) // 45.0 - 0.0625 - assertEquals(14.9375, row.getAs[Double]("maxX"), 1e-6) // 14.875 + 0.0625 - assertEquals(50.9375, row.getAs[Double]("maxY"), 1e-6) // 50.875 + 0.0625 - } - - it("should return exact band metadata for test.nc") { - val rows = sparkSession.read - .format("sedonainfo") - .load(netcdfFile) - .selectExpr("explode(bands) as b") - .selectExpr( - "b.band", - "b.dataType", 
- "b.colorInterpretation", - "b.noDataValue", - "b.blockWidth", - "b.blockHeight", - "b.description", - "b.unit") - .collect() - assertEquals(2, rows.length) - - // Band 1: O3 - assertEquals(1, rows(0).getAs[Int]("band")) - assertEquals("float", rows(0).getAs[String]("dataType")) - assertEquals("Undefined", rows(0).getAs[String]("colorInterpretation")) - assert(rows(0).getAs[String]("description").startsWith("O3(")) - assertEquals(80, rows(0).getAs[Int]("blockWidth")) - assertEquals(48, rows(0).getAs[Int]("blockHeight")) - - // Band 2: NO2 - assertEquals(2, rows(1).getAs[Int]("band")) - assert(rows(1).getAs[String]("description").startsWith("NO2(")) - } - - it("should return empty overviews for test.nc") { - val row = sparkSession.read - .format("sedonainfo") - .load(netcdfFile) - .selectExpr("size(overviews) as overviewCount") - .first() - assertEquals(0, row.getAs[Int]("overviewCount")) - } - - it("should include dimensions and variables in metadata map") { - val row = sparkSession.read.format("sedonainfo").load(netcdfFile).first() - val meta = row.getAs[Map[String, String]]("metadata") - assert(meta != null) - assert(meta.contains("dimensions")) - assert(meta("dimensions").contains("lat=48")) - assert(meta("dimensions").contains("lon=80")) - assert(meta.contains("variables")) - assert(meta("variables").contains("O3")) - assert(meta("variables").contains("NO2")) - } - - it("should cross-validate spatial extent against RS_FromNetCDF") { - // Load via RS_FromNetCDF and extract metadata - val rasterRow = sparkSession.read - .format("binaryFile") - .load(netcdfFile) - .selectExpr("RS_FromNetCDF(content, 'O3') as raster") - .selectExpr( - "RS_Width(raster) as width", - "RS_Height(raster) as height", - "RS_NumBands(raster) as numBands") - .first() - - // Load via sedonainfo - val metaRow = sparkSession.read.format("sedonainfo").load(netcdfFile).first() - assertEquals(metaRow.getAs[Int]("width"), rasterRow.getAs[Int]("width")) - 
assertEquals(metaRow.getAs[Int]("height"), rasterRow.getAs[Int]("height")) - // RS_FromNetCDF for O3 returns 4 bands (time=2 * z=2), sedonainfo returns 2 (O3, NO2) - // These represent different things: sedonainfo counts data variables, not flattened bands - assertEquals(2, metaRow.getAs[Int]("numBands")) - assertEquals(4, rasterRow.getAs[Int]("numBands")) - } - } } From 2e39b87ca669d56cd6f5f369518eb70a0c2a36d3 Mon Sep 17 00:00:00 2001 From: Jia Yu Date: Sat, 4 Apr 2026 00:23:57 -0700 Subject: [PATCH 10/10] [GH-2824] Add NetCDF metadata extraction support to sedonainfo - Add NetCdfMetadataExtractor implementing RasterFileMetadataExtractor - Opens NetCDF files via UCAR cdm-core, extracts metadata without reading data arrays (only lat/lon coordinate arrays for spatial info) - Maps data variables to bands (numBands = number of record variables) - Reports dimensions and variables in metadata map - Supports .nc/.nc4/.netcdf extensions - Update glob patterns in SedonaInfoDataSource to include NetCDF files - Add 7 exact-match tests using test.nc (O3/NO2 variables, 80x48 grid) --- .../sedonainfo/NetCdfMetadataExtractor.scala | 236 ++++++++++++++++++ .../io/sedonainfo/SedonaInfoDataSource.scala | 4 +- .../SedonaInfoPartitionReader.scala | 3 +- .../sedona/sql/geotiffMetadataTest.scala | 137 +++++++++- 4 files changed, 375 insertions(+), 5 deletions(-) create mode 100644 spark/common/src/main/scala/org/apache/spark/sql/sedona_sql/io/sedonainfo/NetCdfMetadataExtractor.scala diff --git a/spark/common/src/main/scala/org/apache/spark/sql/sedona_sql/io/sedonainfo/NetCdfMetadataExtractor.scala b/spark/common/src/main/scala/org/apache/spark/sql/sedona_sql/io/sedonainfo/NetCdfMetadataExtractor.scala new file mode 100644 index 00000000000..e27618b07b6 --- /dev/null +++ b/spark/common/src/main/scala/org/apache/spark/sql/sedona_sql/io/sedonainfo/NetCdfMetadataExtractor.scala @@ -0,0 +1,236 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor 
license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, + * software distributed under the License is distributed on an + * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY + * KIND, either express or implied. See the License for the + * specific language governing permissions and limitations + * under the License. + */ +package org.apache.spark.sql.sedona_sql.io.sedonainfo + +import org.apache.hadoop.conf.Configuration +import org.apache.hadoop.fs.Path +import ucar.nc2.NetcdfFiles + +import scala.collection.JavaConverters._ +import scala.util.Try + +/** + * Extracts metadata from NetCDF files without reading data arrays. Only coordinate variable + * arrays (lat/lon) are read to compute spatial extent. 
/**
 * Extracts metadata from NetCDF files without reading data arrays. Only the lat/lon
 * coordinate variable arrays are read, to compute the spatial extent.
 *
 * Band mapping: every variable with at least two dimensions (a "record variable")
 * becomes one band, so `numBands` counts data variables, not flattened 2-D slices.
 */
object NetCdfMetadataExtractor extends RasterFileMetadataExtractor {

  override def driver: String = "NetCDF"

  /** Accepts files by extension: .nc, .nc4, .netcdf (case-insensitive). */
  override def canHandle(path: Path): Boolean = {
    val name = path.getName.toLowerCase
    name.endsWith(".nc") || name.endsWith(".nc4") || name.endsWith(".netcdf")
  }

  /**
   * Reads the whole file through the Hadoop FS into memory and opens it with UCAR
   * cdm-core. In-memory opening is used because NetCDF needs random access, which
   * not every Hadoop input stream provides.
   *
   * @param path           file to inspect
   * @param fileSize       reported file size, copied verbatim into the result
   * @param configuration  Hadoop configuration used to resolve the file system
   * @param requiredFields unused here; NetCDF metadata is cheap to extract fully
   */
  override def extract(
      path: Path,
      fileSize: Long,
      configuration: Configuration,
      requiredFields: Set[String] = Set.empty): RasterFileMetadata = {
    val fs = path.getFileSystem(configuration)
    val status = fs.getFileStatus(path)
    // Guard against silent truncation: the in-memory buffer is capped at
    // Int.MaxValue bytes, and a bare status.getLen.toInt would wrap for
    // larger files and hand cdm-core a corrupt buffer. toIntExact fails loudly.
    val len = Math.toIntExact(status.getLen)
    val stream = fs.open(path)
    val bytes =
      try {
        val buf = new Array[Byte](len)
        org.apache.commons.io.IOUtils.readFully(stream, buf)
        buf
      } finally {
        stream.close()
      }

    val ncFile = NetcdfFiles.openInMemory("", bytes)
    try {
      extractFromNetcdf(path.toString, fileSize, ncFile)
    } finally {
      ncFile.close()
    }
  }

  /** Reads a string attribute of a variable, or null when absent/unreadable. */
  private def stringAttr(v: ucar.nc2.Variable, name: String): String =
    Try(Option(v.findAttribute(name)).map(_.getStringValue).orNull).getOrElse(null)

  /** Reads a numeric attribute of a variable as boxed Double, or null. */
  private def numericAttr(v: ucar.nc2.Variable, name: String): java.lang.Double =
    Try[java.lang.Double] {
      Option(v.findAttribute(name))
        .flatMap(a => Option(a.getNumericValue))
        .map(n => java.lang.Double.valueOf(n.doubleValue()))
        .orNull
    }.getOrElse(null)

  private def extractFromNetcdf(
      filePath: String,
      fileSize: Long,
      ncFile: ucar.nc2.NetcdfFile): RasterFileMetadata = {

    // Record variables (>= 2 dimensions) are the data variables; each is one band.
    val allVars = ncFile.getVariables.asScala.toSeq
    val recordVars = allVars.filter(_.getDimensions.size() >= 2)

    // Spatial info comes from the lat/lon coordinate variables of the first
    // record variable; a file with no record variables gets an empty extent.
    val (width, height, geoTransform, cornerCoords) =
      recordVars.headOption
        .map(extractSpatialInfo(ncFile, _))
        .getOrElse(
          (0, 0, GeoTransformMetadata(0, 0, 0, 0, 0, 0), CornerCoordinatesMetadata(0, 0, 0, 0)))

    val bands = recordVars.zipWithIndex.map { case (v, idx) =>
      val dims =
        v.getDimensions.asScala.map(d => s"${d.getShortName}=${d.getLength}").mkString(",")
      BandMetadata(
        band = idx + 1,
        dataType = v.getDataType.toString,
        colorInterpretation = "Undefined", // NetCDF has no color model
        noDataValue = numericAttr(v, "missing_value"),
        blockWidth = width,
        blockHeight = height,
        description = s"${v.getShortName}($dims)",
        unit = stringAttr(v, "units"))
    }

    // Global attributes as a String -> String metadata map; unreadable or
    // null-valued attributes are skipped rather than failing the whole file.
    val globalAttrs = ncFile.getGlobalAttributes.asScala.flatMap { attr =>
      Try {
        val value =
          if (attr.isString) attr.getStringValue
          else if (attr.getNumericValue != null) attr.getNumericValue.toString
          else null
        Option(value).map(attr.getShortName -> _)
      }.getOrElse(None)
    }.toMap

    // Expose dimension sizes and the data-variable list through the generic
    // metadata map so callers can see the file layout without reopening it.
    val dimInfo = ncFile.getDimensions.asScala
      .map(d => s"${d.getShortName}=${d.getLength}")
      .mkString(",")
    val varList = recordVars.map(_.getShortName).mkString(",")
    val metadata = globalAttrs + ("dimensions" -> dimInfo) + ("variables" -> varList)

    // CRS: prefer CF-style crs_wkt, fall back to GDAL-style spatial_ref.
    val crs = Try {
      Seq("crs_wkt", "spatial_ref").iterator
        .map(ncFile.findGlobalAttribute)
        .collectFirst { case a if a != null => a.getStringValue }
        .orNull
    }.getOrElse(null)

    RasterFileMetadata(
      path = filePath,
      driver = driver,
      fileSize = fileSize,
      width = width,
      height = height,
      numBands = recordVars.size,
      srid = 0, // NetCDF files rarely declare an EPSG code; left unresolved
      crs = crs,
      geoTransform = geoTransform,
      cornerCoordinates = cornerCoords,
      bands = bands,
      overviews = Seq.empty, // NetCDF has no overview/pyramid concept
      metadata = metadata,
      isTiled = false,
      compression = null)
  }

  /**
   * Extract spatial extent from coordinate variables of a record variable. Assumes
   * the last 2 dimensions are Y and X (same convention as NetCdfReader).
   */
  private def extractSpatialInfo(ncFile: ucar.nc2.NetcdfFile, recordVar: ucar.nc2.Variable)
      : (Int, Int, GeoTransformMetadata, CornerCoordinatesMetadata) = {
    val noExtent =
      (0, 0, GeoTransformMetadata(0, 0, 0, 0, 0, 0), CornerCoordinatesMetadata(0, 0, 0, 0))

    val dims = recordVar.getDimensions.asScala.toSeq
    val latDimName = dims(dims.size - 2).getShortName
    val lonDimName = dims(dims.size - 1).getShortName

    val lonVar = findVariable(ncFile, lonDimName)
    val latVar = findVariable(ncFile, latDimName)
    if (lonVar == null || latVar == null) return noExtent

    val lonData = lonVar.read()
    val latData = latVar.read()
    val width = lonData.getShape()(0)
    val height = latData.getShape()(0)

    // A single-element axis would make the (n - 1) pixel-size divisors zero and
    // produce NaN/Infinity transforms; degenerate grids get size but no transform.
    if (width < 2 || height < 2) {
      return (
        width,
        height,
        GeoTransformMetadata(0, 0, 0, 0, 0, 0),
        CornerCoordinatesMetadata(0, 0, 0, 0))
    }

    val lonFirst = lonData.getDouble(0)
    val lonLast = lonData.getDouble(width - 1)
    val latFirst = latData.getDouble(0)
    val latLast = latData.getDouble(height - 1)

    // Axes may run in either direction; normalize to min/max.
    val lonIncreasing = lonFirst < lonLast
    val latIncreasing = latFirst < latLast
    val minX = if (lonIncreasing) lonFirst else lonLast
    val maxX = if (lonIncreasing) lonLast else lonFirst
    val minY = if (latIncreasing) latFirst else latLast
    val maxY = if (latIncreasing) latLast else latFirst

    // Pixel size from cell-center spacing; north-up rasters carry negative scaleY.
    val scaleX = Math.abs(lonLast - lonFirst) / (width - 1)
    val scaleY = -(Math.abs(latLast - latFirst) / (height - 1))

    val geoTransform = GeoTransformMetadata(
      upperLeftX = minX,
      upperLeftY = maxY,
      scaleX = scaleX,
      scaleY = scaleY,
      skewX = 0.0,
      skewY = 0.0)

    // Envelope covers the full extent including half-pixel borders, since the
    // coordinate arrays give cell centers rather than cell edges.
    val halfPixelX = scaleX / 2
    val halfPixelY = Math.abs(scaleY) / 2
    val cornerCoords = CornerCoordinatesMetadata(
      minX = minX - halfPixelX,
      minY = minY - halfPixelY,
      maxX = maxX + halfPixelX,
      maxY = maxY + halfPixelY)

    (width, height, geoTransform, cornerCoords)
  }

  /** Finds a variable by short name, searching all groups recursively. */
  private def findVariable(ncFile: ucar.nc2.NetcdfFile, name: String): ucar.nc2.Variable =
    findVariableInGroup(name, ncFile.getRootGroup)

  private def findVariableInGroup(name: String, group: ucar.nc2.Group): ucar.nc2.Variable = {
    val local = group.findVariableLocal(name)
    if (local != null) return local
    // Depth-first search through nested groups; first match wins.
    group.getGroups.asScala.iterator
      .map(findVariableInGroup(name, _))
      .collectFirst { case v if v != null => v }
      .orNull
  }
}
b/spark/common/src/main/scala/org/apache/spark/sql/sedona_sql/io/sedonainfo/SedonaInfoPartitionReader.scala @@ -70,7 +70,8 @@ class SedonaInfoPartitionReader( object SedonaInfoPartitionReader { /** Registered metadata extractors. Add new format extractors here. */ - private val extractors: Seq[RasterFileMetadataExtractor] = Seq(GeoTiffMetadataExtractor) + private val extractors: Seq[RasterFileMetadataExtractor] = + Seq(GeoTiffMetadataExtractor, NetCdfMetadataExtractor) def findExtractor(path: Path): RasterFileMetadataExtractor = { extractors diff --git a/spark/common/src/test/scala/org/apache/sedona/sql/geotiffMetadataTest.scala b/spark/common/src/test/scala/org/apache/sedona/sql/geotiffMetadataTest.scala index cb986d8c95c..e5881b368d1 100644 --- a/spark/common/src/test/scala/org/apache/sedona/sql/geotiffMetadataTest.scala +++ b/spark/common/src/test/scala/org/apache/sedona/sql/geotiffMetadataTest.scala @@ -153,8 +153,8 @@ class geotiffMetadataTest extends TestBaseScala with BeforeAndAfterAll { it("should read files from directory with trailing slash") { val df = sparkSession.read.format("sedonainfo").load(rasterDir) - // Recursive lookup finds all .tif/.tiff files including subdirectories - assertEquals(9L, df.count()) + // Recursive lookup finds all .tif/.tiff/.nc files including subdirectories + assertEquals(10L, df.count()) } it("should support LIMIT pushdown") { @@ -219,4 +219,137 @@ class geotiffMetadataTest extends TestBaseScala with BeforeAndAfterAll { assertEquals(256, bandRow.getAs[Int]("blockHeight")) } } + + describe("SedonaInfo NetCDF support") { + val netcdfFile: String = resourceFolder + "raster/netcdf/test.nc" + + it("should read test.nc with exact metadata values") { + val df = sparkSession.read.format("sedonainfo").load(netcdfFile) + assertEquals(1L, df.count()) + + val row = df.first() + assert(row.getAs[String]("path").endsWith("test.nc")) + assertEquals("NetCDF", row.getAs[String]("driver")) + assertEquals(80, row.getAs[Int]("width")) + 
assertEquals(48, row.getAs[Int]("height")) + // O3 and NO2 are the two data variables + assertEquals(2, row.getAs[Int]("numBands")) + assertEquals(0, row.getAs[Int]("srid")) + assert(row.isNullAt(row.fieldIndex("crs"))) + assertEquals(false, row.getAs[Boolean]("isTiled")) + assert(row.isNullAt(row.fieldIndex("compression"))) + } + + it("should return exact geoTransform for test.nc") { + // Values match RS_Metadata output for RS_FromNetCDF(content, 'O3') + val row = sparkSession.read + .format("sedonainfo") + .load(netcdfFile) + .selectExpr( + "geoTransform.upperLeftX", + "geoTransform.upperLeftY", + "geoTransform.scaleX", + "geoTransform.scaleY", + "geoTransform.skewX", + "geoTransform.skewY") + .first() + // Cell-center coordinates: lon=[5.0..14.875], lat=[50.875..44.9375] (decreasing) + assertEquals(5.0, row.getAs[Double]("upperLeftX"), 1e-6) + assertEquals(50.875, row.getAs[Double]("upperLeftY"), 1e-6) + assertEquals(0.125, row.getAs[Double]("scaleX"), 1e-6) + assertEquals(-0.125, row.getAs[Double]("scaleY"), 1e-6) + assertEquals(0.0, row.getAs[Double]("skewX"), 1e-15) + assertEquals(0.0, row.getAs[Double]("skewY"), 1e-15) + } + + it("should return exact cornerCoordinates for test.nc") { + // Envelope includes half-pixel borders around cell centers + val row = sparkSession.read + .format("sedonainfo") + .load(netcdfFile) + .selectExpr( + "cornerCoordinates.minX", + "cornerCoordinates.minY", + "cornerCoordinates.maxX", + "cornerCoordinates.maxY") + .first() + assertEquals(4.9375, row.getAs[Double]("minX"), 1e-6) // 5.0 - 0.0625 + assertEquals(44.9375, row.getAs[Double]("minY"), 1e-6) // 45.0 - 0.0625 + assertEquals(14.9375, row.getAs[Double]("maxX"), 1e-6) // 14.875 + 0.0625 + assertEquals(50.9375, row.getAs[Double]("maxY"), 1e-6) // 50.875 + 0.0625 + } + + it("should return exact band metadata for test.nc") { + val rows = sparkSession.read + .format("sedonainfo") + .load(netcdfFile) + .selectExpr("explode(bands) as b") + .selectExpr( + "b.band", + "b.dataType", 
+ "b.colorInterpretation", + "b.noDataValue", + "b.blockWidth", + "b.blockHeight", + "b.description", + "b.unit") + .collect() + assertEquals(2, rows.length) + + // Band 1: O3 + assertEquals(1, rows(0).getAs[Int]("band")) + assertEquals("float", rows(0).getAs[String]("dataType")) + assertEquals("Undefined", rows(0).getAs[String]("colorInterpretation")) + assert(rows(0).getAs[String]("description").startsWith("O3(")) + assertEquals(80, rows(0).getAs[Int]("blockWidth")) + assertEquals(48, rows(0).getAs[Int]("blockHeight")) + + // Band 2: NO2 + assertEquals(2, rows(1).getAs[Int]("band")) + assert(rows(1).getAs[String]("description").startsWith("NO2(")) + } + + it("should return empty overviews for test.nc") { + val row = sparkSession.read + .format("sedonainfo") + .load(netcdfFile) + .selectExpr("size(overviews) as overviewCount") + .first() + assertEquals(0, row.getAs[Int]("overviewCount")) + } + + it("should include dimensions and variables in metadata map") { + val row = sparkSession.read.format("sedonainfo").load(netcdfFile).first() + val meta = row.getAs[Map[String, String]]("metadata") + assert(meta != null) + assert(meta.contains("dimensions")) + assert(meta("dimensions").contains("lat=48")) + assert(meta("dimensions").contains("lon=80")) + assert(meta.contains("variables")) + assert(meta("variables").contains("O3")) + assert(meta("variables").contains("NO2")) + } + + it("should cross-validate spatial extent against RS_FromNetCDF") { + // Load via RS_FromNetCDF and extract metadata + val rasterRow = sparkSession.read + .format("binaryFile") + .load(netcdfFile) + .selectExpr("RS_FromNetCDF(content, 'O3') as raster") + .selectExpr( + "RS_Width(raster) as width", + "RS_Height(raster) as height", + "RS_NumBands(raster) as numBands") + .first() + + // Load via sedonainfo + val metaRow = sparkSession.read.format("sedonainfo").load(netcdfFile).first() + assertEquals(metaRow.getAs[Int]("width"), rasterRow.getAs[Int]("width")) + 
assertEquals(metaRow.getAs[Int]("height"), rasterRow.getAs[Int]("height")) + // RS_FromNetCDF for O3 returns 4 bands (time=2 * z=2), sedonainfo returns 2 (O3, NO2) + // These represent different things: sedonainfo counts data variables, not flattened bands + assertEquals(2, metaRow.getAs[Int]("numBands")) + assertEquals(4, rasterRow.getAs[Int]("numBands")) + } + } }