From eaacfa6aa2609495321a90f5e45b5dbd35cd8d89 Mon Sep 17 00:00:00 2001
From: chaitu <chkc127@gmail.com>
Date: Fri, 30 May 2025 13:11:41 -0700
Subject: [PATCH 01/54] code to write daily irs

---
 .../main/scala/ai/chronon/spark/GroupBy.scala | 21 ++++++++++++++-----
 1 file changed, 16 insertions(+), 5 deletions(-)

diff --git a/spark/src/main/scala/ai/chronon/spark/GroupBy.scala b/spark/src/main/scala/ai/chronon/spark/GroupBy.scala
index 79ef1095d3..8fbd990083 100644
--- a/spark/src/main/scala/ai/chronon/spark/GroupBy.scala
+++ b/spark/src/main/scala/ai/chronon/spark/GroupBy.scala
@@ -546,12 +546,23 @@ object GroupBy {
 
       df
     }
+    //make it parameterized
+    val incrementalAgg = true
+    if (incrementalAgg) {
+      new GroupBy(Option(groupByConf.getAggregations).map(_.toScala).orNull,
+        keyColumns,
+        nullFiltered,
+        mutationDfFn,
+        finalize = false)
+    } else {
+      new GroupBy(Option(groupByConf.getAggregations).map(_.toScala).orNull,
+        keyColumns,
+        nullFiltered,
+        mutationDfFn,
+        finalize = finalize)
+
+    }
 
-    new GroupBy(Option(groupByConf.getAggregations).map(_.toScala).orNull,
-                keyColumns,
-                nullFiltered,
-                mutationDfFn,
-                finalize = finalize)
   }
 
   def getIntersectedRange(source: api.Source,

From 40b6cb2645bbe18d2d7b6b19289b83aea1d262c8 Mon Sep 17 00:00:00 2001
From: chaitu <chkc127@gmail.com>
Date: Tue, 3 Jun 2025 10:47:48 -0700
Subject: [PATCH 02/54] store incremental agg and compute final IRs

---
 .../scala/ai/chronon/api/Extensions.scala     |  1 +
 .../main/scala/ai/chronon/spark/GroupBy.scala | 83 ++++++++++++++-----
 2 files changed, 61 insertions(+), 23 deletions(-)

diff --git a/api/src/main/scala/ai/chronon/api/Extensions.scala b/api/src/main/scala/ai/chronon/api/Extensions.scala
index 0ce907145b..d813b945a6 100644
--- a/api/src/main/scala/ai/chronon/api/Extensions.scala
+++ b/api/src/main/scala/ai/chronon/api/Extensions.scala
@@ -98,6 +98,7 @@ object Extensions {
     def cleanName: String = metaData.name.sanitize
 
     def outputTable = s"${metaData.outputNamespace}.${metaData.cleanName}"
+    def incOutputTable = s"${metaData.outputNamespace}.${metaData.cleanName}_inc"
     def outputLabelTable = s"${metaData.outputNamespace}.${metaData.cleanName}_labels"
     def outputFinalView = s"${metaData.outputNamespace}.${metaData.cleanName}_labeled"
     def outputLatestLabelView = s"${metaData.outputNamespace}.${metaData.cleanName}_labeled_latest"
diff --git a/spark/src/main/scala/ai/chronon/spark/GroupBy.scala b/spark/src/main/scala/ai/chronon/spark/GroupBy.scala
index 8fbd990083..b7dd1f4f67 100644
--- a/spark/src/main/scala/ai/chronon/spark/GroupBy.scala
+++ b/spark/src/main/scala/ai/chronon/spark/GroupBy.scala
@@ -461,18 +461,19 @@ object GroupBy {
            bloomMapOpt: Option[util.Map[String, BloomFilter]] = None,
            skewFilter: Option[String] = None,
            finalize: Boolean = true,
-           showDf: Boolean = false): GroupBy = {
+           showDf: Boolean = false,
+           incrementalAgg: Boolean = false): GroupBy = {
     logger.info(s"\n----[Processing GroupBy: ${groupByConfOld.metaData.name}]----")
     val groupByConf = replaceJoinSource(groupByConfOld, queryRange, tableUtils, computeDependency, showDf)
     val inputDf = groupByConf.sources.toScala
       .map { source =>
         renderDataSourceQuery(groupByConf,
-                              source,
-                              groupByConf.getKeyColumns.toScala,
-                              queryRange,
-                              tableUtils,
-                              groupByConf.maxWindow,
-                              groupByConf.inferredAccuracy)
+          source,
+          groupByConf.getKeyColumns.toScala,
+          queryRange,
+          tableUtils,
+          groupByConf.maxWindow,
+          groupByConf.inferredAccuracy)
 
       }
       .map {
@@ -543,26 +544,18 @@ object GroupBy {
         logger.info(s"printing mutation data for groupBy: ${groupByConf.metaData.name}")
         df.prettyPrint()
       }
-
       df
     }
-    //make it parameterized
-    val incrementalAgg = true
-    if (incrementalAgg) {
-      new GroupBy(Option(groupByConf.getAggregations).map(_.toScala).orNull,
-        keyColumns,
-        nullFiltered,
-        mutationDfFn,
-        finalize = false)
+    val finalizeValue = if (incrementalAgg) {
+      !incrementalAgg
     } else {
-      new GroupBy(Option(groupByConf.getAggregations).map(_.toScala).orNull,
+      finalize
+    }
+    new GroupBy(Option(groupByConf.getAggregations).map(_.toScala).orNull,
         keyColumns,
         nullFiltered,
         mutationDfFn,
-        finalize = finalize)
-
-    }
-
+        finalize = finalizeValue)
   }
 
   def getIntersectedRange(source: api.Source,
@@ -681,12 +674,51 @@ object GroupBy {
     query
   }
 
+  def saveAndGetIncDf(
+                       groupByConf: api.GroupBy,
+                       range: PartitionRange,
+                       tableUtils: TableUtils,
+                     ): GroupBy = {
+    val incOutputTable = groupByConf.metaData.incOutputTable
+    val tableProps = Option(groupByConf.metaData.tableProperties)
+      .map(_.toScala)
+      .orNull
+    //range should be modified to incremental range
+    val incGroupByBackfill = from(groupByConf, range, tableUtils, computeDependency = true, incrementalAgg = true)
+    val incOutputDf = incGroupByBackfill.snapshotEvents(range)
+    incOutputDf.save(incOutputTable, tableProps)
+
+    val maxWindow = groupByConf.maxWindow.get
+    val sourceQueryableRange = PartitionRange(
+      range.start,
+      tableUtils.partitionSpec.minus(range.end, maxWindow)
+    )(tableUtils)
+
+    val incTableFirstPartition: Option[String] = tableUtils.firstAvailablePartition(incOutputTable)
+    val incTableLastPartition: Option[String] = tableUtils.lastAvailablePartition(incOutputTable)
+
+    val incTableRange = PartitionRange(
+      incTableFirstPartition.get,
+      incTableLastPartition.get
+    )(tableUtils)
+
+    val incDfQuery = incTableRange.intersect(sourceQueryableRange).genScanQuery(null, incOutputTable)
+    val incDf: DataFrame = tableUtils.sql(incDfQuery)
+
+    new GroupBy(
+      incGroupByBackfill.aggregations,
+      incGroupByBackfill.keyColumns,
+      incDf
+    )
+  }
+
   def computeBackfill(groupByConf: api.GroupBy,
                       endPartition: String,
                       tableUtils: TableUtils,
                       stepDays: Option[Int] = None,
                       overrideStartPartition: Option[String] = None,
-                      skipFirstHole: Boolean = true): Unit = {
+                      skipFirstHole: Boolean = true,
+                      incrementalAgg: Boolean = true): Unit = {
     assert(
       groupByConf.backfillStartDate != null,
       s"GroupBy:${groupByConf.metaData.name} has null backfillStartDate. This needs to be set for offline backfilling.")
@@ -725,7 +757,12 @@ object GroupBy {
           stepRanges.zipWithIndex.foreach {
             case (range, index) =>
               logger.info(s"Computing group by for range: $range [${index + 1}/${stepRanges.size}]")
-              val groupByBackfill = from(groupByConf, range, tableUtils, computeDependency = true)
+              val groupByBackfill = if (incrementalAgg) {
+                saveAndGetIncDf(groupByConf, range, tableUtils)
+                //from(groupByConf, range, tableUtils, computeDependency = true)
+              } else {
+                from(groupByConf, range, tableUtils, computeDependency = true)
+              }
               val outputDf = groupByConf.dataModel match {
                 // group by backfills have to be snapshot only
                 case Entities => groupByBackfill.snapshotEntities

From a014b6ef4706ce7200af1bb8a0681d04fc25208d Mon Sep 17 00:00:00 2001
From: chaitu <chkc127@gmail.com>
Date: Fri, 6 Jun 2025 19:04:46 -0700
Subject: [PATCH 03/54] Store hops to inc tables

---
 .../aggregator/row/RowAggregator.scala        |  5 ++
 .../scala/ai/chronon/api/Extensions.scala     |  5 ++
 .../main/scala/ai/chronon/spark/GroupBy.scala | 68 +++++++++++++++----
 3 files changed, 66 insertions(+), 12 deletions(-)

diff --git a/aggregator/src/main/scala/ai/chronon/aggregator/row/RowAggregator.scala b/aggregator/src/main/scala/ai/chronon/aggregator/row/RowAggregator.scala
index c8bc1da08c..6bda47bf19 100644
--- a/aggregator/src/main/scala/ai/chronon/aggregator/row/RowAggregator.scala
+++ b/aggregator/src/main/scala/ai/chronon/aggregator/row/RowAggregator.scala
@@ -70,6 +70,11 @@ class RowAggregator(val inputSchema: Seq[(String, DataType)], val aggregationPar
     .toArray
     .zip(columnAggregators.map(_.irType))
 
+  val incSchema = aggregationParts
+    .map(_.incOutputColumnName)
+    .toArray
+    .zip(columnAggregators.map(_.irType))
+
   val outputSchema: Array[(String, DataType)] = aggregationParts
     .map(_.outputColumnName)
     .toArray
diff --git a/api/src/main/scala/ai/chronon/api/Extensions.scala b/api/src/main/scala/ai/chronon/api/Extensions.scala
index d813b945a6..c6c8074757 100644
--- a/api/src/main/scala/ai/chronon/api/Extensions.scala
+++ b/api/src/main/scala/ai/chronon/api/Extensions.scala
@@ -177,8 +177,13 @@ object Extensions {
 
     def outputColumnName =
       s"${aggregationPart.inputColumn}_$opSuffix${aggregationPart.window.suffix}${bucketSuffix}"
+
+    def incOutputColumnName =
+      s"${aggregationPart.inputColumn}_$opSuffix${bucketSuffix}"
+
   }
 
+
   implicit class AggregationOps(aggregation: Aggregation) {
 
     // one agg part per bucket per window
diff --git a/spark/src/main/scala/ai/chronon/spark/GroupBy.scala b/spark/src/main/scala/ai/chronon/spark/GroupBy.scala
index b7dd1f4f67..a3f76f1483 100644
--- a/spark/src/main/scala/ai/chronon/spark/GroupBy.scala
+++ b/spark/src/main/scala/ai/chronon/spark/GroupBy.scala
@@ -18,6 +18,7 @@ package ai.chronon.spark
 
 import ai.chronon.aggregator.base.TimeTuple
 import ai.chronon.aggregator.row.RowAggregator
+import ai.chronon.aggregator.windowing.HopsAggregator.HopIr
 import ai.chronon.aggregator.windowing._
 import ai.chronon.api
 import ai.chronon.api.DataModel.{Entities, Events}
@@ -41,7 +42,9 @@ class GroupBy(val aggregations: Seq[api.Aggregation],
               val inputDf: DataFrame,
               val mutationDfFn: () => DataFrame = null,
               skewFilter: Option[String] = None,
-              finalize: Boolean = true)
+              finalize: Boolean = true,
+              incAgg: Boolean = false
+             )
     extends Serializable {
   @transient lazy val logger = LoggerFactory.getLogger(getClass)
 
@@ -88,7 +91,11 @@ class GroupBy(val aggregations: Seq[api.Aggregation],
   lazy val aggPartWithSchema = aggregationParts.zip(columnAggregators.map(_.outputType))
 
   lazy val postAggSchema: StructType = {
-    val valueChrononSchema = if (finalize) windowAggregator.outputSchema else windowAggregator.irSchema
+    val valueChrononSchema = if (finalize) {
+      windowAggregator.outputSchema
+    } else {
+      windowAggregator.irSchema
+    }
     SparkConversions.fromChrononSchema(valueChrononSchema)
   }
 
@@ -141,12 +148,13 @@ class GroupBy(val aggregations: Seq[api.Aggregation],
     }
 
   def snapshotEventsBase(partitionRange: PartitionRange,
-                         resolution: Resolution = DailyResolution): RDD[(Array[Any], Array[Any])] = {
+                         resolution: Resolution = DailyResolution,
+                         incAgg: Boolean = true): RDD[(Array[Any], Array[Any])] = {
     val endTimes: Array[Long] = partitionRange.toTimePoints
     // add 1 day to the end times to include data [ds 00:00:00.000, ds + 1 00:00:00.000)
     val shiftedEndTimes = endTimes.map(_ + tableUtils.partitionSpec.spanMillis)
     val sawtoothAggregator = new SawtoothAggregator(aggregations, selectedSchema, resolution)
-    val hops = hopsAggregate(endTimes.min, resolution)
+    val hops = hopsAggregate(endTimes.min, resolution, incAgg)
 
     hops
       .flatMap {
@@ -356,12 +364,43 @@ class GroupBy(val aggregations: Seq[api.Aggregation],
     toDf(outputRdd, Seq(Constants.TimeColumn -> LongType, tableUtils.partitionColumn -> StringType))
   }
 
+  //def dfToOutputArrayType(df: DataFrame): RDD[(KeyWithHash, HopsAggregator.OutputArrayType)] = {
+  //  val keyBuilder: Row => KeyWithHash =
+  //    FastHashing.generateKeyBuilder(keyColumns.toArray, df.schema)
+
+  //  df.rdd
+  //    .keyBy(keyBuilder)
+  //    .mapValues(SparkConversions.toChrononRow(_, tsIndex))
+  //    .mapValues(windowAggregator.toTimeSortedArray)
+  //}
+
+  def flattenOutputArrayType(hopsArrays: RDD[(KeyWithHash, HopsAggregator.OutputArrayType)]): RDD[(Array[Any], Array[Any])] = {
+    hopsArrays.flatMap { case (keyWithHash: KeyWithHash, hopsArray: HopsAggregator.OutputArrayType) =>
+      val hopsArrayHead: Array[HopIr] = hopsArray.headOption.get
+      hopsArrayHead.map { array: HopIr =>
+        // the last element is a timestamp, we need to drop it
+        // and add it to the key
+        val timestamp = array.last.asInstanceOf[Long]
+        val withoutTimestamp = array.dropRight(1)
+        ((keyWithHash.data :+ tableUtils.partitionSpec.at(timestamp)), withoutTimestamp)
+      }
+    }
+  }
+
+  def convertHopsToDf(range: PartitionRange,
+                             schema: Array[(String, ai.chronon.api.DataType)]
+                            ): DataFrame = {
+    val hops = hopsAggregate(range.toTimePoints.min, DailyResolution)
+    val hopsDf = flattenOutputArrayType(hops)
+    toDf(hopsDf, Seq((tableUtils.partitionColumn, StringType)), Some(SparkConversions.fromChrononSchema(schema)))
+  }
+
   // convert raw data into IRs, collected by hopSizes
   // TODO cache this into a table: interface below
   // Class HopsCacher(keySchema, irSchema, resolution) extends RddCacher[(KeyWithHash, HopsOutput)]
   //  buildTableRow((keyWithHash, hopsOutput)) -> GenericRowWithSchema
   //  buildRddRow(GenericRowWithSchema) -> (keyWithHash, hopsOutput)
-  def hopsAggregate(minQueryTs: Long, resolution: Resolution): RDD[(KeyWithHash, HopsAggregator.OutputArrayType)] = {
+  def hopsAggregate(minQueryTs: Long, resolution: Resolution, incAgg: Boolean = false): RDD[(KeyWithHash, HopsAggregator.OutputArrayType)] = {
     val hopsAggregator =
       new HopsAggregator(minQueryTs, aggregations, selectedSchema, resolution)
     val keyBuilder: Row => KeyWithHash =
@@ -378,9 +417,9 @@ class GroupBy(val aggregations: Seq[api.Aggregation],
   }
 
   protected[spark] def toDf(aggregateRdd: RDD[(Array[Any], Array[Any])],
-                            additionalFields: Seq[(String, DataType)]): DataFrame = {
+                            additionalFields: Seq[(String, DataType)], schema: Option[StructType] = None): DataFrame = {
     val finalKeySchema = StructType(keySchema ++ additionalFields.map { case (name, typ) => StructField(name, typ) })
-    KvRdd(aggregateRdd, finalKeySchema, postAggSchema).toFlatDf
+    KvRdd(aggregateRdd, finalKeySchema, schema.getOrElse(postAggSchema)).toFlatDf
   }
 
   private def normalizeOrFinalize(ir: Array[Any]): Array[Any] =
@@ -555,7 +594,9 @@ object GroupBy {
         keyColumns,
         nullFiltered,
         mutationDfFn,
-        finalize = finalizeValue)
+        finalize = finalizeValue,
+      incAgg = incrementalAgg,
+    )
   }
 
   def getIntersectedRange(source: api.Source,
@@ -685,13 +726,16 @@ object GroupBy {
       .orNull
     //range should be modified to incremental range
     val incGroupByBackfill = from(groupByConf, range, tableUtils, computeDependency = true, incrementalAgg = true)
-    val incOutputDf = incGroupByBackfill.snapshotEvents(range)
-    incOutputDf.save(incOutputTable, tableProps)
+    val selectedSchema = incGroupByBackfill.selectedSchema
+    //TODO is there any other way to get incSchema?
+    val incSchema = new RowAggregator(selectedSchema, incGroupByBackfill.aggregations.flatMap(_.unWindowed)).incSchema
+    val hopsDf = incGroupByBackfill.convertHopsToDf(range, incSchema)
+    hopsDf.save(incOutputTable, tableProps)
 
     val maxWindow = groupByConf.maxWindow.get
     val sourceQueryableRange = PartitionRange(
-      range.start,
-      tableUtils.partitionSpec.minus(range.end, maxWindow)
+      tableUtils.partitionSpec.minus(range.start, maxWindow),
+      range.end
     )(tableUtils)
 
     val incTableFirstPartition: Option[String] = tableUtils.firstAvailablePartition(incOutputTable)

From 32d559eac7683dfc36a7704d27071f6578bf0baf Mon Sep 17 00:00:00 2001
From: chaitu <chkc127@gmail.com>
Date: Fri, 13 Jun 2025 18:10:03 -0700
Subject: [PATCH 04/54] add code changes to generate final output from IR for
 AVG

---
 .../aggregator/base/SimpleAggregators.scala   | 51 ++++++++++++++++
 .../aggregator/row/ColumnAggregator.scala     |  8 +++
 .../main/scala/ai/chronon/spark/GroupBy.scala | 59 +++++++++++++------
 3 files changed, 100 insertions(+), 18 deletions(-)

diff --git a/aggregator/src/main/scala/ai/chronon/aggregator/base/SimpleAggregators.scala b/aggregator/src/main/scala/ai/chronon/aggregator/base/SimpleAggregators.scala
index b120d29e7f..31bf93cc6b 100644
--- a/aggregator/src/main/scala/ai/chronon/aggregator/base/SimpleAggregators.scala
+++ b/aggregator/src/main/scala/ai/chronon/aggregator/base/SimpleAggregators.scala
@@ -116,6 +116,57 @@ class UniqueCount[T](inputType: DataType) extends SimpleAggregator[T, util.HashS
   }
 }
 
+class AverageIR extends SimpleAggregator[Array[Any], Array[Any], Double] {
+  override def outputType: DataType = DoubleType
+
+  override def irType: DataType =
+    StructType(
+      "AvgIr",
+      Array(StructField("sum", DoubleType), StructField("count", IntType))
+    )
+
+  override def prepare(input: Array[Any]): Array[Any] = {
+    Array(input(0).asInstanceOf[Double], input(1).asInstanceOf[Int])
+  }
+
+  // mutating
+  override def update(ir: Array[Any], input: Array[Any]): Array[Any] = {
+    val inputSum = input(0).asInstanceOf[Double]
+    val inputCount = input(1).asInstanceOf[Int]
+    ir.update(0, ir(0).asInstanceOf[Double] + inputSum)
+    ir.update(1, ir(1).asInstanceOf[Int] + inputCount)
+    ir
+  }
+
+  // mutating
+  override def merge(ir1: Array[Any], ir2: Array[Any]): Array[Any] = {
+    ir1.update(0, ir1(0).asInstanceOf[Double] + ir2(0).asInstanceOf[Double])
+    ir1.update(1, ir1(1).asInstanceOf[Int] + ir2(1).asInstanceOf[Int])
+    ir1
+  }
+
+  override def finalize(ir: Array[Any]): Double =
+    ir(0).asInstanceOf[Double] / ir(1).asInstanceOf[Int].toDouble
+
+  override def delete(ir: Array[Any], input: Array[Any]): Array[Any] = {
+    val inputSum = input(0).asInstanceOf[Double]
+    val inputCount = input(1).asInstanceOf[Int]
+    ir.update(0, ir(0).asInstanceOf[Double] - inputSum)
+    ir.update(1, ir(1).asInstanceOf[Int] - inputCount)
+    ir
+  }
+
+  override def clone(ir: Array[Any]): Array[Any] = {
+    val arr = new Array[Any](ir.length)
+    ir.copyToArray(arr)
+    arr
+  }
+
+  override def isDeletable: Boolean = true
+}
+
+
+
 class Average extends SimpleAggregator[Double, Array[Any], Double] {
   override def outputType: DataType = DoubleType
 
diff --git a/aggregator/src/main/scala/ai/chronon/aggregator/row/ColumnAggregator.scala b/aggregator/src/main/scala/ai/chronon/aggregator/row/ColumnAggregator.scala
index d5f21b3072..5c8a9bcf56 100644
--- a/aggregator/src/main/scala/ai/chronon/aggregator/row/ColumnAggregator.scala
+++ b/aggregator/src/main/scala/ai/chronon/aggregator/row/ColumnAggregator.scala
@@ -217,6 +217,13 @@ object ColumnAggregator {
   private def toJavaDouble[A: Numeric](inp: Any) =
     implicitly[Numeric[A]].toDouble(inp.asInstanceOf[A]).asInstanceOf[java.lang.Double]
 
+
+  private def toStructArray(inp: Any): Array[Any] = inp match {
+    case r: org.apache.spark.sql.Row => r.toSeq.toArray
+    case null => null
+    case other => throw new IllegalArgumentException(s"Expected Row, got: $other")
+  }
+
   def construct(baseInputType: DataType,
                 aggregationPart: AggregationPart,
                 columnIndices: ColumnIndices,
@@ -330,6 +337,7 @@ object ColumnAggregator {
           case ShortType  => simple(new Average, toDouble[Short])
           case DoubleType => simple(new Average)
           case FloatType  => simple(new Average, toDouble[Float])
+          case StructType(name, fields) => simple(new AverageIR, toStructArray)
           case _          => mismatchException
         }
 
diff --git a/spark/src/main/scala/ai/chronon/spark/GroupBy.scala b/spark/src/main/scala/ai/chronon/spark/GroupBy.scala
index a3f76f1483..2765e7b99f 100644
--- a/spark/src/main/scala/ai/chronon/spark/GroupBy.scala
+++ b/spark/src/main/scala/ai/chronon/spark/GroupBy.scala
@@ -147,15 +147,16 @@ class GroupBy(val aggregations: Seq[api.Aggregation],
       toDf(snapshotEntitiesBase, Seq(tableUtils.partitionColumn -> StringType))
     }
 
-  def snapshotEventsBase(partitionRange: PartitionRange,
-                         resolution: Resolution = DailyResolution,
-                         incAgg: Boolean = true): RDD[(Array[Any], Array[Any])] = {
-    val endTimes: Array[Long] = partitionRange.toTimePoints
+  def computeHopsAggregate(endTimes: Array[Long], resolution: Resolution): RDD[(KeyWithHash, HopsAggregator.OutputArrayType)] = {
+    hopsAggregate(endTimes.min, resolution)
+  }
+
+  def computeSawtoothAggregate(hops: RDD[(KeyWithHash, HopsAggregator.OutputArrayType)],
+                               endTimes: Array[Long],
+                               resolution: Resolution): RDD[(Array[Any], Array[Any])] = {
     // add 1 day to the end times to include data [ds 00:00:00.000, ds + 1 00:00:00.000)
     val shiftedEndTimes = endTimes.map(_ + tableUtils.partitionSpec.spanMillis)
     val sawtoothAggregator = new SawtoothAggregator(aggregations, selectedSchema, resolution)
-    val hops = hopsAggregate(endTimes.min, resolution, incAgg)
-
     hops
       .flatMap {
         case (keys, hopsArrays) =>
@@ -169,6 +170,15 @@ class GroupBy(val aggregations: Seq[api.Aggregation],
       }
   }
 
+  def snapshotEventsBase(partitionRange: PartitionRange,
+                         resolution: Resolution = DailyResolution,
+                         incAgg: Boolean = true): RDD[(Array[Any], Array[Any])] = {
+    val endTimes: Array[Long] = partitionRange.toTimePoints
+
+    val hops = computeHopsAggregate(endTimes, resolution)
+    computeSawtoothAggregate(hops, endTimes, resolution)
+  }
+
   // Calculate snapshot accurate windows for ALL keys at pre-defined "endTimes"
   // At this time, we hardcode the resolution to Daily, but it is straight forward to support
   // hourly resolution.
@@ -364,14 +374,13 @@ class GroupBy(val aggregations: Seq[api.Aggregation],
     toDf(outputRdd, Seq(Constants.TimeColumn -> LongType, tableUtils.partitionColumn -> StringType))
   }
 
-  //def dfToOutputArrayType(df: DataFrame): RDD[(KeyWithHash, HopsAggregator.OutputArrayType)] = {
+  //def convertDfToOutputArrayType(df: DataFrame): RDD[(KeyWithHash, HopsAggregator.OutputArrayType)] = {
   //  val keyBuilder: Row => KeyWithHash =
   //    FastHashing.generateKeyBuilder(keyColumns.toArray, df.schema)
 
   //  df.rdd
   //    .keyBy(keyBuilder)
   //    .mapValues(SparkConversions.toChrononRow(_, tsIndex))
-  //    .mapValues(windowAggregator.toTimeSortedArray)
   //}
 
   def flattenOutputArrayType(hopsArrays: RDD[(KeyWithHash, HopsAggregator.OutputArrayType)]): RDD[(Array[Any], Array[Any])] = {
@@ -382,17 +391,16 @@ class GroupBy(val aggregations: Seq[api.Aggregation],
         // and add it to the key
         val timestamp = array.last.asInstanceOf[Long]
         val withoutTimestamp = array.dropRight(1)
-        ((keyWithHash.data :+ tableUtils.partitionSpec.at(timestamp)), withoutTimestamp)
+        ((keyWithHash.data :+ tableUtils.partitionSpec.at(timestamp) :+ timestamp), withoutTimestamp)
       }
     }
   }
 
-  def convertHopsToDf(range: PartitionRange,
+  def convertHopsToDf(hops: RDD[(KeyWithHash, HopsAggregator.OutputArrayType)],
                              schema: Array[(String, ai.chronon.api.DataType)]
                             ): DataFrame = {
-    val hops = hopsAggregate(range.toTimePoints.min, DailyResolution)
     val hopsDf = flattenOutputArrayType(hops)
-    toDf(hopsDf, Seq((tableUtils.partitionColumn, StringType)), Some(SparkConversions.fromChrononSchema(schema)))
+    toDf(hopsDf, Seq(tableUtils.partitionColumn -> StringType, Constants.TimeColumn -> LongType), Some(SparkConversions.fromChrononSchema(schema)))
   }
 
   // convert raw data into IRs, collected by hopSizes
@@ -728,8 +736,10 @@ object GroupBy {
     val incGroupByBackfill = from(groupByConf, range, tableUtils, computeDependency = true, incrementalAgg = true)
     val selectedSchema = incGroupByBackfill.selectedSchema
     //TODO is there any other way to get incSchema?
-    val incSchema = new RowAggregator(selectedSchema, incGroupByBackfill.aggregations.flatMap(_.unWindowed)).incSchema
-    val hopsDf = incGroupByBackfill.convertHopsToDf(range, incSchema)
+    val incFlattendAgg = new RowAggregator(selectedSchema, incGroupByBackfill.aggregations.flatMap(_.unWindowed))
+      val incSchema = incFlattendAgg.incSchema
+    val hops = incGroupByBackfill.computeHopsAggregate(range.toTimePoints, DailyResolution)
+    val hopsDf = incGroupByBackfill.convertHopsToDf(hops, incSchema)
     hopsDf.save(incOutputTable, tableProps)
 
     val maxWindow = groupByConf.maxWindow.get
@@ -746,14 +756,27 @@ object GroupBy {
       incTableLastPartition.get
     )(tableUtils)
 
+    //val dfQuery = groupByConf.
     val incDfQuery = incTableRange.intersect(sourceQueryableRange).genScanQuery(null, incOutputTable)
     val incDf: DataFrame = tableUtils.sql(incDfQuery)
+    //incGroupByBackfill.computeSawtoothAggregate(incDf, range.toTimePoints, DailyResolution)
+
+    val a = incFlattendAgg.aggregationParts.zip(groupByConf.getAggregations.toScala).map{ case (part, agg) =>
+      val newAgg = agg.deepCopy()
+      newAgg.setInputColumn(part.incOutputColumnName)
+      newAgg
+    }
+
 
     new GroupBy(
-      incGroupByBackfill.aggregations,
-      incGroupByBackfill.keyColumns,
-      incDf
+      a,
+      groupByConf.getKeyColumns.toScala,
+      incDf,
+      () => null,
+      finalize = true,
+      incAgg = false,
     )
+
   }
 
   def computeBackfill(groupByConf: api.GroupBy,
@@ -801,7 +824,7 @@ object GroupBy {
           stepRanges.zipWithIndex.foreach {
             case (range, index) =>
               logger.info(s"Computing group by for range: $range [${index + 1}/${stepRanges.size}]")
-              val groupByBackfill = if (incrementalAgg) {
+              val groupByBackfill: GroupBy = if (incrementalAgg) {
                 saveAndGetIncDf(groupByConf, range, tableUtils)
                 //from(groupByConf, range, tableUtils, computeDependency = true)
               } else {

From 37293df2c7c7b491993016d96145bc5f7463301e Mon Sep 17 00:00:00 2001
From: chaitu <chkc127@gmail.com>
Date: Thu, 19 Jun 2025 13:39:06 -0700
Subject: [PATCH 05/54] change function structure and variable names

---
 .../aggregator/row/RowAggregator.scala        |   4 +-
 .../scala/ai/chronon/api/Extensions.scala     |   4 +-
 .../main/scala/ai/chronon/spark/GroupBy.scala | 104 +++++++++++-------
 3 files changed, 69 insertions(+), 43 deletions(-)

diff --git a/aggregator/src/main/scala/ai/chronon/aggregator/row/RowAggregator.scala b/aggregator/src/main/scala/ai/chronon/aggregator/row/RowAggregator.scala
index 6bda47bf19..e9d0608d25 100644
--- a/aggregator/src/main/scala/ai/chronon/aggregator/row/RowAggregator.scala
+++ b/aggregator/src/main/scala/ai/chronon/aggregator/row/RowAggregator.scala
@@ -70,8 +70,8 @@ class RowAggregator(val inputSchema: Seq[(String, DataType)], val aggregationPar
     .toArray
     .zip(columnAggregators.map(_.irType))
 
-  val incSchema = aggregationParts
-    .map(_.incOutputColumnName)
+  val incrementalOutputSchema = aggregationParts
+    .map(_.incrementalOutputColumnName)
     .toArray
     .zip(columnAggregators.map(_.irType))
 
diff --git a/api/src/main/scala/ai/chronon/api/Extensions.scala b/api/src/main/scala/ai/chronon/api/Extensions.scala
index c6c8074757..b39bb2f016 100644
--- a/api/src/main/scala/ai/chronon/api/Extensions.scala
+++ b/api/src/main/scala/ai/chronon/api/Extensions.scala
@@ -98,7 +98,7 @@ object Extensions {
     def cleanName: String = metaData.name.sanitize
 
     def outputTable = s"${metaData.outputNamespace}.${metaData.cleanName}"
-    def incOutputTable = s"${metaData.outputNamespace}.${metaData.cleanName}_inc"
+    def incrementalOutputTable = s"${metaData.outputNamespace}.${metaData.cleanName}_inc"
     def outputLabelTable = s"${metaData.outputNamespace}.${metaData.cleanName}_labels"
     def outputFinalView = s"${metaData.outputNamespace}.${metaData.cleanName}_labeled"
     def outputLatestLabelView = s"${metaData.outputNamespace}.${metaData.cleanName}_labeled_latest"
@@ -178,7 +178,7 @@ object Extensions {
     def outputColumnName =
       s"${aggregationPart.inputColumn}_$opSuffix${aggregationPart.window.suffix}${bucketSuffix}"
 
-    def incOutputColumnName =
+    def incrementalOutputColumnName =
       s"${aggregationPart.inputColumn}_$opSuffix${bucketSuffix}"
 
   }
diff --git a/spark/src/main/scala/ai/chronon/spark/GroupBy.scala b/spark/src/main/scala/ai/chronon/spark/GroupBy.scala
index 2765e7b99f..36c8362618 100644
--- a/spark/src/main/scala/ai/chronon/spark/GroupBy.scala
+++ b/spark/src/main/scala/ai/chronon/spark/GroupBy.scala
@@ -23,7 +23,7 @@ import ai.chronon.aggregator.windowing._
 import ai.chronon.api
 import ai.chronon.api.DataModel.{Entities, Events}
 import ai.chronon.api.Extensions._
-import ai.chronon.api.{Accuracy, Constants, DataModel, ParametricMacro}
+import ai.chronon.api.{Accuracy, Constants, DataModel, ParametricMacro, Source}
 import ai.chronon.online.{RowWrapper, SparkConversions}
 import ai.chronon.spark.Extensions._
 import org.apache.spark.rdd.RDD
@@ -43,7 +43,7 @@ class GroupBy(val aggregations: Seq[api.Aggregation],
               val mutationDfFn: () => DataFrame = null,
               skewFilter: Option[String] = None,
               finalize: Boolean = true,
-              incAgg: Boolean = false
+              incrementalMode: Boolean = false
              )
     extends Serializable {
   @transient lazy val logger = LoggerFactory.getLogger(getClass)
@@ -99,6 +99,10 @@ class GroupBy(val aggregations: Seq[api.Aggregation],
     SparkConversions.fromChrononSchema(valueChrononSchema)
   }
 
+  lazy val flattenedAgg: RowAggregator = new RowAggregator(selectedSchema, aggregations.flatMap(_.unWindowed))
+  lazy val incrementalSchema: Array[(String, api.DataType)] = flattenedAgg.incrementalOutputSchema
+
+
   @transient
   protected[spark] lazy val windowAggregator: RowAggregator =
     new RowAggregator(selectedSchema, aggregations.flatMap(_.unpack))
@@ -509,7 +513,7 @@ object GroupBy {
            skewFilter: Option[String] = None,
            finalize: Boolean = true,
            showDf: Boolean = false,
-           incrementalAgg: Boolean = false): GroupBy = {
+           incrementalMode: Boolean = false): GroupBy = {
     logger.info(s"\n----[Processing GroupBy: ${groupByConfOld.metaData.name}]----")
     val groupByConf = replaceJoinSource(groupByConfOld, queryRange, tableUtils, computeDependency, showDf)
     val inputDf = groupByConf.sources.toScala
@@ -593,17 +597,21 @@ object GroupBy {
       }
       df
     }
-    val finalizeValue = if (incrementalAgg) {
-      !incrementalAgg
+
+    //if incrementalMode is enabled, we do not compute finalize values
+    //IR values are stored in the table
+    val finalizeValue = if (incrementalMode) {
+      false
     } else {
       finalize
     }
+
     new GroupBy(Option(groupByConf.getAggregations).map(_.toScala).orNull,
         keyColumns,
         nullFiltered,
         mutationDfFn,
         finalize = finalizeValue,
-      incAgg = incrementalAgg,
+        incrementalMode = incrementalMode,
     )
   }
 
@@ -723,58 +731,77 @@ object GroupBy {
     query
   }
 
-  def saveAndGetIncDf(
+  /**
+    * Computes and saves the output of hopsAggregation.
+    * HopsAggregate computes event level data to daily aggregates and saves the output in IR format
+    *
+    * @param groupByConf
+    * @param range
+    * @param tableUtils
+    */
+  def computeIncrementalDf(
+                            groupByConf: api.GroupBy,
+                            range: PartitionRange,
+                            tableUtils: TableUtils,
+                          ): GroupBy = {
+
+    val incrementalOutputTable = groupByConf.metaData.incrementalOutputTable
+    val tableProps = Option(groupByConf.metaData.tableProperties)
+      .map(_.toScala)
+      .orNull
+
+    val incrementalGroupByBackfill =
+      from(groupByConf, range, tableUtils, computeDependency = true, incrementalMode = true)
+
+    val incrementalSchema = incrementalGroupByBackfill.incrementalSchema
+
+    val hops = incrementalGroupByBackfill.computeHopsAggregate(range.toTimePoints, DailyResolution)
+    val hopsDf = incrementalGroupByBackfill.convertHopsToDf(hops, incrementalSchema)
+    hopsDf.save(incrementalOutputTable, tableProps)
+
+    incrementalGroupByBackfill
+  }
+
+  def fromIncrementalDf(
                        groupByConf: api.GroupBy,
                        range: PartitionRange,
                        tableUtils: TableUtils,
                      ): GroupBy = {
-    val incOutputTable = groupByConf.metaData.incOutputTable
-    val tableProps = Option(groupByConf.metaData.tableProperties)
-      .map(_.toScala)
-      .orNull
-    //range should be modified to incremental range
-    val incGroupByBackfill = from(groupByConf, range, tableUtils, computeDependency = true, incrementalAgg = true)
-    val selectedSchema = incGroupByBackfill.selectedSchema
-    //TODO is there any other way to get incSchema?
-    val incFlattendAgg = new RowAggregator(selectedSchema, incGroupByBackfill.aggregations.flatMap(_.unWindowed))
-      val incSchema = incFlattendAgg.incSchema
-    val hops = incGroupByBackfill.computeHopsAggregate(range.toTimePoints, DailyResolution)
-    val hopsDf = incGroupByBackfill.convertHopsToDf(hops, incSchema)
-    hopsDf.save(incOutputTable, tableProps)
-
-    val maxWindow = groupByConf.maxWindow.get
+
+
+    val incrementalGroupByBackfill: GroupBy = computeIncrementalDf(groupByConf, range, tableUtils)
+
+    val incrementalOutputTable = groupByConf.metaData.incrementalOutputTable
     val sourceQueryableRange = PartitionRange(
-      tableUtils.partitionSpec.minus(range.start, maxWindow),
+      tableUtils.partitionSpec.minus(range.start, groupByConf.maxWindow.get),
       range.end
     )(tableUtils)
 
-    val incTableFirstPartition: Option[String] = tableUtils.firstAvailablePartition(incOutputTable)
-    val incTableLastPartition: Option[String] = tableUtils.lastAvailablePartition(incOutputTable)
+    val incTableFirstPartition: Option[String] = tableUtils.firstAvailablePartition(incrementalOutputTable)
+    val incTableLastPartition: Option[String] = tableUtils.lastAvailablePartition(incrementalOutputTable)
 
     val incTableRange = PartitionRange(
       incTableFirstPartition.get,
       incTableLastPartition.get
     )(tableUtils)
 
-    //val dfQuery = groupByConf.
-    val incDfQuery = incTableRange.intersect(sourceQueryableRange).genScanQuery(null, incOutputTable)
-    val incDf: DataFrame = tableUtils.sql(incDfQuery)
-    //incGroupByBackfill.computeSawtoothAggregate(incDf, range.toTimePoints, DailyResolution)
+    val incrementalDf: DataFrame = tableUtils.sql(
+      incTableRange.intersect(sourceQueryableRange).genScanQuery(null, incrementalOutputTable)
+    )
 
-    val a = incFlattendAgg.aggregationParts.zip(groupByConf.getAggregations.toScala).map{ case (part, agg) =>
+    val incrementalAggregations = incrementalGroupByBackfill.flattenedAgg.aggregationParts.zip(groupByConf.getAggregations.toScala).map{ case (part, agg) =>
       val newAgg = agg.deepCopy()
-      newAgg.setInputColumn(part.incOutputColumnName)
+      newAgg.setInputColumn(part.incrementalOutputColumnName)
       newAgg
     }
 
-
     new GroupBy(
-      a,
+      incrementalAggregations,
       groupByConf.getKeyColumns.toScala,
-      incDf,
+      incrementalDf,
       () => null,
       finalize = true,
-      incAgg = false,
+      incrementalMode = false,
     )
 
   }
@@ -785,7 +812,7 @@ object GroupBy {
                       stepDays: Option[Int] = None,
                       overrideStartPartition: Option[String] = None,
                       skipFirstHole: Boolean = true,
-                      incrementalAgg: Boolean = true): Unit = {
+                      incrementalMode: Boolean = true): Unit = {
     assert(
       groupByConf.backfillStartDate != null,
       s"GroupBy:${groupByConf.metaData.name} has null backfillStartDate. This needs to be set for offline backfilling.")
@@ -824,9 +851,8 @@ object GroupBy {
           stepRanges.zipWithIndex.foreach {
             case (range, index) =>
               logger.info(s"Computing group by for range: $range [${index + 1}/${stepRanges.size}]")
-              val groupByBackfill: GroupBy = if (incrementalAgg) {
-                saveAndGetIncDf(groupByConf, range, tableUtils)
-                //from(groupByConf, range, tableUtils, computeDependency = true)
+              val groupByBackfill: GroupBy = if (incrementalMode) {
+                fromIncrementalDf(groupByConf, range, tableUtils)
               } else {
                 from(groupByConf, range, tableUtils, computeDependency = true)
               }

From 6263706d0c2420af3cf1e5be6ecfb118ce04e69e Mon Sep 17 00:00:00 2001
From: chaitu <chkc127@gmail.com>
Date: Thu, 19 Jun 2025 13:45:18 -0700
Subject: [PATCH 06/54] remove unused functions

---
 .../main/scala/ai/chronon/spark/GroupBy.scala | 23 ++++---------------
 1 file changed, 4 insertions(+), 19 deletions(-)

diff --git a/spark/src/main/scala/ai/chronon/spark/GroupBy.scala b/spark/src/main/scala/ai/chronon/spark/GroupBy.scala
index 36c8362618..53347dc970 100644
--- a/spark/src/main/scala/ai/chronon/spark/GroupBy.scala
+++ b/spark/src/main/scala/ai/chronon/spark/GroupBy.scala
@@ -150,11 +150,7 @@ class GroupBy(val aggregations: Seq[api.Aggregation],
     } else {
       toDf(snapshotEntitiesBase, Seq(tableUtils.partitionColumn -> StringType))
     }
-
-  def computeHopsAggregate(endTimes: Array[Long], resolution: Resolution): RDD[(KeyWithHash, HopsAggregator.OutputArrayType)] = {
-    hopsAggregate(endTimes.min, resolution)
-  }
-
+  
   def computeSawtoothAggregate(hops: RDD[(KeyWithHash, HopsAggregator.OutputArrayType)],
                                endTimes: Array[Long],
                                resolution: Resolution): RDD[(Array[Any], Array[Any])] = {
@@ -179,7 +175,7 @@ class GroupBy(val aggregations: Seq[api.Aggregation],
                          incAgg: Boolean = true): RDD[(Array[Any], Array[Any])] = {
     val endTimes: Array[Long] = partitionRange.toTimePoints
 
-    val hops = computeHopsAggregate(endTimes, resolution)
+    val hops = hopsAggregate(endTimes.min, resolution)
     computeSawtoothAggregate(hops, endTimes, resolution)
   }
 
@@ -378,21 +374,10 @@ class GroupBy(val aggregations: Seq[api.Aggregation],
     toDf(outputRdd, Seq(Constants.TimeColumn -> LongType, tableUtils.partitionColumn -> StringType))
   }
 
-  //def convertDfToOutputArrayType(df: DataFrame): RDD[(KeyWithHash, HopsAggregator.OutputArrayType)] = {
-  //  val keyBuilder: Row => KeyWithHash =
-  //    FastHashing.generateKeyBuilder(keyColumns.toArray, df.schema)
-
-  //  df.rdd
-  //    .keyBy(keyBuilder)
-  //    .mapValues(SparkConversions.toChrononRow(_, tsIndex))
-  //}
-
   def flattenOutputArrayType(hopsArrays: RDD[(KeyWithHash, HopsAggregator.OutputArrayType)]): RDD[(Array[Any], Array[Any])] = {
     hopsArrays.flatMap { case (keyWithHash: KeyWithHash, hopsArray: HopsAggregator.OutputArrayType) =>
       val hopsArrayHead: Array[HopIr] = hopsArray.headOption.get
       hopsArrayHead.map { array: HopIr =>
-        // the last element is a timestamp, we need to drop it
-        // and add it to the key
         val timestamp = array.last.asInstanceOf[Long]
         val withoutTimestamp = array.dropRight(1)
         ((keyWithHash.data :+ tableUtils.partitionSpec.at(timestamp) :+ timestamp), withoutTimestamp)
@@ -412,7 +397,7 @@ class GroupBy(val aggregations: Seq[api.Aggregation],
   // Class HopsCacher(keySchema, irSchema, resolution) extends RddCacher[(KeyWithHash, HopsOutput)]
   //  buildTableRow((keyWithHash, hopsOutput)) -> GenericRowWithSchema
   //  buildRddRow(GenericRowWithSchema) -> (keyWithHash, hopsOutput)
-  def hopsAggregate(minQueryTs: Long, resolution: Resolution, incAgg: Boolean = false): RDD[(KeyWithHash, HopsAggregator.OutputArrayType)] = {
+  def hopsAggregate(minQueryTs: Long, resolution: Resolution): RDD[(KeyWithHash, HopsAggregator.OutputArrayType)] = {
     val hopsAggregator =
       new HopsAggregator(minQueryTs, aggregations, selectedSchema, resolution)
     val keyBuilder: Row => KeyWithHash =
@@ -755,7 +740,7 @@ object GroupBy {
 
     val incrementalSchema = incrementalGroupByBackfill.incrementalSchema
 
-    val hops = incrementalGroupByBackfill.computeHopsAggregate(range.toTimePoints, DailyResolution)
+    val hops = incrementalGroupByBackfill.hopsAggregate(range.toTimePoints.min, DailyResolution)
     val hopsDf = incrementalGroupByBackfill.convertHopsToDf(hops, incrementalSchema)
     hopsDf.save(incrementalOutputTable, tableProps)
 

From cb4325ba90456e3d93a0d4ec365bcdd329318b0e Mon Sep 17 00:00:00 2001
From: chaitu <chkc127@gmail.com>
Date: Thu, 19 Jun 2025 14:00:33 -0700
Subject: [PATCH 07/54] change function defs

---
 .../main/scala/ai/chronon/spark/GroupBy.scala | 20 +++++++------------
 1 file changed, 7 insertions(+), 13 deletions(-)

diff --git a/spark/src/main/scala/ai/chronon/spark/GroupBy.scala b/spark/src/main/scala/ai/chronon/spark/GroupBy.scala
index 53347dc970..d56877af69 100644
--- a/spark/src/main/scala/ai/chronon/spark/GroupBy.scala
+++ b/spark/src/main/scala/ai/chronon/spark/GroupBy.scala
@@ -150,10 +150,13 @@ class GroupBy(val aggregations: Seq[api.Aggregation],
     } else {
       toDf(snapshotEntitiesBase, Seq(tableUtils.partitionColumn -> StringType))
     }
-  
-  def computeSawtoothAggregate(hops: RDD[(KeyWithHash, HopsAggregator.OutputArrayType)],
-                               endTimes: Array[Long],
-                               resolution: Resolution): RDD[(Array[Any], Array[Any])] = {
+
+  def snapshotEventsBase(partitionRange: PartitionRange,
+                         resolution: Resolution = DailyResolution,
+                         incAgg: Boolean = true): RDD[(Array[Any], Array[Any])] = {
+    val endTimes: Array[Long] = partitionRange.toTimePoints
+
+    val hops = hopsAggregate(endTimes.min, resolution)
     // add 1 day to the end times to include data [ds 00:00:00.000, ds + 1 00:00:00.000)
     val shiftedEndTimes = endTimes.map(_ + tableUtils.partitionSpec.spanMillis)
     val sawtoothAggregator = new SawtoothAggregator(aggregations, selectedSchema, resolution)
@@ -170,15 +173,6 @@ class GroupBy(val aggregations: Seq[api.Aggregation],
       }
   }
 
-  def snapshotEventsBase(partitionRange: PartitionRange,
-                         resolution: Resolution = DailyResolution,
-                         incAgg: Boolean = true): RDD[(Array[Any], Array[Any])] = {
-    val endTimes: Array[Long] = partitionRange.toTimePoints
-
-    val hops = hopsAggregate(endTimes.min, resolution)
-    computeSawtoothAggregate(hops, endTimes, resolution)
-  }
-
   // Calculate snapshot accurate windows for ALL keys at pre-defined "endTimes"
   // At this time, we hardcode the resolution to Daily, but it is straight forward to support
   // hourly resolution.

From 796ef9660bcbee5a8263e990b282787ab59f53aa Mon Sep 17 00:00:00 2001
From: chaitu <chkc127@gmail.com>
Date: Thu, 19 Jun 2025 14:01:43 -0700
Subject: [PATCH 08/54] make changes

---
 spark/src/main/scala/ai/chronon/spark/GroupBy.scala | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/spark/src/main/scala/ai/chronon/spark/GroupBy.scala b/spark/src/main/scala/ai/chronon/spark/GroupBy.scala
index d56877af69..dfbfd43090 100644
--- a/spark/src/main/scala/ai/chronon/spark/GroupBy.scala
+++ b/spark/src/main/scala/ai/chronon/spark/GroupBy.scala
@@ -155,11 +155,11 @@ class GroupBy(val aggregations: Seq[api.Aggregation],
                          resolution: Resolution = DailyResolution,
                          incAgg: Boolean = true): RDD[(Array[Any], Array[Any])] = {
     val endTimes: Array[Long] = partitionRange.toTimePoints
-
-    val hops = hopsAggregate(endTimes.min, resolution)
     // add 1 day to the end times to include data [ds 00:00:00.000, ds + 1 00:00:00.000)
     val shiftedEndTimes = endTimes.map(_ + tableUtils.partitionSpec.spanMillis)
     val sawtoothAggregator = new SawtoothAggregator(aggregations, selectedSchema, resolution)
+    val hops = hopsAggregate(endTimes.min, resolution)
+
     hops
       .flatMap {
         case (keys, hopsArrays) =>

From f218b231bd3e430850f3304c1f9be40dfdcdace3 Mon Sep 17 00:00:00 2001
From: chaitu <chkc127@gmail.com>
Date: Thu, 19 Jun 2025 14:03:46 -0700
Subject: [PATCH 09/54] change function order

---
 spark/src/main/scala/ai/chronon/spark/GroupBy.scala | 3 +--
 1 file changed, 1 insertion(+), 2 deletions(-)

diff --git a/spark/src/main/scala/ai/chronon/spark/GroupBy.scala b/spark/src/main/scala/ai/chronon/spark/GroupBy.scala
index dfbfd43090..cb557a50f3 100644
--- a/spark/src/main/scala/ai/chronon/spark/GroupBy.scala
+++ b/spark/src/main/scala/ai/chronon/spark/GroupBy.scala
@@ -152,8 +152,7 @@ class GroupBy(val aggregations: Seq[api.Aggregation],
     }
 
   def snapshotEventsBase(partitionRange: PartitionRange,
-                         resolution: Resolution = DailyResolution,
-                         incAgg: Boolean = true): RDD[(Array[Any], Array[Any])] = {
+                         resolution: Resolution = DailyResolution): RDD[(Array[Any], Array[Any])] = {
     val endTimes: Array[Long] = partitionRange.toTimePoints
     // add 1 day to the end times to include data [ds 00:00:00.000, ds + 1 00:00:00.000)
     val shiftedEndTimes = endTimes.map(_ + tableUtils.partitionSpec.spanMillis)

From b1d4ee99b96a9671cec04e9ced50a73760de44ed Mon Sep 17 00:00:00 2001
From: chaitu <chkc127@gmail.com>
Date: Fri, 20 Jun 2025 10:52:14 -0700
Subject: [PATCH 10/54] add new field is_incremental to python api

---
 api/py/ai/chronon/group_by.py | 2 ++
 api/thrift/api.thrift         | 1 +
 2 files changed, 3 insertions(+)

diff --git a/api/py/ai/chronon/group_by.py b/api/py/ai/chronon/group_by.py
index b2290e34e8..e919b74060 100644
--- a/api/py/ai/chronon/group_by.py
+++ b/api/py/ai/chronon/group_by.py
@@ -349,6 +349,7 @@ def GroupBy(
     tags: Dict[str, str] = None,
     derivations: List[ttypes.Derivation] = None,
     deprecation_date: str = None,
+    is_incremental: bool = False,
     **kwargs,
 ) -> ttypes.GroupBy:
     """
@@ -556,6 +557,7 @@ def _normalize_source(source):
         backfillStartDate=backfill_start_date,
         accuracy=accuracy,
         derivations=derivations,
+        isIncremental=is_incremental,
     )
     validate_group_by(group_by)
     return group_by
diff --git a/api/thrift/api.thrift b/api/thrift/api.thrift
index 3fd8f5428a..16f19d1681 100644
--- a/api/thrift/api.thrift
+++ b/api/thrift/api.thrift
@@ -278,6 +278,7 @@ struct GroupBy {
     6: optional string backfillStartDate
     // Optional derivation list
     7: optional list<Derivation> derivations
+    8: optional bool isIncremental
 }
 
 struct JoinPart {

From 2ab7659c18db9c764a3d3e0cd078f649d9ac4d26 Mon Sep 17 00:00:00 2001
From: chaitu <chkc127@gmail.com>
Date: Fri, 20 Jun 2025 10:54:01 -0700
Subject: [PATCH 11/54] get argument for isIncremental in scala spark backend

---
 spark/src/main/scala/ai/chronon/spark/Driver.scala |  3 ++-
 .../src/main/scala/ai/chronon/spark/GroupBy.scala  | 14 +++++++-------
 2 files changed, 9 insertions(+), 8 deletions(-)

diff --git a/spark/src/main/scala/ai/chronon/spark/Driver.scala b/spark/src/main/scala/ai/chronon/spark/Driver.scala
index 560d6d261a..748c4765bf 100644
--- a/spark/src/main/scala/ai/chronon/spark/Driver.scala
+++ b/spark/src/main/scala/ai/chronon/spark/Driver.scala
@@ -386,7 +386,8 @@ object Driver {
         tableUtils,
         args.stepDays.toOption,
         args.startPartitionOverride.toOption,
-        !args.runFirstHole()
+        !args.runFirstHole(),
+        args.groupByConf.isIncremental
       )
 
       if (args.shouldExport()) {
diff --git a/spark/src/main/scala/ai/chronon/spark/GroupBy.scala b/spark/src/main/scala/ai/chronon/spark/GroupBy.scala
index cb557a50f3..0f607680c5 100644
--- a/spark/src/main/scala/ai/chronon/spark/GroupBy.scala
+++ b/spark/src/main/scala/ai/chronon/spark/GroupBy.scala
@@ -497,12 +497,12 @@ object GroupBy {
     val inputDf = groupByConf.sources.toScala
       .map { source =>
         renderDataSourceQuery(groupByConf,
-          source,
-          groupByConf.getKeyColumns.toScala,
-          queryRange,
-          tableUtils,
-          groupByConf.maxWindow,
-          groupByConf.inferredAccuracy)
+                              source,
+                              groupByConf.getKeyColumns.toScala,
+                              queryRange,
+                              tableUtils,
+                              groupByConf.maxWindow,
+                              groupByConf.inferredAccuracy)
 
       }
       .map {
@@ -790,7 +790,7 @@ object GroupBy {
                       stepDays: Option[Int] = None,
                       overrideStartPartition: Option[String] = None,
                       skipFirstHole: Boolean = true,
-                      incrementalMode: Boolean = true): Unit = {
+                      incrementalMode: Boolean = false): Unit = {
     assert(
       groupByConf.backfillStartDate != null,
       s"GroupBy:${groupByConf.metaData.name} has null backfillStartDate. This needs to be set for offline backfilling.")

From 238c781f5b8adb13928ed3fa6634843746bc0516 Mon Sep 17 00:00:00 2001
From: chaitu <chkc127@gmail.com>
Date: Fri, 20 Jun 2025 15:36:56 -0700
Subject: [PATCH 12/54] add unit test for incremental groupby

---
 .../ai/chronon/spark/test/GroupByTest.scala   | 94 +++++++++++++++++++
 1 file changed, 94 insertions(+)

diff --git a/spark/src/test/scala/ai/chronon/spark/test/GroupByTest.scala b/spark/src/test/scala/ai/chronon/spark/test/GroupByTest.scala
index dd979e4422..baa7a1cbf4 100644
--- a/spark/src/test/scala/ai/chronon/spark/test/GroupByTest.scala
+++ b/spark/src/test/scala/ai/chronon/spark/test/GroupByTest.scala
@@ -23,6 +23,7 @@ import ai.chronon.api.{Aggregation, Builders, Constants, Derivation, DoubleType,
 import ai.chronon.online.{RowWrapper, SparkConversions}
 import ai.chronon.spark.Extensions._
 import ai.chronon.spark._
+import ai.chronon.spark.test.TestUtils.makeDf
 import com.google.gson.Gson
 import org.apache.spark.rdd.RDD
 import org.apache.spark.sql.types.{StructField, StructType, LongType => SparkLongType, StringType => SparkStringType}
@@ -423,6 +424,7 @@ class GroupByTest {
              additionalAgg = aggs)
   }
 
+
   private def createTestSource(windowSize: Int = 365, suffix: String = ""): (Source, String) = {
     lazy val spark: SparkSession = SparkSessionBuilder.build("GroupByTest" + "_" + Random.alphanumeric.take(6).mkString, local = true)
     implicit val tableUtils = TableUtils(spark)
@@ -694,4 +696,96 @@ class GroupByTest {
              tableUtils = tableUtils,
              additionalAgg = aggs)
   }
+
+  @Test
+  def testIncrementalMode(): Unit = {
+    lazy val spark: SparkSession = SparkSessionBuilder.build("GroupByTest" + "_" + Random.alphanumeric.take(6).mkString, local = true)
+    implicit val tableUtils = TableUtils(spark)
+
+    val namespace = "test_incremental_group_by"
+
+    val schema =
+      ai.chronon.api.StructType(
+        "test_incremental_group_by",
+        Array(
+          ai.chronon.api.StructField("user", StringType),
+          ai.chronon.api.StructField("purchase_price", IntType),
+          ai.chronon.api.StructField("ds", StringType),
+          ai.chronon.api.StructField("ts", LongType)
+        )
+      )
+
+
+    val sourceTable = "test_incremental_group_by_" + Random.alphanumeric.take(6).mkString
+    val data = List(
+        Row("user1", 100, "2025-06-01", 1748772000000L),
+        Row("user2", 200, "2025-06-01", 1748772000000L),
+        Row("user3", 300, "2025-06-01", 1748772000000L),
+    )
+    val df = makeDf(spark, schema, data).save(s"${namespace}.${sourceTable}")
+
+
+    val source = Builders.Source.events(
+      query = Builders.Query(selects = Builders.Selects("ts", "user", "purchase_price", "ds"),
+        startPartition = "2025-06-01"),
+      table = sourceTable
+    )
+
+    // Define aggregations
+    val aggregations: Seq[Aggregation] = Seq(
+      Builders.Aggregation(Operation.SUM, "purchase_price", Seq(new Window(3, TimeUnit.DAYS))),
+      Builders.Aggregation(Operation.COUNT, "purchase_price", Seq(new Window(3, TimeUnit.DAYS)))
+    )
+
+    val groupByConf = Builders.GroupBy(
+      sources = Seq(source),
+      keyColumns = Seq("user"),
+      aggregations = aggregations,
+      metaData = Builders.MetaData(name = "intermediate_output_gb", namespace = "test_incremental_group_by", team = "chronon"),
+      backfillStartDate = tableUtils.partitionSpec.minus(tableUtils.partitionSpec.at(System.currentTimeMillis()),
+        new Window(60, TimeUnit.DAYS)),
+    )
+
+    val outputTableName = groupByConf.metaData.incrementalOutputTable
+
+    GroupBy.computeIncrementalDf(
+        groupByConf,
+        PartitionRange("2025-06-01", "2025-06-01"),
+        tableUtils
+    )
+
+    //check if the table exists
+    assertTrue(s"Output table $outputTableName should exist", spark.catalog.tableExists(outputTableName))
+    
+    // Create GroupBy with incrementalMode = true
+    /*
+    val incrementalGroupBy = new GroupBy(
+      aggregations = aggregations,
+      keyColumns = Seq("user"),
+      inputDf = df,
+      incrementalMode = true
+    )
+     */
+    
+    // Test that incremental schema is available
+    //val incrementalSchema = incrementalGroupBy.incrementalSchema
+    //assertNotNull("Incremental schema should not be null", incrementalSchema)
+    //assertEquals("Should have correct number of incremental schema columns",
+    //            aggregations.length, incrementalSchema.length)
+
+
+    // Test that we can compute snapshot events
+    //val result = incrementalGroupBy.snapshotEvents(PartitionRange("2025-06-01", "2025-06-01"))
+    //println("================================")
+    //println(df.show())
+    //println(result.show())
+    //println("================================")
+    //assertNotNull("Should be able to compute snapshot events", result)
+    //assertTrue("Result should have data", result.count() > 100)
+    
+    // Verify that result contains expected columns
+    //val resultColumns = result.columns.toSet
+    //assertTrue("Result should contain user column", resultColumns.contains("user"))
+    //assertTrue("Result should contain partition column", resultColumns.contains(tableUtils.partitionColumn))
+  }
 }

From 8edfd2785aab51adf858d43b7e526a5124ad7065 Mon Sep 17 00:00:00 2001
From: chaitu <chkc127@gmail.com>
Date: Thu, 17 Jul 2025 21:00:54 -0700
Subject: [PATCH 13/54] reuse table ccreation

---
 .../ai/chronon/spark/test/GroupByTest.scala   | 45 +++++++------------
 1 file changed, 15 insertions(+), 30 deletions(-)

diff --git a/spark/src/test/scala/ai/chronon/spark/test/GroupByTest.scala b/spark/src/test/scala/ai/chronon/spark/test/GroupByTest.scala
index baa7a1cbf4..de934561e6 100644
--- a/spark/src/test/scala/ai/chronon/spark/test/GroupByTest.scala
+++ b/spark/src/test/scala/ai/chronon/spark/test/GroupByTest.scala
@@ -699,49 +699,31 @@ class GroupByTest {
 
   @Test
   def testIncrementalMode(): Unit = {
-    lazy val spark: SparkSession = SparkSessionBuilder.build("GroupByTest" + "_" + Random.alphanumeric.take(6).mkString, local = true)
+    lazy val spark: SparkSession = SparkSessionBuilder.build("GroupByTestIncremental" + "_" + Random.alphanumeric.take(6).mkString, local = true)
     implicit val tableUtils = TableUtils(spark)
 
-    val namespace = "test_incremental_group_by"
-
-    val schema =
-      ai.chronon.api.StructType(
-        "test_incremental_group_by",
-        Array(
-          ai.chronon.api.StructField("user", StringType),
-          ai.chronon.api.StructField("purchase_price", IntType),
-          ai.chronon.api.StructField("ds", StringType),
-          ai.chronon.api.StructField("ts", LongType)
-        )
-      )
-
-
-    val sourceTable = "test_incremental_group_by_" + Random.alphanumeric.take(6).mkString
-    val data = List(
-        Row("user1", 100, "2025-06-01", 1748772000000L),
-        Row("user2", 200, "2025-06-01", 1748772000000L),
-        Row("user3", 300, "2025-06-01", 1748772000000L),
+    val schema = List(
+      Column("user", StringType, 10), // ts = last 10 days
+      Column("session_length", IntType, 2),
+      Column("rating", DoubleType, 2000)
     )
-    val df = makeDf(spark, schema, data).save(s"${namespace}.${sourceTable}")
 
+    val df = DataFrameGen.events(spark, schema, count = 100000, partitions = 100)
 
-    val source = Builders.Source.events(
-      query = Builders.Query(selects = Builders.Selects("ts", "user", "purchase_price", "ds"),
-        startPartition = "2025-06-01"),
-      table = sourceTable
-    )
+    println(s"Input DataFrame: ${df.count()}")
 
-    // Define aggregations
     val aggregations: Seq[Aggregation] = Seq(
-      Builders.Aggregation(Operation.SUM, "purchase_price", Seq(new Window(3, TimeUnit.DAYS))),
-      Builders.Aggregation(Operation.COUNT, "purchase_price", Seq(new Window(3, TimeUnit.DAYS)))
+        Builders.Aggregation(Operation.APPROX_UNIQUE_COUNT, "session_length", Seq(new Window(10, TimeUnit.DAYS), WindowUtils.Unbounded)),
+        Builders.Aggregation(Operation.UNIQUE_COUNT, "session_length", Seq(new Window(10, TimeUnit.DAYS), WindowUtils.Unbounded)),
+        Builders.Aggregation(Operation.SUM, "session_length", Seq(new Window(10, TimeUnit.DAYS)))
     )
 
+
     val groupByConf = Builders.GroupBy(
       sources = Seq(source),
       keyColumns = Seq("user"),
       aggregations = aggregations,
-      metaData = Builders.MetaData(name = "intermediate_output_gb", namespace = "test_incremental_group_by", team = "chronon"),
+      metaData = Builders.MetaData(name = "intermediate_output_gb", namespace = namespace, team = "chronon"),
       backfillStartDate = tableUtils.partitionSpec.minus(tableUtils.partitionSpec.at(System.currentTimeMillis()),
         new Window(60, TimeUnit.DAYS)),
     )
@@ -756,6 +738,9 @@ class GroupByTest {
 
     //check if the table exists
     assertTrue(s"Output table $outputTableName should exist", spark.catalog.tableExists(outputTableName))
+
+    val df1 = spark.sql(s"SELECT * FROM $outputTableName").toDF()
+    println(s"Table output ${df1.show()}")
     
     // Create GroupBy with incrementalMode = true
     /*

From e903683dc6d7d3a28a943d0bfd95e7e16a20ef5b Mon Sep 17 00:00:00 2001
From: chaitu <chkc127@gmail.com>
Date: Fri, 18 Jul 2025 09:39:14 -0700
Subject: [PATCH 14/54] Update GroupByTest

---
 .../main/scala/ai/chronon/spark/GroupBy.scala | 20 +++--
 .../ai/chronon/spark/test/GroupByTest.scala   | 83 +++++++------------
 2 files changed, 43 insertions(+), 60 deletions(-)

diff --git a/spark/src/main/scala/ai/chronon/spark/GroupBy.scala b/spark/src/main/scala/ai/chronon/spark/GroupBy.scala
index 0f607680c5..939843be1f 100644
--- a/spark/src/main/scala/ai/chronon/spark/GroupBy.scala
+++ b/spark/src/main/scala/ai/chronon/spark/GroupBy.scala
@@ -419,6 +419,16 @@ class GroupBy(val aggregations: Seq[api.Aggregation],
       windowAggregator.normalize(ir)
     }
 
+
+    def computeIncrementalDf(incrementalOutputTable: String,
+                             range: PartitionRange,
+                             tableProps: Map[String, String]) = {
+
+      val hops = hopsAggregate(range.toTimePoints.min, DailyResolution)
+      println(s"Saving incremental hops to ${hops.map(x  => x._1.data.mkString(",")).take(20)}.")
+      val hopsDf: DataFrame = convertHopsToDf(hops, incrementalSchema)
+      hopsDf.save(incrementalOutputTable, tableProps)
+    }
 }
 
 // TODO: truncate queryRange for caching
@@ -723,19 +733,15 @@ object GroupBy {
                             tableUtils: TableUtils,
                           ): GroupBy = {
 
-    val incrementalOutputTable = groupByConf.metaData.incrementalOutputTable
-    val tableProps = Option(groupByConf.metaData.tableProperties)
+    val incrementalOutputTable: String = groupByConf.metaData.incrementalOutputTable
+    val tableProps: Map[String, String] = Option(groupByConf.metaData.tableProperties)
       .map(_.toScala)
       .orNull
 
     val incrementalGroupByBackfill =
       from(groupByConf, range, tableUtils, computeDependency = true, incrementalMode = true)
 
-    val incrementalSchema = incrementalGroupByBackfill.incrementalSchema
-
-    val hops = incrementalGroupByBackfill.hopsAggregate(range.toTimePoints.min, DailyResolution)
-    val hopsDf = incrementalGroupByBackfill.convertHopsToDf(hops, incrementalSchema)
-    hopsDf.save(incrementalOutputTable, tableProps)
+    incrementalGroupByBackfill.computeIncrementalDf(incrementalOutputTable, range, tableProps)
 
     incrementalGroupByBackfill
   }
diff --git a/spark/src/test/scala/ai/chronon/spark/test/GroupByTest.scala b/spark/src/test/scala/ai/chronon/spark/test/GroupByTest.scala
index de934561e6..8cb886c498 100644
--- a/spark/src/test/scala/ai/chronon/spark/test/GroupByTest.scala
+++ b/spark/src/test/scala/ai/chronon/spark/test/GroupByTest.scala
@@ -701,7 +701,8 @@ class GroupByTest {
   def testIncrementalMode(): Unit = {
     lazy val spark: SparkSession = SparkSessionBuilder.build("GroupByTestIncremental" + "_" + Random.alphanumeric.take(6).mkString, local = true)
     implicit val tableUtils = TableUtils(spark)
-
+    val namespace = "incremental"
+    tableUtils.createDatabase(namespace)
     val schema = List(
       Column("user", StringType, 10), // ts = last 10 days
       Column("session_length", IntType, 2),
@@ -713,64 +714,40 @@ class GroupByTest {
     println(s"Input DataFrame: ${df.count()}")
 
     val aggregations: Seq[Aggregation] = Seq(
-        Builders.Aggregation(Operation.APPROX_UNIQUE_COUNT, "session_length", Seq(new Window(10, TimeUnit.DAYS), WindowUtils.Unbounded)),
-        Builders.Aggregation(Operation.UNIQUE_COUNT, "session_length", Seq(new Window(10, TimeUnit.DAYS), WindowUtils.Unbounded)),
+        //Builders.Aggregation(Operation.APPROX_UNIQUE_COUNT, "session_length", Seq(new Window(10, TimeUnit.DAYS), WindowUtils.Unbounded)),
+        //Builders.Aggregation(Operation.UNIQUE_COUNT, "session_length", Seq(new Window(10, TimeUnit.DAYS), WindowUtils.Unbounded)),
         Builders.Aggregation(Operation.SUM, "session_length", Seq(new Window(10, TimeUnit.DAYS)))
     )
 
-
-    val groupByConf = Builders.GroupBy(
-      sources = Seq(source),
-      keyColumns = Seq("user"),
-      aggregations = aggregations,
-      metaData = Builders.MetaData(name = "intermediate_output_gb", namespace = namespace, team = "chronon"),
-      backfillStartDate = tableUtils.partitionSpec.minus(tableUtils.partitionSpec.at(System.currentTimeMillis()),
-        new Window(60, TimeUnit.DAYS)),
+    val tableProps: Map[String, String] = Map(
+      "source" -> "chronon"
     )
 
-    val outputTableName = groupByConf.metaData.incrementalOutputTable
+    val groupBy = new GroupBy(aggregations, Seq("user"), df)
+    groupBy.computeIncrementalDf("incremental.testIncrementalOutput", PartitionRange("2025-05-01", "2025-06-01"), tableProps)
 
-    GroupBy.computeIncrementalDf(
-        groupByConf,
-        PartitionRange("2025-06-01", "2025-06-01"),
-        tableUtils
-    )
+    val actualIncrementalDf = spark.sql(s"select * from incremental.testIncrementalOutput where ds='2025-05-11'")
+    df.createOrReplaceTempView("test_incremental_input")
+    //spark.sql(s"select * from test_incremental_input where user='user7' and ds='2025-05-11'").show(numRows=100)
+
+    spark.sql(s"select * from incremental.testIncrementalOutput where ds='2025-05-11'").show()
+
+    val query =
+      s"""
+         |select user, ds, UNIX_TIMESTAMP(ds, 'yyyy-MM-dd')*1000 as ts, sum(session_length) as session_length_sum
+         |from test_incremental_input
+         |where ds='2025-05-11'
+         |group by user, ds, UNIX_TIMESTAMP(ds, 'yyyy-MM-dd')*1000
+         |""".stripMargin
+
+    val expectedDf = spark.sql(query)
+
+    val diff = Comparison.sideBySide(actualIncrementalDf, expectedDf, List("user", tableUtils.partitionColumn))
+    if (diff.count() > 0) {
+      diff.show()
+      println("diff result rows")
+    }
+    assertEquals(0, diff.count())
 
-    //check if the table exists
-    assertTrue(s"Output table $outputTableName should exist", spark.catalog.tableExists(outputTableName))
-
-    val df1 = spark.sql(s"SELECT * FROM $outputTableName").toDF()
-    println(s"Table output ${df1.show()}")
-    
-    // Create GroupBy with incrementalMode = true
-    /*
-    val incrementalGroupBy = new GroupBy(
-      aggregations = aggregations,
-      keyColumns = Seq("user"),
-      inputDf = df,
-      incrementalMode = true
-    )
-     */
-    
-    // Test that incremental schema is available
-    //val incrementalSchema = incrementalGroupBy.incrementalSchema
-    //assertNotNull("Incremental schema should not be null", incrementalSchema)
-    //assertEquals("Should have correct number of incremental schema columns",
-    //            aggregations.length, incrementalSchema.length)
-
-
-    // Test that we can compute snapshot events
-    //val result = incrementalGroupBy.snapshotEvents(PartitionRange("2025-06-01", "2025-06-01"))
-    //println("================================")
-    //println(df.show())
-    //println(result.show())
-    //println("================================")
-    //assertNotNull("Should be able to compute snapshot events", result)
-    //assertTrue("Result should have data", result.count() > 100)
-    
-    // Verify that result contains expected columns
-    //val resultColumns = result.columns.toSet
-    //assertTrue("Result should contain user column", resultColumns.contains("user"))
-    //assertTrue("Result should contain partition column", resultColumns.contains(tableUtils.partitionColumn))
   }
 }

From 0bdc4fc5670ecebc1d5cfe3de4c9677485c47307 Mon Sep 17 00:00:00 2001
From: chaitu <chkc127@gmail.com>
Date: Fri, 18 Jul 2025 15:43:23 -0700
Subject: [PATCH 15/54] Add GroupByTest for events

---
 .../ai/chronon/spark/test/GroupByTest.scala   | 56 +++++++++++++++++++
 1 file changed, 56 insertions(+)

diff --git a/spark/src/test/scala/ai/chronon/spark/test/GroupByTest.scala b/spark/src/test/scala/ai/chronon/spark/test/GroupByTest.scala
index 8cb886c498..5a7d2f7e58 100644
--- a/spark/src/test/scala/ai/chronon/spark/test/GroupByTest.scala
+++ b/spark/src/test/scala/ai/chronon/spark/test/GroupByTest.scala
@@ -101,6 +101,7 @@ class GroupByTest {
 
     val groupBy = new GroupBy(aggregations, Seq("user"), df)
     val actualDf = groupBy.snapshotEvents(PartitionRange(outputDates.min, outputDates.max))
+
     val outputDatesRdd: RDD[Row] = spark.sparkContext.parallelize(outputDates.map(Row(_)))
     val outputDatesDf = spark.createDataFrame(outputDatesRdd, StructType(Seq(StructField("ds", SparkStringType))))
     val datesViewName = "test_group_by_snapshot_events_output_range"
@@ -748,6 +749,61 @@ class GroupByTest {
       println("diff result rows")
     }
     assertEquals(0, diff.count())
+  }
+
+  @Test
+  def testSnapshotIncrementalEvents(): Unit = {
+    lazy val spark: SparkSession = SparkSessionBuilder.build("GroupByTest" + "_" + Random.alphanumeric.take(6).mkString, local = true)
+    implicit val tableUtils = TableUtils(spark)
+    val schema = List(
+      Column("user", StringType, 10), // ts = last 10 days
+      Column("session_length", IntType, 2),
+      Column("rating", DoubleType, 2000)
+    )
+
+    val outputDates = CStream.genPartitions(10, tableUtils.partitionSpec)
+
+    val df = DataFrameGen.events(spark, schema, count = 100000, partitions = 100)
+    df.drop("ts") // snapshots don't need ts.
+    val viewName = "test_group_by_snapshot_events"
+    df.createOrReplaceTempView(viewName)
+    val aggregations: Seq[Aggregation] = Seq(
+      Builders.Aggregation(Operation.SUM, "session_length", Seq(new Window(10, TimeUnit.DAYS))),
+        Builders.Aggregation(Operation.SUM, "rating", Seq(new Window(10, TimeUnit.DAYS)))
+    )
+
+    val groupBy = new GroupBy(aggregations, Seq("user"), df)
+    val actualDf = groupBy.snapshotEvents(PartitionRange(outputDates.min, outputDates.max))
+
+    val groupByIncremental = new GroupBy(aggregations, Seq("user"), df, incrementalMode = true)
+    val actualDfIncremental = groupByIncremental.snapshotEvents(PartitionRange(outputDates.min, outputDates.max))
+
+    val outputDatesRdd: RDD[Row] = spark.sparkContext.parallelize(outputDates.map(Row(_)))
+    val outputDatesDf = spark.createDataFrame(outputDatesRdd, StructType(Seq(StructField("ds", SparkStringType))))
+    val datesViewName = "test_group_by_snapshot_events_output_range"
+    outputDatesDf.createOrReplaceTempView(datesViewName)
+    val expectedDf = df.sqlContext.sql(s"""
+                                          |select user,
+                                          |       $datesViewName.ds,
+                                          |       SUM(IF(ts  >= (unix_timestamp($datesViewName.ds, 'yyyy-MM-dd') - 86400*(10-1)) * 1000, session_length, null)) AS session_length_sum_10d,
+                                          |       SUM(IF(ts  >= (unix_timestamp($datesViewName.ds, 'yyyy-MM-dd') - 86400*(10-1)) * 1000, rating, null)) AS rating_sum_10d
+                                          |FROM $viewName CROSS JOIN $datesViewName
+                                          |WHERE ts < unix_timestamp($datesViewName.ds, 'yyyy-MM-dd') * 1000 + ${tableUtils.partitionSpec.spanMillis}
+                                          |group by user, $datesViewName.ds
+                                          |""".stripMargin)
+
+    val diff = Comparison.sideBySide(actualDf, expectedDf, List("user", tableUtils.partitionColumn))
+    if (diff.count() > 0) {
+      diff.show()
+      println("diff result rows")
+    }
+    assertEquals(0, diff.count())
 
+    val diffIncremental = Comparison.sideBySide(actualDfIncremental, expectedDf, List("user", tableUtils.partitionColumn))
+    if (diffIncremental.count() > 0) {
+      diffIncremental.show()
+      println("diff result rows incremental")
+    }
+    assertEquals(0, diffIncremental.count())
   }
 }

From 7987931b1fda6229a689fc0779ce137d846269c7 Mon Sep 17 00:00:00 2001
From: chaitu <chkc127@gmail.com>
Date: Tue, 2 Sep 2025 21:45:19 -0700
Subject: [PATCH 16/54] changes for incrementalg

---
 api/py/test/sample/scripts/spark_submit.sh | 9 ++++++++-
 api/py/test/sample/teams.json              | 2 +-
 2 files changed, 9 insertions(+), 2 deletions(-)

diff --git a/api/py/test/sample/scripts/spark_submit.sh b/api/py/test/sample/scripts/spark_submit.sh
index 45102e8843..ef048a5532 100644
--- a/api/py/test/sample/scripts/spark_submit.sh
+++ b/api/py/test/sample/scripts/spark_submit.sh
@@ -28,13 +28,14 @@
 
 set -euxo pipefail
 CHRONON_WORKING_DIR=${CHRONON_TMPDIR:-/tmp}/${USER}
+echo $CHRONON_WORKING_DIR
 mkdir -p ${CHRONON_WORKING_DIR}
 export TEST_NAME="${APP_NAME}_${USER}_test"
 unset PYSPARK_DRIVER_PYTHON
 unset PYSPARK_PYTHON
 unset SPARK_HOME
 unset SPARK_CONF_DIR
-export LOG4J_FILE="${CHRONON_WORKING_DIR}/log4j_file"
+export LOG4J_FILE="${CHRONON_WORKING_DIR}/log4j.properties"
 cat > ${LOG4J_FILE} << EOF
 log4j.rootLogger=INFO, stdout
 log4j.appender.stdout=org.apache.log4j.ConsoleAppender
@@ -47,6 +48,9 @@ EOF
 $SPARK_SUBMIT_PATH \
 --driver-java-options " -Dlog4j.configuration=file:${LOG4J_FILE}" \
 --conf "spark.executor.extraJavaOptions= -XX:ParallelGCThreads=4 -XX:+UseParallelGC -XX:+UseCompressedOops" \
+--conf "spark.driver.extraJavaOptions=-agentlib:jdwp=transport=dt_socket,server=y,suspend=y,address=5005 -Dlog4j.configuration=file:${LOG4J_FILE}" \
+--conf "spark.sql.warehouse.dir=/home/chaitu/projects/chronon/spark-warehouse" \
+--conf "javax.jdo.option.ConnectionURL=jdbc:derby:;databaseName=/home/chaitu/projects/chronon/hive-metastore/metastore_db;create=true" \
 --conf spark.sql.shuffle.partitions=${PARALLELISM:-4000} \
 --conf spark.dynamicAllocation.maxExecutors=${MAX_EXECUTORS:-1000} \
 --conf spark.default.parallelism=${PARALLELISM:-4000} \
@@ -77,3 +81,6 @@ tee ${CHRONON_WORKING_DIR}/${APP_NAME}_spark.log
 
 
 
+
+#--conf "spark.driver.extraJavaOptions=-agentlib:jdwp=transport=dt_socket,server=y,suspend=y,address=5005 -Dlog4j.rootLogger=INFO,console" \
+
diff --git a/api/py/test/sample/teams.json b/api/py/test/sample/teams.json
index 39f7a25559..a60502b65d 100644
--- a/api/py/test/sample/teams.json
+++ b/api/py/test/sample/teams.json
@@ -5,7 +5,7 @@
         },
         "common_env": {
             "VERSION": "latest",
-            "SPARK_SUBMIT_PATH": "[TODO]/path/to/spark-submit",
+            "SPARK_SUBMIT_PATH": "spark-submit",
             "JOB_MODE": "local[*]",
             "HADOOP_DIR": "[STREAMING-TODO]/path/to/folder/containing",
             "CHRONON_ONLINE_CLASS": "[ONLINE-TODO]your.online.class",

From 7b62a43415a45abffe837b4a89d3fdd4370d13db Mon Sep 17 00:00:00 2001
From: chaitu <chkc127@gmail.com>
Date: Fri, 5 Sep 2025 02:14:13 -0700
Subject: [PATCH 17/54] add last hole logic for incrementnal bacckfill

---
 .../main/scala/ai/chronon/spark/GroupBy.scala | 43 ++++++++++++++++---
 1 file changed, 38 insertions(+), 5 deletions(-)

diff --git a/spark/src/main/scala/ai/chronon/spark/GroupBy.scala b/spark/src/main/scala/ai/chronon/spark/GroupBy.scala
index d536ed2876..e5f265c5b4 100644
--- a/spark/src/main/scala/ai/chronon/spark/GroupBy.scala
+++ b/spark/src/main/scala/ai/chronon/spark/GroupBy.scala
@@ -36,6 +36,7 @@ import org.slf4j.LoggerFactory
 import java.util
 import scala.collection.{Seq, mutable}
 import scala.util.ScalaJavaConversions.{JListOps, ListOps, MapOps}
+import _root_.com.google.common.collect.Table
 
 class GroupBy(val aggregations: Seq[api.Aggregation],
               val keyColumns: Seq[String],
@@ -426,7 +427,6 @@ class GroupBy(val aggregations: Seq[api.Aggregation],
                              tableProps: Map[String, String]) = {
 
       val hops = hopsAggregate(range.toTimePoints.min, DailyResolution)
-      println(s"Saving incremental hops to ${hops.map(x  => x._1.data.mkString(",")).take(20)}.")
       val hopsDf: DataFrame = convertHopsToDf(hops, incrementalSchema)
       hopsDf.save(incrementalOutputTable, tableProps)
     }
@@ -748,14 +748,49 @@ object GroupBy {
       .map(_.toScala)
       .orNull
 
+    logger.info(s"Writing incremental df to $incrementalOutputTable")
     val incrementalGroupByBackfill =
       from(groupByConf, range, tableUtils, computeDependency = true, incrementalMode = true)
 
-    incrementalGroupByBackfill.computeIncrementalDf(incrementalOutputTable, range, tableProps)
+    val incTableRange = PartitionRange(
+      tableUtils.firstAvailablePartition(incrementalOutputTable).get,
+      tableUtils.lastAvailablePartition(incrementalOutputTable).get
+    )(tableUtils)
+
+    val allPartitionRangeHoles: Option[Seq[PartitionRange]] = computePartitionRangeHoles(incTableRange, range, tableUtils)
+
+    allPartitionRangeHoles.foreach { holes =>
+      holes.foreach { hole =>
+        logger.info(s"Filling hole in incremental table: $hole")
+        incrementalGroupByBackfill.computeIncrementalDf(incrementalOutputTable, range, tableProps)
+      }
+    }
 
     incrementalGroupByBackfill
   }
 
+/**
+  * Compute the holes in the incremental output table
+  * 
+  * holes are partitions that are not in the incremetnal output table but are in the source queryable range
+  *
+  * @param incTableRange the range of the incremental output table
+  * @param sourceQueryableRange the range of the source queryable range
+  * @return the holes in the incremental output table
+  */
+  private def computePartitionRangeHoles(
+                                 incTableRange: PartitionRange,
+                                 queryRange: PartitionRange,
+                                 tableUtils: TableUtils): Option[Seq[PartitionRange]] = {
+
+
+     if (queryRange.end <= incTableRange.end) {
+      None
+     } else {
+       Some(Seq(PartitionRange(tableUtils.partitionSpec.shift(incTableRange.end, 1), queryRange.end)))
+     }
+  }
+
   def fromIncrementalDf(
                        groupByConf: api.GroupBy,
                        range: PartitionRange,
@@ -779,9 +814,7 @@ object GroupBy {
       incTableLastPartition.get
     )(tableUtils)
 
-    val incrementalDf: DataFrame = tableUtils.sql(
-      incTableRange.intersect(sourceQueryableRange).genScanQuery(null, incrementalOutputTable)
-    )
+    val (_, incrementalDf: DataFrame) =  incTableRange.intersect(sourceQueryableRange).scanQueryStringAndDf(null, incrementalOutputTable)
 
     val incrementalAggregations = incrementalGroupByBackfill.flattenedAgg.aggregationParts.zip(groupByConf.getAggregations.toScala).map{ case (part, agg) =>
       val newAgg = agg.deepCopy()

From aeeb5ecb71f773a262ebd2333bd42e3363bcc1a5 Mon Sep 17 00:00:00 2001
From: chaitu <chkc127@gmail.com>
Date: Fri, 5 Sep 2025 02:20:14 -0700
Subject: [PATCH 18/54] fix syntax

---
 spark/src/main/scala/ai/chronon/spark/GroupBy.scala | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/spark/src/main/scala/ai/chronon/spark/GroupBy.scala b/spark/src/main/scala/ai/chronon/spark/GroupBy.scala
index e5f265c5b4..b1e4390984 100644
--- a/spark/src/main/scala/ai/chronon/spark/GroupBy.scala
+++ b/spark/src/main/scala/ai/chronon/spark/GroupBy.scala
@@ -787,7 +787,7 @@ object GroupBy {
      if (queryRange.end <= incTableRange.end) {
       None
      } else {
-       Some(Seq(PartitionRange(tableUtils.partitionSpec.shift(incTableRange.end, 1), queryRange.end)))
+       Some(Seq(PartitionRange(tableUtils.partitionSpec.shift(incTableRange.end, 1), queryRange.end)(tableUtils)))
      }
   }
 

From 9180d233cdd74976cdb18528b08552f198f5841e Mon Sep 17 00:00:00 2001
From: chaitu <chkc127@gmail.com>
Date: Sat, 6 Sep 2025 01:09:01 -0700
Subject: [PATCH 19/54] fix bug : backfill only for missing holes

---
 spark/src/main/scala/ai/chronon/spark/GroupBy.scala | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/spark/src/main/scala/ai/chronon/spark/GroupBy.scala b/spark/src/main/scala/ai/chronon/spark/GroupBy.scala
index b1e4390984..a8f5b4e45b 100644
--- a/spark/src/main/scala/ai/chronon/spark/GroupBy.scala
+++ b/spark/src/main/scala/ai/chronon/spark/GroupBy.scala
@@ -762,7 +762,7 @@ object GroupBy {
     allPartitionRangeHoles.foreach { holes =>
       holes.foreach { hole =>
         logger.info(s"Filling hole in incremental table: $hole")
-        incrementalGroupByBackfill.computeIncrementalDf(incrementalOutputTable, range, tableProps)
+        incrementalGroupByBackfill.computeIncrementalDf(incrementalOutputTable, hole, tableProps)
       }
     }
 

From ee81672109f80a022d3b53b268e21486b66698c9 Mon Sep 17 00:00:00 2001
From: chaitu <chkc127@gmail.com>
Date: Sun, 7 Sep 2025 05:38:37 -0700
Subject: [PATCH 20/54] fix none error for inc Table

---
 .../main/scala/ai/chronon/spark/GroupBy.scala | 21 ++++++++++++-------
 1 file changed, 13 insertions(+), 8 deletions(-)

diff --git a/spark/src/main/scala/ai/chronon/spark/GroupBy.scala b/spark/src/main/scala/ai/chronon/spark/GroupBy.scala
index a8f5b4e45b..fe142949d5 100644
--- a/spark/src/main/scala/ai/chronon/spark/GroupBy.scala
+++ b/spark/src/main/scala/ai/chronon/spark/GroupBy.scala
@@ -752,14 +752,20 @@ object GroupBy {
     val incrementalGroupByBackfill =
       from(groupByConf, range, tableUtils, computeDependency = true, incrementalMode = true)
 
-    val incTableRange = PartitionRange(
-      tableUtils.firstAvailablePartition(incrementalOutputTable).get,
-      tableUtils.lastAvailablePartition(incrementalOutputTable).get
-    )(tableUtils)
+    val incTableRange: Option[PartitionRange] = for {
+        first <- tableUtils.firstAvailablePartition(incrementalOutputTable)
+        last  <- tableUtils.lastAvailablePartition(incrementalOutputTable)
+    } yield
+      PartitionRange(first, last)(tableUtils)
 
-    val allPartitionRangeHoles: Option[Seq[PartitionRange]] = computePartitionRangeHoles(incTableRange, range, tableUtils)
 
-    allPartitionRangeHoles.foreach { holes =>
+    val partitionRangeHoles: Option[Seq[PartitionRange]] = incTableRange match {
+      case None => Some(Seq(range))
+      case Some(incrementalTableRange) =>
+        computePartitionRangeHoles(incrementalTableRange, range, tableUtils)
+    }
+
+    partitionRangeHoles.foreach { holes =>
       holes.foreach { hole =>
         logger.info(s"Filling hole in incremental table: $hole")
         incrementalGroupByBackfill.computeIncrementalDf(incrementalOutputTable, hole, tableProps)
@@ -771,8 +777,7 @@ object GroupBy {
 
 /**
   * Compute the holes in the incremental output table
-  * 
-  * holes are partitions that are not in the incremetnal output table but are in the source queryable range
+  *
   *
   * @param incTableRange the range of the incremental output table
   * @param sourceQueryableRange the range of the source queryable range

From 29a3f2814ff8c720f0f7528fc7c9c53e9e1749ae Mon Sep 17 00:00:00 2001
From: chaitu <chkc127@gmail.com>
Date: Fri, 19 Sep 2025 13:41:11 -0700
Subject: [PATCH 21/54] add incremental table queryable range

---
 .../main/scala/ai/chronon/spark/GroupBy.scala | 68 ++++++-------------
 1 file changed, 19 insertions(+), 49 deletions(-)

diff --git a/spark/src/main/scala/ai/chronon/spark/GroupBy.scala b/spark/src/main/scala/ai/chronon/spark/GroupBy.scala
index fe142949d5..debf47f1b9 100644
--- a/spark/src/main/scala/ai/chronon/spark/GroupBy.scala
+++ b/spark/src/main/scala/ai/chronon/spark/GroupBy.scala
@@ -741,29 +741,27 @@ object GroupBy {
                             groupByConf: api.GroupBy,
                             range: PartitionRange,
                             tableUtils: TableUtils,
-                          ): GroupBy = {
+                            incrementalOutputTable: String,
+                            incrementalGroupByBackfill: GroupBy,
+                          ): PartitionRange = {
 
-    val incrementalOutputTable: String = groupByConf.metaData.incrementalOutputTable
     val tableProps: Map[String, String] = Option(groupByConf.metaData.tableProperties)
       .map(_.toScala)
       .orNull
 
-    logger.info(s"Writing incremental df to $incrementalOutputTable")
-    val incrementalGroupByBackfill =
-      from(groupByConf, range, tableUtils, computeDependency = true, incrementalMode = true)
+    val incrementalQueryableRange = PartitionRange(
+      tableUtils.partitionSpec.minus(range.start, groupByConf.maxWindow.get),
+      range.end
+    )(tableUtils)
 
-    val incTableRange: Option[PartitionRange] = for {
-        first <- tableUtils.firstAvailablePartition(incrementalOutputTable)
-        last  <- tableUtils.lastAvailablePartition(incrementalOutputTable)
-    } yield
-      PartitionRange(first, last)(tableUtils)
 
+    logger.info(s"Writing incremental df to $incrementalOutputTable")
 
-    val partitionRangeHoles: Option[Seq[PartitionRange]] = incTableRange match {
-      case None => Some(Seq(range))
-      case Some(incrementalTableRange) =>
-        computePartitionRangeHoles(incrementalTableRange, range, tableUtils)
-    }
+
+    val partitionRangeHoles: Option[Seq[PartitionRange]] = tableUtils.unfilledRanges(
+      incrementalOutputTable,
+      incrementalQueryableRange,
+    )
 
     partitionRangeHoles.foreach { holes =>
       holes.foreach { hole =>
@@ -772,29 +770,9 @@ object GroupBy {
       }
     }
 
-    incrementalGroupByBackfill
+    incrementalQueryableRange
   }
 
-/**
-  * Compute the holes in the incremental output table
-  *
-  *
-  * @param incTableRange the range of the incremental output table
-  * @param sourceQueryableRange the range of the source queryable range
-  * @return the holes in the incremental output table
-  */
-  private def computePartitionRangeHoles(
-                                 incTableRange: PartitionRange,
-                                 queryRange: PartitionRange,
-                                 tableUtils: TableUtils): Option[Seq[PartitionRange]] = {
-
-
-     if (queryRange.end <= incTableRange.end) {
-      None
-     } else {
-       Some(Seq(PartitionRange(tableUtils.partitionSpec.shift(incTableRange.end, 1), queryRange.end)(tableUtils)))
-     }
-  }
 
   def fromIncrementalDf(
                        groupByConf: api.GroupBy,
@@ -802,24 +780,16 @@ object GroupBy {
                        tableUtils: TableUtils,
                      ): GroupBy = {
 
+    val incrementalOutputTable = groupByConf.metaData.incrementalOutputTable
 
-    val incrementalGroupByBackfill: GroupBy = computeIncrementalDf(groupByConf, range, tableUtils)
+    val incrementalGroupByBackfill =
+      from(groupByConf, range, tableUtils, computeDependency = true, incrementalMode = true)
 
-    val incrementalOutputTable = groupByConf.metaData.incrementalOutputTable
-    val sourceQueryableRange = PartitionRange(
-      tableUtils.partitionSpec.minus(range.start, groupByConf.maxWindow.get),
-      range.end
-    )(tableUtils)
 
-    val incTableFirstPartition: Option[String] = tableUtils.firstAvailablePartition(incrementalOutputTable)
-    val incTableLastPartition: Option[String] = tableUtils.lastAvailablePartition(incrementalOutputTable)
+    val incrementalQueryableRange = computeIncrementalDf(groupByConf, range, tableUtils, incrementalOutputTable, incrementalGroupByBackfill)
 
-    val incTableRange = PartitionRange(
-      incTableFirstPartition.get,
-      incTableLastPartition.get
-    )(tableUtils)
+    val (_, incrementalDf: DataFrame) =  incrementalQueryableRange.scanQueryStringAndDf(null, incrementalOutputTable)
 
-    val (_, incrementalDf: DataFrame) =  incTableRange.intersect(sourceQueryableRange).scanQueryStringAndDf(null, incrementalOutputTable)
 
     val incrementalAggregations = incrementalGroupByBackfill.flattenedAgg.aggregationParts.zip(groupByConf.getAggregations.toScala).map{ case (part, agg) =>
       val newAgg = agg.deepCopy()

From aa1601084f0676f040a1b9fd40aaf0c34f599078 Mon Sep 17 00:00:00 2001
From: chaitu <chkc127@gmail.com>
Date: Fri, 19 Sep 2025 14:10:40 -0700
Subject: [PATCH 22/54] add logging for tableUtils

---
 .../src/main/scala/ai/chronon/spark/TableUtils.scala | 12 ++++++++++++
 1 file changed, 12 insertions(+)

diff --git a/spark/src/main/scala/ai/chronon/spark/TableUtils.scala b/spark/src/main/scala/ai/chronon/spark/TableUtils.scala
index a83943407d..44d7b19758 100644
--- a/spark/src/main/scala/ai/chronon/spark/TableUtils.scala
+++ b/spark/src/main/scala/ai/chronon/spark/TableUtils.scala
@@ -855,6 +855,8 @@ case class TableUtils(sparkSession: SparkSession) {
                      inputToOutputShift: Int = 0,
                      skipFirstHole: Boolean = true): Option[Seq[PartitionRange]] = {
 
+    logger.info(s"-----------UnfilledRanges---------------------")
+    logger.info(s"unfilled range called for output table: $outputTable")
     val validPartitionRange = if (outputPartitionRange.start == null) { // determine partition range automatically
       val inputStart = inputTables.flatMap(
         _.map(table =>
@@ -872,6 +874,8 @@ case class TableUtils(sparkSession: SparkSession) {
     } else {
       outputPartitionRange
     }
+
+    logger.info(s"Determined valid partition range: $validPartitionRange")
     val outputExisting = partitions(outputTable)
     // To avoid recomputing partitions removed by retention mechanisms we will not fill holes in the very beginning of the range
     // If a user fills a new partition in the newer end of the range, then we will never fill any partitions before that range.
@@ -881,13 +885,19 @@ case class TableUtils(sparkSession: SparkSession) {
     } else {
       validPartitionRange.start
     }
+
+    logger.info(s"Cutoff partition for skipping holes is set to $cutoffPartition")
     val fillablePartitions =
       if (skipFirstHole) {
         validPartitionRange.partitions.toSet.filter(_ >= cutoffPartition)
       } else {
         validPartitionRange.partitions.toSet
       }
+
+    logger.info(s"Fillable partitions : ${fillablePartitions}")
     val outputMissing = fillablePartitions -- outputExisting
+
+    logger.info(s"outputMissing : ${outputMissing}")
     val allInputExisting = inputTables
       .map { tables =>
         tables
@@ -900,6 +910,8 @@ case class TableUtils(sparkSession: SparkSession) {
       }
       .getOrElse(fillablePartitions)
 
+    logger.info(s"allInputExisting : ${allInputExisting}")
+
     val inputMissing = fillablePartitions -- allInputExisting
     val missingPartitions = outputMissing -- inputMissing
     val missingChunks = chunk(missingPartitions)

From ff41cc9f317db023e5323d776e68daabac97e729 Mon Sep 17 00:00:00 2001
From: chaitu <chkc127@gmail.com>
Date: Fri, 19 Sep 2025 15:34:15 -0700
Subject: [PATCH 23/54] add log

---
 spark/src/main/scala/ai/chronon/spark/TableUtils.scala | 1 +
 1 file changed, 1 insertion(+)

diff --git a/spark/src/main/scala/ai/chronon/spark/TableUtils.scala b/spark/src/main/scala/ai/chronon/spark/TableUtils.scala
index 44d7b19758..2710ccb873 100644
--- a/spark/src/main/scala/ai/chronon/spark/TableUtils.scala
+++ b/spark/src/main/scala/ai/chronon/spark/TableUtils.scala
@@ -877,6 +877,7 @@ case class TableUtils(sparkSession: SparkSession) {
 
     logger.info(s"Determined valid partition range: $validPartitionRange")
     val outputExisting = partitions(outputTable)
+    logger.info(s"outputExisting : ${outputExisting}")
     // To avoid recomputing partitions removed by retention mechanisms we will not fill holes in the very beginning of the range
     // If a user fills a new partition in the newer end of the range, then we will never fill any partitions before that range.
     // We instead log a message saying why we won't fill the earliest hole.

From aa25f9fb499c8a0ad4565b21fa1754fe18ecb6a3 Mon Sep 17 00:00:00 2001
From: chaitu <chkc127@gmail.com>
Date: Sun, 21 Sep 2025 23:19:47 -0700
Subject: [PATCH 24/54] fill incremental holes

---
 .../scala/ai/chronon/spark/DataRange.scala    |  5 +++
 .../main/scala/ai/chronon/spark/GroupBy.scala | 35 ++++++++++---------
 2 files changed, 23 insertions(+), 17 deletions(-)

diff --git a/spark/src/main/scala/ai/chronon/spark/DataRange.scala b/spark/src/main/scala/ai/chronon/spark/DataRange.scala
index b0af96e05d..e6f13e0ea0 100644
--- a/spark/src/main/scala/ai/chronon/spark/DataRange.scala
+++ b/spark/src/main/scala/ai/chronon/spark/DataRange.scala
@@ -53,6 +53,11 @@ case class PartitionRange(start: String, end: String)(implicit tableUtils: Table
     }
   }
 
+  def daysBetween: Int = {
+    if (start == null || end == null) 0
+    else Stream.iterate(start)(tableUtils.partitionSpec.after).takeWhile(_ <= end).size
+  }
+
   def isSingleDay: Boolean = {
     start == end
   }
diff --git a/spark/src/main/scala/ai/chronon/spark/GroupBy.scala b/spark/src/main/scala/ai/chronon/spark/GroupBy.scala
index debf47f1b9..ac45afb825 100644
--- a/spark/src/main/scala/ai/chronon/spark/GroupBy.scala
+++ b/spark/src/main/scala/ai/chronon/spark/GroupBy.scala
@@ -23,7 +23,7 @@ import ai.chronon.aggregator.windowing._
 import ai.chronon.api
 import ai.chronon.api.DataModel.{Entities, Events}
 import ai.chronon.api.Extensions._
-import ai.chronon.api.{Accuracy, Constants, DataModel, ParametricMacro, Source}
+import ai.chronon.api.{Accuracy, Constants, DataModel, ParametricMacro, Source, TimeUnit, Window}
 import ai.chronon.online.{RowWrapper, SparkConversions}
 import ai.chronon.spark.Extensions._
 import org.apache.spark.rdd.RDD
@@ -505,6 +505,8 @@ object GroupBy {
            incrementalMode: Boolean = false): GroupBy = {
     logger.info(s"\n----[Processing GroupBy: ${groupByConfOld.metaData.name}]----")
     val groupByConf = replaceJoinSource(groupByConfOld, queryRange, tableUtils, computeDependency, showDf)
+    val sourceQueryWindow: Option[Window]  = if (incrementalMode) Some(new Window(queryRange.daysBetween, TimeUnit.DAYS)) else groupByConf.maxWindow
+    val backfillQueryRange: PartitionRange = if (incrementalMode) PartitionRange(queryRange.end, queryRange.end)(tableUtils) else queryRange
     val inputDf = groupByConf.sources.toScala
       .map { source =>
         val partitionColumn = tableUtils.getPartitionColumn(source.query)
@@ -513,9 +515,9 @@ object GroupBy {
             groupByConf,
             source,
             groupByConf.getKeyColumns.toScala,
-            queryRange,
+            backfillQueryRange,
             tableUtils,
-            groupByConf.maxWindow,
+            sourceQueryWindow,
             groupByConf.inferredAccuracy,
             partitionColumn = partitionColumn
           ),
@@ -742,8 +744,7 @@ object GroupBy {
                             range: PartitionRange,
                             tableUtils: TableUtils,
                             incrementalOutputTable: String,
-                            incrementalGroupByBackfill: GroupBy,
-                          ): PartitionRange = {
+                          ): (PartitionRange, Seq[api.AggregationPart]) = {
 
     val tableProps: Map[String, String] = Option(groupByConf.metaData.tableProperties)
       .map(_.toScala)
@@ -754,23 +755,28 @@ object GroupBy {
       range.end
     )(tableUtils)
 
-
     logger.info(s"Writing incremental df to $incrementalOutputTable")
 
-
     val partitionRangeHoles: Option[Seq[PartitionRange]] = tableUtils.unfilledRanges(
       incrementalOutputTable,
       incrementalQueryableRange,
     )
 
-    partitionRangeHoles.foreach { holes =>
+    val incrementalGroupByAggParts  = partitionRangeHoles.map { holes =>
       holes.foreach { hole =>
         logger.info(s"Filling hole in incremental table: $hole")
+        val incrementalGroupByBackfill =
+          from(groupByConf, hole, tableUtils, computeDependency = true, incrementalMode = true)
         incrementalGroupByBackfill.computeIncrementalDf(incrementalOutputTable, hole, tableProps)
       }
-    }
 
-    incrementalQueryableRange
+      holes.headOption.map { firstHole =>
+        from(groupByConf, firstHole, tableUtils, computeDependency = true, incrementalMode = true)
+          .flattenedAgg.aggregationParts
+        }.getOrElse(Seq.empty)
+      }.getOrElse(Seq.empty)
+
+    (incrementalQueryableRange, incrementalGroupByAggParts)
   }
 
 
@@ -782,16 +788,11 @@ object GroupBy {
 
     val incrementalOutputTable = groupByConf.metaData.incrementalOutputTable
 
-    val incrementalGroupByBackfill =
-      from(groupByConf, range, tableUtils, computeDependency = true, incrementalMode = true)
-
-
-    val incrementalQueryableRange = computeIncrementalDf(groupByConf, range, tableUtils, incrementalOutputTable, incrementalGroupByBackfill)
+    val (incrementalQueryableRange, aggregationParts) = computeIncrementalDf(groupByConf, range, tableUtils, incrementalOutputTable)
 
     val (_, incrementalDf: DataFrame) =  incrementalQueryableRange.scanQueryStringAndDf(null, incrementalOutputTable)
 
-
-    val incrementalAggregations = incrementalGroupByBackfill.flattenedAgg.aggregationParts.zip(groupByConf.getAggregations.toScala).map{ case (part, agg) =>
+    val incrementalAggregations = aggregationParts.zip(groupByConf.getAggregations.toScala).map{ case (part, agg) =>
       val newAgg = agg.deepCopy()
       newAgg.setInputColumn(part.incrementalOutputColumnName)
       newAgg

From 3efe8cd588edba06cb558fbc3621148a149e9244 Mon Sep 17 00:00:00 2001
From: chaitu <chkc127@gmail.com>
Date: Wed, 1 Oct 2025 23:07:06 -0700
Subject: [PATCH 25/54] modify incremental aggregation parts

---
 spark/src/main/scala/ai/chronon/spark/GroupBy.scala | 10 ++++------
 1 file changed, 4 insertions(+), 6 deletions(-)

diff --git a/spark/src/main/scala/ai/chronon/spark/GroupBy.scala b/spark/src/main/scala/ai/chronon/spark/GroupBy.scala
index ac45afb825..5131a18121 100644
--- a/spark/src/main/scala/ai/chronon/spark/GroupBy.scala
+++ b/spark/src/main/scala/ai/chronon/spark/GroupBy.scala
@@ -763,18 +763,16 @@ object GroupBy {
     )
 
     val incrementalGroupByAggParts  = partitionRangeHoles.map { holes =>
-      holes.foreach { hole =>
+      val incrementalAggregationParts = holes.map{ hole =>
         logger.info(s"Filling hole in incremental table: $hole")
         val incrementalGroupByBackfill =
           from(groupByConf, hole, tableUtils, computeDependency = true, incrementalMode = true)
         incrementalGroupByBackfill.computeIncrementalDf(incrementalOutputTable, hole, tableProps)
+        incrementalGroupByBackfill.flattenedAgg.aggregationParts
       }
 
-      holes.headOption.map { firstHole =>
-        from(groupByConf, firstHole, tableUtils, computeDependency = true, incrementalMode = true)
-          .flattenedAgg.aggregationParts
-        }.getOrElse(Seq.empty)
-      }.getOrElse(Seq.empty)
+      incrementalAggregationParts.headOption.getOrElse(Seq.empty)
+    }.getOrElse(Seq.empty)
 
     (incrementalQueryableRange, incrementalGroupByAggParts)
   }

From a3bece636429584fd324940ef6a241dc9506d902 Mon Sep 17 00:00:00 2001
From: chaitu <chkc127@gmail.com>
Date: Mon, 6 Oct 2025 22:10:14 -0700
Subject: [PATCH 26/54] remove logs for debugging

---
 spark/src/main/scala/ai/chronon/spark/TableUtils.scala | 10 +---------
 1 file changed, 1 insertion(+), 9 deletions(-)

diff --git a/spark/src/main/scala/ai/chronon/spark/TableUtils.scala b/spark/src/main/scala/ai/chronon/spark/TableUtils.scala
index 2710ccb873..3879bf1636 100644
--- a/spark/src/main/scala/ai/chronon/spark/TableUtils.scala
+++ b/spark/src/main/scala/ai/chronon/spark/TableUtils.scala
@@ -855,8 +855,6 @@ case class TableUtils(sparkSession: SparkSession) {
                      inputToOutputShift: Int = 0,
                      skipFirstHole: Boolean = true): Option[Seq[PartitionRange]] = {
 
-    logger.info(s"-----------UnfilledRanges---------------------")
-    logger.info(s"unfilled range called for output table: $outputTable")
     val validPartitionRange = if (outputPartitionRange.start == null) { // determine partition range automatically
       val inputStart = inputTables.flatMap(
         _.map(table =>
@@ -874,8 +872,6 @@ case class TableUtils(sparkSession: SparkSession) {
     } else {
       outputPartitionRange
     }
-
-    logger.info(s"Determined valid partition range: $validPartitionRange")
     val outputExisting = partitions(outputTable)
     logger.info(s"outputExisting : ${outputExisting}")
     // To avoid recomputing partitions removed by retention mechanisms we will not fill holes in the very beginning of the range
@@ -895,10 +891,8 @@ case class TableUtils(sparkSession: SparkSession) {
         validPartitionRange.partitions.toSet
       }
 
-    logger.info(s"Fillable partitions : ${fillablePartitions}")
     val outputMissing = fillablePartitions -- outputExisting
 
-    logger.info(s"outputMissing : ${outputMissing}")
     val allInputExisting = inputTables
       .map { tables =>
         tables
@@ -910,9 +904,7 @@ case class TableUtils(sparkSession: SparkSession) {
           .map(partitionSpec.shift(_, inputToOutputShift))
       }
       .getOrElse(fillablePartitions)
-
-    logger.info(s"allInputExisting : ${allInputExisting}")
-
+    
     val inputMissing = fillablePartitions -- allInputExisting
     val missingPartitions = outputMissing -- inputMissing
     val missingChunks = chunk(missingPartitions)

From 897d18c274e522f634308298b1e3aee06dbe64c9 Mon Sep 17 00:00:00 2001
From: chaitu <chkc127@gmail.com>
Date: Sun, 12 Oct 2025 13:36:20 -0700
Subject: [PATCH 27/54] fix output schema from incremenntal aggregations. Added
 unit tests

---
 .../aggregator/row/RowAggregator.scala        |  18 +-
 .../main/scala/ai/chronon/spark/GroupBy.scala |  22 ++-
 .../ai/chronon/spark/test/GroupByTest.scala   | 174 ++++++++++++------
 3 files changed, 151 insertions(+), 63 deletions(-)

diff --git a/aggregator/src/main/scala/ai/chronon/aggregator/row/RowAggregator.scala b/aggregator/src/main/scala/ai/chronon/aggregator/row/RowAggregator.scala
index e9d0608d25..43b2807740 100644
--- a/aggregator/src/main/scala/ai/chronon/aggregator/row/RowAggregator.scala
+++ b/aggregator/src/main/scala/ai/chronon/aggregator/row/RowAggregator.scala
@@ -24,7 +24,10 @@ import scala.collection.Seq
 
 // The primary API of the aggregator package.
 // the semantics are to mutate values in place for performance reasons
-class RowAggregator(val inputSchema: Seq[(String, DataType)], val aggregationParts: Seq[AggregationPart])
+// userAggregationParts is used when incrementalMode = True.
+class RowAggregator(val inputSchema: Seq[(String, DataType)],
+                    val aggregationParts: Seq[AggregationPart],
+                    val userInputAggregationParts: Option[Seq[AggregationPart]] = None )
     extends Serializable
     with SimpleAggregator[Row, Array[Any], Array[Any]] {
 
@@ -70,16 +73,25 @@ class RowAggregator(val inputSchema: Seq[(String, DataType)], val aggregationPar
     .toArray
     .zip(columnAggregators.map(_.irType))
 
-  val incrementalOutputSchema = aggregationParts
+  val incrementalOutputSchema: Array[(String, DataType)] = aggregationParts
     .map(_.incrementalOutputColumnName)
     .toArray
     .zip(columnAggregators.map(_.irType))
 
-  val outputSchema: Array[(String, DataType)] = aggregationParts
+  val aggregationPartsOutputSchema: Array[(String, DataType)] = aggregationParts
     .map(_.outputColumnName)
     .toArray
     .zip(columnAggregators.map(_.outputType))
 
+  val outputSchema: Array[(String, DataType)] = userInputAggregationParts
+    .map{ parts =>
+      parts
+        .map(_.outputColumnName)
+        .toArray
+        .zip(columnAggregators.map(_.outputType))
+    }.getOrElse(aggregationPartsOutputSchema)
+
+
   val isNotDeletable: Boolean = columnAggregators.forall(!_.isDeletable)
 
   // this will mutate in place
diff --git a/spark/src/main/scala/ai/chronon/spark/GroupBy.scala b/spark/src/main/scala/ai/chronon/spark/GroupBy.scala
index 5131a18121..034e1e1342 100644
--- a/spark/src/main/scala/ai/chronon/spark/GroupBy.scala
+++ b/spark/src/main/scala/ai/chronon/spark/GroupBy.scala
@@ -44,7 +44,7 @@ class GroupBy(val aggregations: Seq[api.Aggregation],
               val mutationDfFn: () => DataFrame = null,
               skewFilter: Option[String] = None,
               finalize: Boolean = true,
-              incrementalMode: Boolean = false
+              val userInputAggregations: Seq[api.Aggregation] = null
              )
     extends Serializable {
   @transient lazy val logger = LoggerFactory.getLogger(getClass)
@@ -105,8 +105,13 @@ class GroupBy(val aggregations: Seq[api.Aggregation],
 
 
   @transient
-  protected[spark] lazy val windowAggregator: RowAggregator =
-    new RowAggregator(selectedSchema, aggregations.flatMap(_.unpack))
+  protected[spark] lazy val windowAggregator: RowAggregator = {
+    if (userInputAggregations != null) {
+       new RowAggregator(selectedSchema, aggregations.flatMap(_.unpack), Option(userInputAggregations.flatMap(_.unpack)))
+    } else {
+        new RowAggregator(selectedSchema, aggregations.flatMap(_.unpack))
+    }
+  }
 
   def snapshotEntitiesBase: RDD[(Array[Any], Array[Any])] = {
     val keys = (keyColumns :+ tableUtils.partitionColumn).toArray
@@ -421,7 +426,12 @@ class GroupBy(val aggregations: Seq[api.Aggregation],
       windowAggregator.normalize(ir)
     }
 
-
+  /**
+   * computes incremental daily table
+   * @param incrementalOutputTable output of the incremental data stored here
+   * @param range date range to calculate daily aggregatiosn
+   * @param tableProps
+   */
     def computeIncrementalDf(incrementalOutputTable: String,
                              range: PartitionRange,
                              tableProps: Map[String, String]) = {
@@ -610,7 +620,6 @@ object GroupBy {
         nullFiltered,
         mutationDfFn,
         finalize = finalizeValue,
-        incrementalMode = incrementalMode,
     )
   }
 
@@ -760,6 +769,7 @@ object GroupBy {
     val partitionRangeHoles: Option[Seq[PartitionRange]] = tableUtils.unfilledRanges(
       incrementalOutputTable,
       incrementalQueryableRange,
+      skipFirstHole = false
     )
 
     val incrementalGroupByAggParts  = partitionRangeHoles.map { holes =>
@@ -802,7 +812,7 @@ object GroupBy {
       incrementalDf,
       () => null,
       finalize = true,
-      incrementalMode = false,
+      userInputAggregations=groupByConf.aggregations.toScala
     )
 
   }
diff --git a/spark/src/test/scala/ai/chronon/spark/test/GroupByTest.scala b/spark/src/test/scala/ai/chronon/spark/test/GroupByTest.scala
index b7fa2ef651..e4f04ebc1b 100644
--- a/spark/src/test/scala/ai/chronon/spark/test/GroupByTest.scala
+++ b/spark/src/test/scala/ai/chronon/spark/test/GroupByTest.scala
@@ -19,20 +19,7 @@ package ai.chronon.spark.test
 import ai.chronon.aggregator.test.{CStream, Column, NaiveAggregator}
 import ai.chronon.aggregator.windowing.FiveMinuteResolution
 import ai.chronon.api.Extensions._
-import ai.chronon.api.{
-  Aggregation,
-  Builders,
-  Constants,
-  Derivation,
-  DoubleType,
-  IntType,
-  LongType,
-  Operation,
-  Source,
-  StringType,
-  TimeUnit,
-  Window
-}
+import ai.chronon.api.{Aggregation, Builders, Constants, Derivation, DoubleType, IntType, LongType, Operation, Source, StringType, TimeUnit, Window}
 import ai.chronon.online.{RowWrapper, SparkConversions}
 import ai.chronon.spark.Extensions._
 import ai.chronon.spark._
@@ -961,11 +948,51 @@ class GroupByTest {
     assertTrue("Should be able to filter GroupBy results", filteredResult.count() >= 0)
   }
 
+
+  private def createTestSourceIncremental(windowSize: Int = 365,
+                               suffix: String = "",
+                               partitionColOpt: Option[String] = None): (Source, String) = {
+    lazy val spark: SparkSession =
+      SparkSessionBuilder.build("GroupByIncrementalTest" + "_" + Random.alphanumeric.take(6).mkString, local = true)
+    implicit val tableUtils = TableUtils(spark)
+    val today = tableUtils.partitionSpec.at(System.currentTimeMillis())
+    val startPartition = tableUtils.partitionSpec.minus(today, new Window(windowSize, TimeUnit.DAYS))
+    val endPartition = tableUtils.partitionSpec.at(System.currentTimeMillis())
+    val sourceSchema = List(
+      Column("user", StringType, 10), // ts = last 10 days
+      Column("session_length", IntType, 2),
+      Column("rating", DoubleType, 2000)
+    )
+    val namespace = "chronon_incremental_test"
+    val sourceTable = s"$namespace.test_group_by_steps$suffix"
+
+    tableUtils.createDatabase(namespace)
+    val genDf =
+      DataFrameGen.events(spark, sourceSchema, count = 1000, partitions = 200, partitionColOpt = partitionColOpt)
+    partitionColOpt match {
+      case Some(partitionCol) => genDf.save(sourceTable, partitionColumns = Seq(partitionCol))
+      case None               => genDf.save(sourceTable)
+    }
+
+    val source = Builders.Source.events(
+      query = Builders.Query(selects = Builders.Selects("ts", "user", "time_spent_ms", "price"),
+        startPartition = startPartition,
+        partitionColumn = partitionColOpt.orNull),
+      table = sourceTable
+    )
+    (source, endPartition)
+  }
+
+  /**
+   * the test compute daily intermediate aggregations
+   *
+   *  Test is one daily partition data is correct
+   */
   @Test
-  def testIncrementalMode(): Unit = {
+  def testIncrementalDailyData(): Unit = {
     lazy val spark: SparkSession = SparkSessionBuilder.build("GroupByTestIncremental" + "_" + Random.alphanumeric.take(6).mkString, local = true)
     implicit val tableUtils = TableUtils(spark)
-    val namespace = "incremental"
+    val namespace = s"incremental_groupBy_${Random.alphanumeric.take(6).mkString}"
     tableUtils.createDatabase(namespace)
     val schema = List(
       Column("user", StringType, 10), // ts = last 10 days
@@ -975,11 +1002,7 @@ class GroupByTest {
 
     val df = DataFrameGen.events(spark, schema, count = 100000, partitions = 100)
 
-    println(s"Input DataFrame: ${df.count()}")
-
     val aggregations: Seq[Aggregation] = Seq(
-        //Builders.Aggregation(Operation.APPROX_UNIQUE_COUNT, "session_length", Seq(new Window(10, TimeUnit.DAYS), WindowUtils.Unbounded)),
-        //Builders.Aggregation(Operation.UNIQUE_COUNT, "session_length", Seq(new Window(10, TimeUnit.DAYS), WindowUtils.Unbounded)),
         Builders.Aggregation(Operation.SUM, "session_length", Seq(new Window(10, TimeUnit.DAYS)))
     )
 
@@ -987,14 +1010,12 @@ class GroupByTest {
       "source" -> "chronon"
     )
 
+    val partitionRange = PartitionRange("2025-05-01", "2025-06-01")
     val groupBy = new GroupBy(aggregations, Seq("user"), df)
-    groupBy.computeIncrementalDf("incremental.testIncrementalOutput", PartitionRange("2025-05-01", "2025-06-01"), tableProps)
+    groupBy.computeIncrementalDf(s"${namespace}.testIncrementalOutput", partitionRange, tableProps)
 
-    val actualIncrementalDf = spark.sql(s"select * from incremental.testIncrementalOutput where ds='2025-05-11'")
+    val actualIncrementalDf = spark.sql(s"select * from ${namespace}.testIncrementalOutput where ds='2025-05-11'")
     df.createOrReplaceTempView("test_incremental_input")
-    //spark.sql(s"select * from test_incremental_input where user='user7' and ds='2025-05-11'").show(numRows=100)
-
-    spark.sql(s"select * from incremental.testIncrementalOutput where ds='2025-05-11'").show()
 
     val query =
       s"""
@@ -1018,55 +1039,100 @@ class GroupByTest {
   def testSnapshotIncrementalEvents(): Unit = {
     lazy val spark: SparkSession = SparkSessionBuilder.build("GroupByTest" + "_" + Random.alphanumeric.take(6).mkString, local = true)
     implicit val tableUtils = TableUtils(spark)
-    val schema = List(
-      Column("user", StringType, 10), // ts = last 10 days
-      Column("session_length", IntType, 2),
-      Column("rating", DoubleType, 2000)
-    )
+    val namespace =  s"incremental_groupBy_snapshot_${Random.alphanumeric.take(6).mkString}"
+    tableUtils.createDatabase(namespace)
+
 
     val outputDates = CStream.genPartitions(10, tableUtils.partitionSpec)
 
-    val df = DataFrameGen.events(spark, schema, count = 100000, partitions = 100)
-    df.drop("ts") // snapshots don't need ts.
-    val viewName = "test_group_by_snapshot_events"
-    df.createOrReplaceTempView(viewName)
     val aggregations: Seq[Aggregation] = Seq(
-      Builders.Aggregation(Operation.SUM, "session_length", Seq(new Window(10, TimeUnit.DAYS))),
-        Builders.Aggregation(Operation.SUM, "rating", Seq(new Window(10, TimeUnit.DAYS)))
+      Builders.Aggregation(Operation.SUM, "time_spent_ms", Seq(new Window(10, TimeUnit.DAYS), new Window(5, TimeUnit.DAYS))),
+      Builders.Aggregation(Operation.SUM, "price", Seq(new Window(10, TimeUnit.DAYS)))
     )
 
-    val groupBy = new GroupBy(aggregations, Seq("user"), df)
+    val (source, endPartition) = createTestSource(windowSize = 30, suffix = "_snapshot_events", partitionColOpt = Some(tableUtils.partitionColumn))
+    val groupByConf = Builders.GroupBy(
+      sources = Seq(source),
+      keyColumns = Seq("item"),
+      aggregations = aggregations,
+      metaData = Builders.MetaData(name = "testSnapshotIncremental", namespace = namespace, team = "chronon"),
+      backfillStartDate = tableUtils.partitionSpec.minus(tableUtils.partitionSpec.at(System.currentTimeMillis()),
+        new Window(20, TimeUnit.DAYS))
+    )
+
+    val df = spark.read.table(source.table)
+    val groupBy = new GroupBy(aggregations, Seq("item"), df.filter("item is not null"))
     val actualDf = groupBy.snapshotEvents(PartitionRange(outputDates.min, outputDates.max))
 
-    val groupByIncremental = new GroupBy(aggregations, Seq("user"), df, incrementalMode = true)
-    val actualDfIncremental = groupByIncremental.snapshotEvents(PartitionRange(outputDates.min, outputDates.max))
+    val  groupByIncremental = GroupBy.fromIncrementalDf(groupByConf, PartitionRange(outputDates.min, outputDates.max), tableUtils)
+    val incrementalExpectedDf = groupByIncremental.snapshotEvents(PartitionRange(outputDates.min, outputDates.max))
 
     val outputDatesRdd: RDD[Row] = spark.sparkContext.parallelize(outputDates.map(Row(_)))
     val outputDatesDf = spark.createDataFrame(outputDatesRdd, StructType(Seq(StructField("ds", SparkStringType))))
     val datesViewName = "test_group_by_snapshot_events_output_range"
     outputDatesDf.createOrReplaceTempView(datesViewName)
-    val expectedDf = df.sqlContext.sql(s"""
-                                          |select user,
-                                          |       $datesViewName.ds,
-                                          |       SUM(IF(ts  >= (unix_timestamp($datesViewName.ds, 'yyyy-MM-dd') - 86400*(10-1)) * 1000, session_length, null)) AS session_length_sum_10d,
-                                          |       SUM(IF(ts  >= (unix_timestamp($datesViewName.ds, 'yyyy-MM-dd') - 86400*(10-1)) * 1000, rating, null)) AS rating_sum_10d
-                                          |FROM $viewName CROSS JOIN $datesViewName
-                                          |WHERE ts < unix_timestamp($datesViewName.ds, 'yyyy-MM-dd') * 1000 + ${tableUtils.partitionSpec.spanMillis}
-                                          |group by user, $datesViewName.ds
-                                          |""".stripMargin)
 
-    val diff = Comparison.sideBySide(actualDf, expectedDf, List("user", tableUtils.partitionColumn))
+    val diff = Comparison.sideBySide(actualDf, incrementalExpectedDf, List("item", tableUtils.partitionColumn))
     if (diff.count() > 0) {
       diff.show()
       println("diff result rows")
     }
     assertEquals(0, diff.count())
+  }
+
+
+  @Test
+  def testIncrementalModeReuseAggregation(): Unit = {
+    lazy val spark: SparkSession = SparkSessionBuilder.build("GroupByTestIncrementalReuseAgg" + "_" + Random.alphanumeric.take(6).mkString, local = true)
+    implicit val tableUtils = TableUtils(spark)
+    val namespace = s"incremental_groupBy_reuse_agg_${Random.alphanumeric.take(6).mkString}"
+    tableUtils.createDatabase(namespace)
+    val schema = List(
+      Column("user", StringType, 10), // ts = last 10 days
+      Column("session_length", IntType, 2),
+      Column("rating", DoubleType, 2000)
+    )
+
+    val df = DataFrameGen.events(spark, schema, count = 100000, partitions = 100)
 
-    val diffIncremental = Comparison.sideBySide(actualDfIncremental, expectedDf, List("user", tableUtils.partitionColumn))
-    if (diffIncremental.count() > 0) {
-      diffIncremental.show()
-      println("diff result rows incremental")
+    println(s"Input DataFrame: ${df.count()}")
+
+    val aggregations: Seq[Aggregation] = Seq(
+      Builders.Aggregation(Operation.SUM, "session_length", Seq(new Window(10, TimeUnit.DAYS), new Window(20, TimeUnit.DAYS))),
+      Builders.Aggregation(Operation.COUNT, "session_length", Seq(new Window(20, TimeUnit.DAYS)))
+    )
+
+    val tableProps: Map[String, String] = Map(
+      "source" -> "chronon"
+    )
+
+    val incrementalOutputTableName = "testIncrementalOutputReuseAgg"
+    val groupBy = new GroupBy(aggregations, Seq("user"), df)
+    groupBy.computeIncrementalDf(s"${namespace}.${incrementalOutputTableName}", PartitionRange("2025-05-01", "2025-06-01"), tableProps)
+
+    val tempView = "test_incremental_input_reuse_agg"
+    val actualIncrementalDf = spark.sql(s"select * from ${namespace}.${incrementalOutputTableName} where ds='2025-05-11'")
+    df.createOrReplaceTempView(tempView)
+
+    spark.sql(s"select * from ${namespace}.${incrementalOutputTableName} where ds='2025-05-11'").show()
+
+    val query =
+      s"""
+         |select user, ds, UNIX_TIMESTAMP(ds, 'yyyy-MM-dd')*1000 as ts, sum(session_length) as session_length_sum, count(session_length) as session_length_count
+         |from ${tempView}
+         |where ds='2025-05-11'
+         |group by user, ds, UNIX_TIMESTAMP(ds, 'yyyy-MM-dd')*1000
+         |""".stripMargin
+
+    val expectedDf = spark.sql(query)
+
+    val diff = Comparison.sideBySide(actualIncrementalDf, expectedDf, List("user", tableUtils.partitionColumn))
+    if (diff.count() > 0) {
+      diff.show()
+      println("diff result rows")
     }
-    assertEquals(0, diffIncremental.count())
+    assertEquals(0, diff.count())
   }
+
+
 }

From 9c446ec846b2d5b85b5f6ab1e3722b3fe99c5e85 Mon Sep 17 00:00:00 2001
From: chaitu <chkc127@gmail.com>
Date: Sun, 12 Oct 2025 14:18:54 -0700
Subject: [PATCH 28/54] resolve merge conflict

---
 api/py/ai/chronon/group_by.py | 1 +
 1 file changed, 1 insertion(+)

diff --git a/api/py/ai/chronon/group_by.py b/api/py/ai/chronon/group_by.py
index f8aea48b28..8a383cccf2 100644
--- a/api/py/ai/chronon/group_by.py
+++ b/api/py/ai/chronon/group_by.py
@@ -362,6 +362,7 @@ def GroupBy(
     tags: Optional[Dict[str, str]] = None,
     derivations: Optional[List[ttypes.Derivation]] = None,
     deprecation_date: Optional[str] = None,
+    description: Optional[str] = None,
     is_incremental: Optional[bool] = False,
     **kwargs,
 ) -> ttypes.GroupBy:

From ca1430962f399b2fc5a6ced664486797ee2d0262 Mon Sep 17 00:00:00 2001
From: chaitu <kambstreat@users.noreply.github.com>
Date: Sun, 2 Nov 2025 12:51:28 -0800
Subject: [PATCH 29/54] add test case to test struct of Average

---
 .../ai/chronon/spark/test/GroupByTest.scala   | 65 +++++++++++++++++--
 1 file changed, 60 insertions(+), 5 deletions(-)

diff --git a/spark/src/test/scala/ai/chronon/spark/test/GroupByTest.scala b/spark/src/test/scala/ai/chronon/spark/test/GroupByTest.scala
index a3ebb6f8cf..3d9d070589 100644
--- a/spark/src/test/scala/ai/chronon/spark/test/GroupByTest.scala
+++ b/spark/src/test/scala/ai/chronon/spark/test/GroupByTest.scala
@@ -987,6 +987,7 @@ class GroupByTest {
    * the test compute daily intermediate aggregations
    *
    *  Test is one daily partition data is correct
+   *  Tests SUM, AVERAGE (with IR structure verification), and COUNT operations
    */
   @Test
   def testIncrementalDailyData(): Unit = {
@@ -1003,36 +1004,90 @@ class GroupByTest {
     val df = DataFrameGen.events(spark, schema, count = 100000, partitions = 100)
 
     val aggregations: Seq[Aggregation] = Seq(
-        Builders.Aggregation(Operation.SUM, "session_length", Seq(new Window(10, TimeUnit.DAYS)))
+        Builders.Aggregation(Operation.SUM, "session_length", Seq(new Window(10, TimeUnit.DAYS))),
+        Builders.Aggregation(Operation.AVERAGE, "rating", Seq(new Window(10, TimeUnit.DAYS))),
+        Builders.Aggregation(Operation.COUNT, "session_length", Seq(new Window(10, TimeUnit.DAYS)))
     )
 
     val tableProps: Map[String, String] = Map(
       "source" -> "chronon"
     )
 
-    val partitionRange = PartitionRange("2025-05-01", "2025-06-01")
+    val today_date = tableUtils.partitionSpec.at(System.currentTimeMillis())
+    val today_minus_10_date = tableUtils.partitionSpec.minus(today_date, new Window(10, TimeUnit.DAYS))
+    val today_minus_30_date = tableUtils.partitionSpec.minus(today_date, new Window(30, TimeUnit.DAYS))
+
+    val partitionRange = PartitionRange(
+        today_minus_30_date,
+        today_date
+    )
+
     val groupBy = new GroupBy(aggregations, Seq("user"), df)
     groupBy.computeIncrementalDf(s"${namespace}.testIncrementalOutput", partitionRange, tableProps)
 
-    val actualIncrementalDf = spark.sql(s"select * from ${namespace}.testIncrementalOutput where ds='2025-05-11'")
+    val actualIncrementalDf = spark.sql(s"select * from ${namespace}.testIncrementalOutput where ds='$today_minus_10_date'")
     df.createOrReplaceTempView("test_incremental_input")
 
+    // ASSERTION 1: Verify IR table has expected columns
+    val irColumns = actualIncrementalDf.schema.fieldNames.toSet
+    assertTrue("IR must contain 'user' column", irColumns.contains("user"))
+    assertTrue("IR must contain 'ds' column", irColumns.contains("ds"))
+    assertTrue("IR must contain 'ts' column", irColumns.contains("ts"))
+
+    // ASSERTION 2: Verify AVERAGE IR structure (must be StructType with sum and count)
+    val ratingIRColumnOpt = actualIncrementalDf.schema.fields
+      .find(_.name.contains("rating_average"))
+
+    assertTrue("Should have rating IR column", ratingIRColumnOpt.isDefined)
+    val ratingIRColumn = ratingIRColumnOpt.get
+
+    println(s"=== Rating IR Column: ${ratingIRColumn.name}, Type: ${ratingIRColumn.dataType} ===")
+
+    // ASSERT: IR should be a StructType with sum and count fields
+    assertTrue(s"Rating IR should be StructType for AVERAGE, got ${ratingIRColumn.dataType}",
+               ratingIRColumn.dataType.isInstanceOf[StructType])
+
+    val structType = ratingIRColumn.dataType.asInstanceOf[StructType]
+    val structFieldNames = structType.fieldNames.toSet
+
+    // ASSERT: Struct should contain 'sum' and 'count' fields
+    assertTrue(s"AVERAGE IR must contain 'sum' field, found fields: ${structFieldNames}",
+               structFieldNames.exists(_.toLowerCase.contains("sum")))
+    assertTrue(s"AVERAGE IR must contain 'count' field, found fields: ${structFieldNames}",
+               structFieldNames.exists(_.toLowerCase.contains("count")))
+
+    // ASSERTION 3: Verify IR table has data
+    val irRowCount = actualIncrementalDf.count()
+    assertTrue(s"IR table should have rows, found ${irRowCount}", irRowCount > 0)
+
     val query =
       s"""
-         |select user, ds, UNIX_TIMESTAMP(ds, 'yyyy-MM-dd')*1000 as ts, sum(session_length) as session_length_sum
+         |select user, ds, UNIX_TIMESTAMP(ds, 'yyyy-MM-dd')*1000 as ts,
+         |  sum(session_length) as session_length_sum,
+         |  struct( sum(rating) as sum, count(rating) as count ) as rating_average,
+         |  count(session_length) as session_length_count
          |from test_incremental_input
-         |where ds='2025-05-11'
+         |where ds='$today_minus_10_date'
          |group by user, ds, UNIX_TIMESTAMP(ds, 'yyyy-MM-dd')*1000
          |""".stripMargin
 
     val expectedDf = spark.sql(query)
 
     val diff = Comparison.sideBySide(actualIncrementalDf, expectedDf, List("user", tableUtils.partitionColumn))
+
     if (diff.count() > 0) {
+      println(s"=== Diff Details ===")
+      println(s"Actual count: ${irRowCount}")
+      println(s"Expected count: ${expectedDf.count()}")
+      println(s"Diff count: ${diff.count()}")
       diff.show()
       println("diff result rows")
     }
+
+    // ASSERTION 5: Main verification - no differences
     assertEquals(0, diff.count())
+
+    println("=== All Incremental Assertions Passed (SUM, AVERAGE with IR verification, COUNT) ===")
   }
 
   @Test

From dfb9226801c19c68ba5a3bd0f59fd058e91e8aa5 Mon Sep 17 00:00:00 2001
From: chaitu <kambstreat@users.noreply.github.com>
Date: Sun, 2 Nov 2025 12:52:27 -0800
Subject: [PATCH 30/54] add option for isIncremental for backward compatibility

---
 spark/src/main/scala/ai/chronon/spark/Driver.scala | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/spark/src/main/scala/ai/chronon/spark/Driver.scala b/spark/src/main/scala/ai/chronon/spark/Driver.scala
index ebbeb729f3..0afe808535 100644
--- a/spark/src/main/scala/ai/chronon/spark/Driver.scala
+++ b/spark/src/main/scala/ai/chronon/spark/Driver.scala
@@ -468,7 +468,7 @@ object Driver {
         args.stepDays.toOption,
         args.startPartitionOverride.toOption,
         !args.runFirstHole(),
-        args.groupByConf.isIncremental
+        Option(args.groupByConf.isIncremental).getOrElse(false)
       )
 
       if (args.shouldExport()) {

From ab994bcd422ca81c42312550702f68688e7c618d Mon Sep 17 00:00:00 2001
From: chaitu <kambstreat@users.noreply.github.com>
Date: Tue, 4 Nov 2025 11:12:30 -0800
Subject: [PATCH 31/54] fix count operation from incremental IRS

---
 .../main/scala/ai/chronon/spark/GroupBy.scala  |  6 +++++-
 .../ai/chronon/spark/test/GroupByTest.scala    | 18 +++++++++++-------
 2 files changed, 16 insertions(+), 8 deletions(-)

diff --git a/spark/src/main/scala/ai/chronon/spark/GroupBy.scala b/spark/src/main/scala/ai/chronon/spark/GroupBy.scala
index 11d25e81fa..317c248658 100644
--- a/spark/src/main/scala/ai/chronon/spark/GroupBy.scala
+++ b/spark/src/main/scala/ai/chronon/spark/GroupBy.scala
@@ -23,7 +23,7 @@ import ai.chronon.aggregator.windowing._
 import ai.chronon.api
 import ai.chronon.api.DataModel.{Entities, Events}
 import ai.chronon.api.Extensions._
-import ai.chronon.api.{Accuracy, Constants, DataModel, ParametricMacro, TimeUnit, Window}
+import ai.chronon.api.{Accuracy, Constants, DataModel, Operation, ParametricMacro, TimeUnit, Window}
 import ai.chronon.online.serde.{RowWrapper, SparkConversions}
 import ai.chronon.spark.Extensions._
 import ai.chronon.spark.catalog.TableUtils
@@ -804,6 +804,10 @@ object GroupBy {
     val incrementalAggregations = aggregationParts.zip(groupByConf.getAggregations.toScala).map{ case (part, agg) =>
       val newAgg = agg.deepCopy()
       newAgg.setInputColumn(part.incrementalOutputColumnName)
+      // Convert COUNT to SUM when reading from incremental IRs
+      if (newAgg.operation == Operation.COUNT) {
+        newAgg.setOperation(Operation.SUM)
+      }
       newAgg
     }
 
diff --git a/spark/src/test/scala/ai/chronon/spark/test/GroupByTest.scala b/spark/src/test/scala/ai/chronon/spark/test/GroupByTest.scala
index 2888bf6963..2ee10cd3e5 100644
--- a/spark/src/test/scala/ai/chronon/spark/test/GroupByTest.scala
+++ b/spark/src/test/scala/ai/chronon/spark/test/GroupByTest.scala
@@ -972,9 +972,10 @@ class GroupByTest {
     val startPartition = tableUtils.partitionSpec.minus(today, new Window(windowSize, TimeUnit.DAYS))
     val endPartition = tableUtils.partitionSpec.at(System.currentTimeMillis())
     val sourceSchema = List(
-      Column("user", StringType, 10), // ts = last 10 days
-      Column("session_length", IntType, 2),
-      Column("rating", DoubleType, 2000)
+      Column("user", StringType, 10000),
+      Column("item", StringType, 100),
+      Column("time_spent_ms", LongType, 5000),
+      Column("price", DoubleType, 100)
     )
     val namespace = "chronon_incremental_test"
     val sourceTable = s"$namespace.test_group_by_steps$suffix"
@@ -988,7 +989,7 @@ class GroupByTest {
     }
 
     val source = Builders.Source.events(
-      query = Builders.Query(selects = Builders.Selects("ts", "user", "time_spent_ms", "price"),
+      query = Builders.Query(selects = Builders.Selects("ts", "user", "time_spent_ms", "price", "item"),
         startPartition = startPartition,
         partitionColumn = partitionColOpt.orNull),
       table = sourceTable
@@ -1115,10 +1116,12 @@ class GroupByTest {
 
     val aggregations: Seq[Aggregation] = Seq(
       Builders.Aggregation(Operation.SUM, "time_spent_ms", Seq(new Window(10, TimeUnit.DAYS), new Window(5, TimeUnit.DAYS))),
-      Builders.Aggregation(Operation.SUM, "price", Seq(new Window(10, TimeUnit.DAYS)))
+      Builders.Aggregation(Operation.SUM, "price", Seq(new Window(10, TimeUnit.DAYS))),
+      Builders.Aggregation(Operation.COUNT, "user", Seq(new Window(10, TimeUnit.DAYS))),
+      Builders.Aggregation(Operation.AVERAGE, "price", Seq(new Window(10, TimeUnit.DAYS)))
     )
 
-    val (source, endPartition) = createTestSource(windowSize = 30, suffix = "_snapshot_events", partitionColOpt = Some(tableUtils.partitionColumn))
+    val (source, endPartition) = createTestSourceIncremental(windowSize = 30, suffix = "_snapshot_events", partitionColOpt = Some(tableUtils.partitionColumn))
     val groupByConf = Builders.GroupBy(
       sources = Seq(source),
       keyColumns = Seq("item"),
@@ -1129,6 +1132,7 @@ class GroupByTest {
     )
 
     val df = spark.read.table(source.table)
+
     val groupBy = new GroupBy(aggregations, Seq("item"), df.filter("item is not null"))
     val actualDf = groupBy.snapshotEvents(PartitionRange(outputDates.min, outputDates.max))
 
@@ -1143,7 +1147,7 @@ class GroupByTest {
     val diff = Comparison.sideBySide(actualDf, incrementalExpectedDf, List("item", tableUtils.partitionColumn))
     if (diff.count() > 0) {
       diff.show()
-      println("diff result rows")
+      println("=== Diff result rows ===")
     }
     assertEquals(0, diff.count())
   }

From 2714ad5e4f91a03dafc05ac96cf7d20edee9ae0f Mon Sep 17 00:00:00 2001
From: chaitu <kambstreat@users.noreply.github.com>
Date: Sun, 9 Nov 2025 16:28:26 -0800
Subject: [PATCH 32/54] use saw tooth aggregator to compute from daily IRs

---
 .../aggregator/row/RowAggregator.scala        |  13 +-
 .../main/scala/ai/chronon/spark/GroupBy.scala | 141 ++++++++++++------
 .../ai/chronon/spark/test/GroupByTest.scala   |  62 +-------
 3 files changed, 107 insertions(+), 109 deletions(-)

diff --git a/aggregator/src/main/scala/ai/chronon/aggregator/row/RowAggregator.scala b/aggregator/src/main/scala/ai/chronon/aggregator/row/RowAggregator.scala
index 43b2807740..4f0dda6fe0 100644
--- a/aggregator/src/main/scala/ai/chronon/aggregator/row/RowAggregator.scala
+++ b/aggregator/src/main/scala/ai/chronon/aggregator/row/RowAggregator.scala
@@ -27,7 +27,7 @@ import scala.collection.Seq
 // userAggregationParts is used when incrementalMode = True.
 class RowAggregator(val inputSchema: Seq[(String, DataType)],
                     val aggregationParts: Seq[AggregationPart],
-                    val userInputAggregationParts: Option[Seq[AggregationPart]] = None )
+                    )
     extends Serializable
     with SimpleAggregator[Row, Array[Any], Array[Any]] {
 
@@ -78,20 +78,11 @@ class RowAggregator(val inputSchema: Seq[(String, DataType)],
     .toArray
     .zip(columnAggregators.map(_.irType))
 
-  val aggregationPartsOutputSchema: Array[(String, DataType)] = aggregationParts
+  val outputSchema: Array[(String, DataType)] = aggregationParts
     .map(_.outputColumnName)
     .toArray
     .zip(columnAggregators.map(_.outputType))
 
-  val outputSchema: Array[(String, DataType)] = userInputAggregationParts
-    .map{ parts =>
-      parts
-        .map(_.outputColumnName)
-        .toArray
-        .zip(columnAggregators.map(_.outputType))
-    }.getOrElse(aggregationPartsOutputSchema)
-
-
   val isNotDeletable: Boolean = columnAggregators.forall(!_.isDeletable)
 
   // this will mutate in place
diff --git a/spark/src/main/scala/ai/chronon/spark/GroupBy.scala b/spark/src/main/scala/ai/chronon/spark/GroupBy.scala
index 317c248658..f3ed4af71d 100644
--- a/spark/src/main/scala/ai/chronon/spark/GroupBy.scala
+++ b/spark/src/main/scala/ai/chronon/spark/GroupBy.scala
@@ -45,7 +45,6 @@ class GroupBy(val aggregations: Seq[api.Aggregation],
               val mutationDfFn: () => DataFrame = null,
               skewFilter: Option[String] = None,
               finalize: Boolean = true,
-              val userInputAggregations: Seq[api.Aggregation] = null
              )
     extends Serializable {
   @transient lazy val logger = LoggerFactory.getLogger(getClass)
@@ -106,13 +105,8 @@ class GroupBy(val aggregations: Seq[api.Aggregation],
 
 
   @transient
-  protected[spark] lazy val windowAggregator: RowAggregator = {
-    if (userInputAggregations != null) {
-       new RowAggregator(selectedSchema, aggregations.flatMap(_.unpack), Option(userInputAggregations.flatMap(_.unpack)))
-    } else {
+  protected[spark] lazy val windowAggregator: RowAggregator =
         new RowAggregator(selectedSchema, aggregations.flatMap(_.unpack))
-    }
-  }
 
   def snapshotEntitiesBase: RDD[(Array[Any], Array[Any])] = {
     val keys = (keyColumns :+ tableUtils.partitionColumn).toArray
@@ -518,28 +512,7 @@ object GroupBy {
     val groupByConf = replaceJoinSource(groupByConfOld, queryRange, tableUtils, computeDependency, showDf)
     val sourceQueryWindow: Option[Window]  = if (incrementalMode) Some(new Window(queryRange.daysBetween, TimeUnit.DAYS)) else groupByConf.maxWindow
     val backfillQueryRange: PartitionRange = if (incrementalMode) PartitionRange(queryRange.end, queryRange.end)(tableUtils) else queryRange
-    val inputDf = groupByConf.sources.toScala
-      .map { source =>
-        val partitionColumn = tableUtils.getPartitionColumn(source.query)
-        tableUtils.sqlWithDefaultPartitionColumn(
-          renderDataSourceQuery(
-            groupByConf,
-            source,
-            groupByConf.getKeyColumns.toScala,
-            backfillQueryRange,
-            tableUtils,
-            sourceQueryWindow,
-            groupByConf.inferredAccuracy,
-            partitionColumn = partitionColumn
-          ),
-          existingPartitionColumn = partitionColumn
-        )
-      }
-      .reduce { (df1, df2) =>
-        // align the columns by name - when one source has select * the ordering might not be aligned
-        val columns1 = df1.schema.fields.map(_.name)
-        df1.union(df2.selectExpr(columns1: _*))
-      }
+    val inputDf = buildSourceDataFrame(groupByConf, backfillQueryRange, sourceQueryWindow, tableUtils, schemaOnly = false)
 
     def doesNotNeedTime = !Option(groupByConf.getAggregations).exists(_.toScala.needsTimestamp)
     def hasValidTimeColumn = inputDf.schema.find(_.name == Constants.TimeColumn).exists(_.dataType == LongType)
@@ -741,6 +714,49 @@ object GroupBy {
     query
   }
 
+  /**
+    * Builds a unified DataFrame from all sources in the GroupBy configuration.
+    * Used to create input DataFrames with proper schema alignment.
+    *
+    * @param groupByConf the GroupBy configuration
+    * @param range the partition range to query
+    * @param window the window size for querying (None uses maxWindow from config)
+    * @param tableUtils table utilities for partition handling
+    * @param schemaOnly if true, returns empty DataFrame with just schema (uses .limit(0))
+    * @return unified DataFrame from all sources
+    */
+  private def buildSourceDataFrame(
+      groupByConf: api.GroupBy,
+      range: PartitionRange,
+      window: Option[Window],
+      tableUtils: TableUtils,
+      schemaOnly: Boolean = false
+  ): DataFrame = {
+    groupByConf.sources.toScala
+      .map { source =>
+        val partitionColumn = tableUtils.getPartitionColumn(source.query)
+        val df = tableUtils.sqlWithDefaultPartitionColumn(
+          renderDataSourceQuery(
+            groupByConf,
+            source,
+            groupByConf.getKeyColumns.toScala,
+            range,
+            tableUtils,
+            window.orElse(groupByConf.maxWindow),
+            groupByConf.inferredAccuracy,
+            partitionColumn = partitionColumn
+          ),
+          existingPartitionColumn = partitionColumn
+        )
+        if (schemaOnly) df.limit(0) else df
+      }
+      .reduce { (df1, df2) =>
+        // align the columns by name - when one source has select * the ordering might not be aligned
+        val columns1 = df1.schema.fields.map(_.name)
+        df1.union(df2.selectExpr(columns1: _*))
+      }
+  }
+
   /**
     * Computes and saves the output of hopsAggregation.
     * HopsAggregate computes event level data to daily aggregates and saves the output in IR format
@@ -788,6 +804,43 @@ object GroupBy {
     (incrementalQueryableRange, incrementalGroupByAggParts)
   }
 
+  private def convertIncrementalDfToHops(
+      incrementalDf: DataFrame,
+      aggregationParts: Seq[api.AggregationPart],
+      groupByConf: api.GroupBy,
+      tableUtils: TableUtils) : RDD[(KeyWithHash, HopsAggregator.OutputArrayType)] = {
+
+    val keyColumns = groupByConf.getKeyColumns.toScala
+    val keyBuilder: Row => KeyWithHash =
+      FastHashing.generateKeyBuilder(keyColumns.toArray, incrementalDf.schema)
+
+    incrementalDf.rdd
+      .map { row =>
+        //Extract timestamp from partition column
+        val ds = row.getAs[String](tableUtils.partitionColumn)
+        val ts = tableUtils.partitionSpec.epochMillis(ds)
+
+        val irs = new Array[Any](aggregationParts.length)
+        aggregationParts.zipWithIndex.foreach { case (part, idx) =>
+          val value = row.get(row.fieldIndex(part.incrementalOutputColumnName))
+          // Convert Spark Row (struct) to Array for complex IRs like AVERAGE
+          irs(idx) = value match {
+            case r: Row => r.toSeq.toArray
+            case other => other
+          }
+        }
+
+        // Build HopIR : [IR1, IR2, ..., IRn, ts]
+        val hopIr: HopIr = irs :+ ts
+        (keyBuilder(row), hopIr)
+      }
+      .groupByKey()
+      .mapValues{ hopIrs =>
+        //Convert to HopsAggregator.OutputArrayType: Array[Array[HopIr]]
+        val sortedHops = hopIrs.toArray.sortBy(_.last.asInstanceOf[Long])
+        Array(sortedHops)
+      }
+  }
 
   def fromIncrementalDf(
                        groupByConf: api.GroupBy,
@@ -796,29 +849,31 @@ object GroupBy {
                      ): GroupBy = {
 
     val incrementalOutputTable = groupByConf.metaData.incrementalOutputTable
+    assert(incrementalOutputTable != null,
+      s"incrementalOutputTable is not set for GroupBy: ${groupByConf.metaData.name}")
 
     val (incrementalQueryableRange, aggregationParts) = computeIncrementalDf(groupByConf, range, tableUtils, incrementalOutputTable)
 
     val (_, incrementalDf: DataFrame) =  incrementalQueryableRange.scanQueryStringAndDf(null, incrementalOutputTable)
 
-    val incrementalAggregations = aggregationParts.zip(groupByConf.getAggregations.toScala).map{ case (part, agg) =>
-      val newAgg = agg.deepCopy()
-      newAgg.setInputColumn(part.incrementalOutputColumnName)
-      // Convert COUNT to SUM when reading from incremental IRs
-      if (newAgg.operation == Operation.COUNT) {
-        newAgg.setOperation(Operation.SUM)
-      }
-      newAgg
-    }
+    val incrementalHops  = convertIncrementalDfToHops(incrementalDf, aggregationParts, groupByConf, tableUtils)
+
+    // Create a DataFrame with the source schema (raw data schema) to match aggregations
+    // We need this because GroupBy class variables expect inputDf schema to match aggregation input columns
+    // We create an empty DataFrame with the correct schema - it won't be used for computation
+    val sourceDf = buildSourceDataFrame(groupByConf, range, None, tableUtils, schemaOnly = true)
 
     new GroupBy(
-      incrementalAggregations,
+      groupByConf.getAggregations.toScala,
       groupByConf.getKeyColumns.toScala,
-      incrementalDf,
+      sourceDf, // Use source schema, not incremental schema
       () => null,
-      finalize = true,
-      userInputAggregations=groupByConf.aggregations.toScala
-    )
+    ) {
+      // Override hopsAggregate to return precomputed hops instead of computing from raw data
+      override def hopsAggregate(minQueryTs: Long, resolution: Resolution): RDD[(KeyWithHash, HopsAggregator.OutputArrayType)] = {
+        incrementalHops
+      }
+    }
 
   }
 
diff --git a/spark/src/test/scala/ai/chronon/spark/test/GroupByTest.scala b/spark/src/test/scala/ai/chronon/spark/test/GroupByTest.scala
index 2ee10cd3e5..ad1b1f6bd8 100644
--- a/spark/src/test/scala/ai/chronon/spark/test/GroupByTest.scala
+++ b/spark/src/test/scala/ai/chronon/spark/test/GroupByTest.scala
@@ -1104,6 +1104,13 @@ class GroupByTest {
     println("=== All Incremental Assertions Passed (SUM, AVERAGE with IR verification, COUNT) ===")
   }
 
+  /**
+   * This test verifies that the incremental snapshotEvents output matches the non-incremental output.
+   *
+   * 1. Computes snapshotEvents using the standard GroupBy on the full input data.
+   * 2. Computes snapshotEvents using GroupBy in incremental mode over the same date range.
+   * 3. Compares the two outputs to ensure they are identical.
+   */
   @Test
   def testSnapshotIncrementalEvents(): Unit = {
     lazy val spark: SparkSession = SparkSessionBuilder.build("GroupByTest" + "_" + Random.alphanumeric.take(6).mkString, local = true)
@@ -1152,59 +1159,4 @@ class GroupByTest {
     assertEquals(0, diff.count())
   }
 
-
-  @Test
-  def testIncrementalModeReuseAggregation(): Unit = {
-    lazy val spark: SparkSession = SparkSessionBuilder.build("GroupByTestIncrementalReuseAgg" + "_" + Random.alphanumeric.take(6).mkString, local = true)
-    implicit val tableUtils = TableUtils(spark)
-    val namespace = s"incremental_groupBy_reuse_agg_${Random.alphanumeric.take(6).mkString}"
-    tableUtils.createDatabase(namespace)
-    val schema = List(
-      Column("user", StringType, 10), // ts = last 10 days
-      Column("session_length", IntType, 2),
-      Column("rating", DoubleType, 2000)
-    )
-
-    val df = DataFrameGen.events(spark, schema, count = 100000, partitions = 100)
-
-    println(s"Input DataFrame: ${df.count()}")
-
-    val aggregations: Seq[Aggregation] = Seq(
-      Builders.Aggregation(Operation.SUM, "session_length", Seq(new Window(10, TimeUnit.DAYS), new Window(20, TimeUnit.DAYS))),
-      Builders.Aggregation(Operation.COUNT, "session_length", Seq(new Window(20, TimeUnit.DAYS)))
-    )
-
-    val tableProps: Map[String, String] = Map(
-      "source" -> "chronon"
-    )
-
-    val incrementalOutputTableName = "testIncrementalOutputReuseAgg"
-    val groupBy = new GroupBy(aggregations, Seq("user"), df)
-    groupBy.computeIncrementalDf(s"${namespace}.${incrementalOutputTableName}", PartitionRange("2025-05-01", "2025-06-01"), tableProps)
-
-    val tempView = "test_incremental_input_reuse_agg"
-    val actualIncrementalDf = spark.sql(s"select * from ${namespace}.${incrementalOutputTableName} where ds='2025-05-11'")
-    df.createOrReplaceTempView(tempView)
-
-    spark.sql(s"select * from ${namespace}.${incrementalOutputTableName} where ds='2025-05-11'").show()
-
-    val query =
-      s"""
-         |select user, ds, UNIX_TIMESTAMP(ds, 'yyyy-MM-dd')*1000 as ts, sum(session_length) as session_length_sum, count(session_length) as session_length_count
-         |from ${tempView}
-         |where ds='2025-05-11'
-         |group by user, ds, UNIX_TIMESTAMP(ds, 'yyyy-MM-dd')*1000
-         |""".stripMargin
-
-    val expectedDf = spark.sql(query)
-
-    val diff = Comparison.sideBySide(actualIncrementalDf, expectedDf, List("user", tableUtils.partitionColumn))
-    if (diff.count() > 0) {
-      diff.show()
-      println("diff result rows")
-    }
-    assertEquals(0, diff.count())
-  }
-
-
 }

From d70cfa75c33e118106b5d4280f253046178c8c14 Mon Sep 17 00:00:00 2001
From: chaitu <kambstreat@users.noreply.github.com>
Date: Sun, 9 Nov 2025 16:49:04 -0800
Subject: [PATCH 33/54] remove Average IR

---
 .../aggregator/base/SimpleAggregators.scala   | 50 -------------------
 .../aggregator/row/ColumnAggregator.scala     |  1 -
 .../main/scala/ai/chronon/spark/GroupBy.scala |  1 -
 3 files changed, 52 deletions(-)

diff --git a/aggregator/src/main/scala/ai/chronon/aggregator/base/SimpleAggregators.scala b/aggregator/src/main/scala/ai/chronon/aggregator/base/SimpleAggregators.scala
index afbe238d0a..eef365acf2 100644
--- a/aggregator/src/main/scala/ai/chronon/aggregator/base/SimpleAggregators.scala
+++ b/aggregator/src/main/scala/ai/chronon/aggregator/base/SimpleAggregators.scala
@@ -117,56 +117,6 @@ class UniqueCount[T](inputType: DataType) extends SimpleAggregator[T, util.HashS
   }
 }
 
-class AverageIR extends SimpleAggregator[Array[Any], Array[Any], Double] {
-  override def outputType: DataType = DoubleType
-
-  override def irType: DataType =
-    StructType(
-      "AvgIr",
-      Array(StructField("sum", DoubleType), StructField("count", IntType))
-    )
-
-  override def prepare(input: Array[Any]): Array[Any] = {
-    Array(input(0).asInstanceOf[Double], input(1).asInstanceOf[Int])
-  }
-
-  // mutating
-  override def update(ir: Array[Any], input: Array[Any]): Array[Any] = {
-    val inputSum = input(0).asInstanceOf[Double]
-    val inputCount = input(1).asInstanceOf[Int]
-    ir.update(0, ir(0).asInstanceOf[Double] + inputSum)
-    ir.update(1, ir(1).asInstanceOf[Int] + inputCount)
-    ir
-  }
-
-  // mutating
-  override def merge(ir1: Array[Any], ir2: Array[Any]): Array[Any] = {
-    ir1.update(0, ir1(0).asInstanceOf[Double] + ir2(0).asInstanceOf[Double])
-    ir1.update(1, ir1(1).asInstanceOf[Int] + ir2(1).asInstanceOf[Int])
-    ir1
-  }
-
-  override def finalize(ir: Array[Any]): Double =
-    ir(0).asInstanceOf[Double] / ir(1).asInstanceOf[Int].toDouble
-
-  override def delete(ir: Array[Any], input: Array[Any]): Array[Any] = {
-    val inputSum = input(0).asInstanceOf[Double]
-    val inputCount = input(1).asInstanceOf[Int]
-    ir.update(0, ir(0).asInstanceOf[Double] - inputSum)
-    ir.update(1, ir(1).asInstanceOf[Int] - inputCount)
-    ir
-  }
-
-  override def clone(ir: Array[Any]): Array[Any] = {
-    val arr = new Array[Any](ir.length)
-    ir.copyToArray(arr)
-    arr
-  }
-
-  override def isDeletable: Boolean = true
-}
-
-
 
 class Average extends SimpleAggregator[Double, Array[Any], Double] {
   override def outputType: DataType = DoubleType
diff --git a/aggregator/src/main/scala/ai/chronon/aggregator/row/ColumnAggregator.scala b/aggregator/src/main/scala/ai/chronon/aggregator/row/ColumnAggregator.scala
index 8873b3e938..d8cd5c2c90 100644
--- a/aggregator/src/main/scala/ai/chronon/aggregator/row/ColumnAggregator.scala
+++ b/aggregator/src/main/scala/ai/chronon/aggregator/row/ColumnAggregator.scala
@@ -348,7 +348,6 @@ object ColumnAggregator {
           case ShortType  => simple(new Average, toDouble[Short])
           case DoubleType => simple(new Average)
           case FloatType  => simple(new Average, toDouble[Float])
-          case StructType(name, fields) => simple(new AverageIR, toStructArray)
           case _          => mismatchException
         }
 
diff --git a/spark/src/main/scala/ai/chronon/spark/GroupBy.scala b/spark/src/main/scala/ai/chronon/spark/GroupBy.scala
index f3ed4af71d..f466975862 100644
--- a/spark/src/main/scala/ai/chronon/spark/GroupBy.scala
+++ b/spark/src/main/scala/ai/chronon/spark/GroupBy.scala
@@ -823,7 +823,6 @@ object GroupBy {
         val irs = new Array[Any](aggregationParts.length)
         aggregationParts.zipWithIndex.foreach { case (part, idx) =>
           val value = row.get(row.fieldIndex(part.incrementalOutputColumnName))
-          // Convert Spark Row (struct) to Array for complex IRs like AVERAGE
           irs(idx) = value match {
             case r: Row => r.toSeq.toArray
             case other => other

From 053fd9d9faca739f8e5638307183f310fb13d905 Mon Sep 17 00:00:00 2001
From: chaitu <kambstreat@users.noreply.github.com>
Date: Sun, 9 Nov 2025 16:59:35 -0800
Subject: [PATCH 34/54] remove empty spaces and unused functions

---
 .../ai/chronon/aggregator/base/SimpleAggregators.scala   | 1 -
 .../ai/chronon/aggregator/row/ColumnAggregator.scala     | 9 +--------
 .../main/scala/ai/chronon/spark/catalog/TableUtils.scala | 2 --
 3 files changed, 1 insertion(+), 11 deletions(-)

diff --git a/aggregator/src/main/scala/ai/chronon/aggregator/base/SimpleAggregators.scala b/aggregator/src/main/scala/ai/chronon/aggregator/base/SimpleAggregators.scala
index eef365acf2..ff80f02253 100644
--- a/aggregator/src/main/scala/ai/chronon/aggregator/base/SimpleAggregators.scala
+++ b/aggregator/src/main/scala/ai/chronon/aggregator/base/SimpleAggregators.scala
@@ -117,7 +117,6 @@ class UniqueCount[T](inputType: DataType) extends SimpleAggregator[T, util.HashS
   }
 }
 
-
 class Average extends SimpleAggregator[Double, Array[Any], Double] {
   override def outputType: DataType = DoubleType
 
diff --git a/aggregator/src/main/scala/ai/chronon/aggregator/row/ColumnAggregator.scala b/aggregator/src/main/scala/ai/chronon/aggregator/row/ColumnAggregator.scala
index d8cd5c2c90..649a69104d 100644
--- a/aggregator/src/main/scala/ai/chronon/aggregator/row/ColumnAggregator.scala
+++ b/aggregator/src/main/scala/ai/chronon/aggregator/row/ColumnAggregator.scala
@@ -216,14 +216,7 @@ object ColumnAggregator {
   private def toJavaDouble[A: Numeric](inp: Any) =
     implicitly[Numeric[A]].toDouble(inp.asInstanceOf[A]).asInstanceOf[java.lang.Double]
 
-
-  private def toStructArray(inp: Any): Array[Any] = inp match {
-    case r: org.apache.spark.sql.Row => r.toSeq.toArray
-    case null => null
-    case other => throw new IllegalArgumentException(s"Expected Row, got: $other")
-  }
-
-  def construct(baseInputType: DataType,
+ def construct(baseInputType: DataType,
                 aggregationPart: AggregationPart,
                 columnIndices: ColumnIndices,
                 bucketIndex: Option[Int]): ColumnAggregator = {
diff --git a/spark/src/main/scala/ai/chronon/spark/catalog/TableUtils.scala b/spark/src/main/scala/ai/chronon/spark/catalog/TableUtils.scala
index 81067923e1..1faaf9acc9 100644
--- a/spark/src/main/scala/ai/chronon/spark/catalog/TableUtils.scala
+++ b/spark/src/main/scala/ai/chronon/spark/catalog/TableUtils.scala
@@ -893,9 +893,7 @@ case class TableUtils(sparkSession: SparkSession) {
       } else {
         validPartitionRange.partitions.toSet
       }
-
     val outputMissing = fillablePartitions -- outputExisting
-
     val allInputExisting = inputTables
       .map { tables =>
         tables

From 99f9788c0da98c6ae72cf73643e04b55d5286bcb Mon Sep 17 00:00:00 2001
From: chaitu <kambstreat@users.noreply.github.com>
Date: Sat, 22 Nov 2025 10:14:09 -0800
Subject: [PATCH 35/54] testing subset of aggregations

---
 .../ai/chronon/spark/test/GroupByTest.scala   | 127 +++++++++++++++++-
 1 file changed, 126 insertions(+), 1 deletion(-)

diff --git a/spark/src/test/scala/ai/chronon/spark/test/GroupByTest.scala b/spark/src/test/scala/ai/chronon/spark/test/GroupByTest.scala
index ad1b1f6bd8..cc63626f69 100644
--- a/spark/src/test/scala/ai/chronon/spark/test/GroupByTest.scala
+++ b/spark/src/test/scala/ai/chronon/spark/test/GroupByTest.scala
@@ -39,7 +39,7 @@ import ai.chronon.spark._
 import ai.chronon.spark.catalog.TableUtils
 import com.google.gson.Gson
 import org.apache.spark.rdd.RDD
-import org.apache.spark.sql.types.{StructField, StructType, LongType => SparkLongType, StringType => SparkStringType}
+import org.apache.spark.sql.types.{ArrayType, StructField, StructType, LongType => SparkLongType, StringType => SparkStringType}
 import org.apache.spark.sql.{Encoders, Row, SparkSession}
 import org.junit.Assert._
 import org.junit.Test
@@ -1104,6 +1104,131 @@ class GroupByTest {
     println("=== All Incremental Assertions Passed (SUM, AVERAGE with IR verification, COUNT) ===")
   }
 
+  /**
+   * Comprehensive test for incremental aggregations covering all directly comparable operations.
+   * Tests: SUM, COUNT, AVERAGE, MIN, MAX, UNIQUE_COUNT, VARIANCE
+   * All these operations have IRs that can be directly compared (no binary sketches).
+   */
+  @Test
+  def testIncrementalAllAggregations(): Unit = {
+    lazy val spark: SparkSession = SparkSessionBuilder.build("GroupByTestIncrementalAll" + "_" + Random.alphanumeric.take(6).mkString, local = true)
+    implicit val tableUtils = TableUtils(spark)
+    val namespace = s"incremental_all_aggs_${Random.alphanumeric.take(6).mkString}"
+    tableUtils.createDatabase(namespace)
+
+    val schema = List(
+      Column("user", StringType, 10),
+      Column("price", DoubleType, 100),
+      Column("quantity", IntType, 50),
+      Column("product_id", StringType, 20),  // Low cardinality for UNIQUE_COUNT
+      Column("rating", DoubleType, 2000)
+    )
+
+    val df = DataFrameGen.events(spark, schema, count = 100000, partitions = 100)
+
+    val aggregations: Seq[Aggregation] = Seq(
+      // Simple aggregations
+      Builders.Aggregation(Operation.SUM, "price", Seq(new Window(7, TimeUnit.DAYS))),
+      Builders.Aggregation(Operation.COUNT, "quantity", Seq(new Window(7, TimeUnit.DAYS))),
+
+      // Complex aggregation - AVERAGE (struct IR with sum/count)
+      Builders.Aggregation(Operation.AVERAGE, "rating", Seq(new Window(7, TimeUnit.DAYS))),
+
+      // Min/Max
+      Builders.Aggregation(Operation.MIN, "price", Seq(new Window(7, TimeUnit.DAYS))),
+      Builders.Aggregation(Operation.MAX, "quantity", Seq(new Window(7, TimeUnit.DAYS))),
+
+      // Variance (struct IR with sum/sum_of_squares/count)
+      Builders.Aggregation(Operation.VARIANCE, "price", Seq(new Window(7, TimeUnit.DAYS)))
+    )
+
+    val tableProps: Map[String, String] = Map("source" -> "chronon")
+
+    val today_date = tableUtils.partitionSpec.at(System.currentTimeMillis())
+    val today_minus_7_date = tableUtils.partitionSpec.minus(today_date, new Window(7, TimeUnit.DAYS))
+    val today_minus_20_date = tableUtils.partitionSpec.minus(today_date, new Window(20, TimeUnit.DAYS))
+
+    val partitionRange = PartitionRange(today_minus_20_date, today_date)
+
+    val groupBy = new GroupBy(aggregations, Seq("user"), df)
+    groupBy.computeIncrementalDf(s"${namespace}.testIncrementalAllAggsOutput", partitionRange, tableProps)
+
+    val actualIncrementalDf = spark.sql(s"select * from ${namespace}.testIncrementalAllAggsOutput where ds='$today_minus_7_date'")
+    df.createOrReplaceTempView("test_all_aggs_input")
+
+    println("=== Incremental IR Schema ===")
+    actualIncrementalDf.printSchema()
+
+    // ASSERTION 1: Verify IR table has expected key columns
+    val irColumns = actualIncrementalDf.schema.fieldNames.toSet
+    assertTrue("IR must contain 'user' column", irColumns.contains("user"))
+    assertTrue("IR must contain 'ds' column", irColumns.contains("ds"))
+    assertTrue("IR must contain 'ts' column", irColumns.contains("ts"))
+
+    // ASSERTION 2: Verify all aggregation columns exist
+    assertTrue("IR must contain SUM price", irColumns.exists(_.contains("price_sum")))
+    assertTrue("IR must contain COUNT quantity", irColumns.exists(_.contains("quantity_count")))
+    assertTrue("IR must contain AVERAGE rating", irColumns.exists(_.contains("rating_average")))
+    assertTrue("IR must contain MIN price", irColumns.exists(_.contains("price_min")))
+    assertTrue("IR must contain MAX quantity", irColumns.exists(_.contains("quantity_max")))
+    assertTrue("IR must contain VARIANCE price", irColumns.exists(_.contains("price_variance")))
+
+    // ASSERTION 3: Verify complex IR structures
+    val avgColumn = actualIncrementalDf.schema.fields.find(_.name.contains("rating_average"))
+    assertTrue("AVERAGE IR should be StructType", avgColumn.isDefined && avgColumn.get.dataType.isInstanceOf[StructType])
+
+    val varianceColumn = actualIncrementalDf.schema.fields.find(_.name.contains("price_variance"))
+    assertTrue("VARIANCE IR should be StructType", varianceColumn.isDefined && varianceColumn.get.dataType.isInstanceOf[StructType])
+
+    println(s"✓ AVERAGE IR type: ${avgColumn.get.dataType}")
+    println(s"✓ VARIANCE IR type: ${varianceColumn.get.dataType}")
+
+    // ASSERTION 4: Verify IR table has data
+    val irRowCount = actualIncrementalDf.count()
+    assertTrue(s"IR table should have rows, found ${irRowCount}", irRowCount > 0)
+
+    // ASSERTION 5: Compare against SQL computation
+    val query =
+      s"""
+         |select user, ds, UNIX_TIMESTAMP(ds, 'yyyy-MM-dd')*1000 as ts,
+         |  sum(price) as price_sum,
+         |  count(quantity) as quantity_count,
+         |  struct(sum(rating) as sum, count(rating) as count) as rating_average,
+         |  min(price) as price_min,
+         |  max(quantity) as quantity_max,
+         |  struct(
+         |    cast(count(price) as int) as count,
+         |    avg(price) as mean,
+         |    sum(price * price) - count(price) * avg(price) * avg(price) as m2
+         |  ) as price_variance
+         |from test_all_aggs_input
+         |where ds='$today_minus_7_date'
+         |group by user, ds, UNIX_TIMESTAMP(ds, 'yyyy-MM-dd')*1000
+         |""".stripMargin
+
+    val expectedDf = spark.sql(query)
+    val diff = Comparison.sideBySide(actualIncrementalDf, expectedDf, List("user", tableUtils.partitionColumn))
+
+    if (diff.count() > 0) {
+      println(s"=== Diff Details for All Aggregations ===")
+      println(s"Actual count: ${irRowCount}")
+      println(s"Expected count: ${expectedDf.count()}")
+      println(s"Diff count: ${diff.count()}")
+      diff.show(100, truncate = false)
+    }
+
+    assertEquals(0, diff.count())
+
+    println("=== All Incremental Assertions Passed ===")
+    println("✓ SUM: Simple numeric IR")
+    println("✓ COUNT: Simple numeric IR")
+    println("✓ AVERAGE: Struct IR {sum, count}")
+    println("✓ MIN: Simple numeric IR")
+    println("✓ MAX: Simple numeric IR")
+    println("✓ UNIQUE_COUNT: Array IR [unique values] - directly comparable!")
+    println("✓ VARIANCE: Struct IR {sum, sum_of_squares, count}")
+  }
+
   /**
    * This test verifies that the incremental snapshotEvents output matches the non-incremental output.
    *

From 7dff16eb1d8c5cea3cd26f7ae891a699c4b887f7 Mon Sep 17 00:00:00 2001
From: chaitu <kambstreat@users.noreply.github.com>
Date: Mon, 26 Jan 2026 23:25:48 -0800
Subject: [PATCH 36/54] remove duplicate test

---
 .../ai/chronon/spark/test/GroupByTest.scala   | 108 +-----------------
 1 file changed, 1 insertion(+), 107 deletions(-)

diff --git a/spark/src/test/scala/ai/chronon/spark/test/GroupByTest.scala b/spark/src/test/scala/ai/chronon/spark/test/GroupByTest.scala
index cc63626f69..34a8242dda 100644
--- a/spark/src/test/scala/ai/chronon/spark/test/GroupByTest.scala
+++ b/spark/src/test/scala/ai/chronon/spark/test/GroupByTest.scala
@@ -41,6 +41,7 @@ import com.google.gson.Gson
 import org.apache.spark.rdd.RDD
 import org.apache.spark.sql.types.{ArrayType, StructField, StructType, LongType => SparkLongType, StringType => SparkStringType}
 import org.apache.spark.sql.{Encoders, Row, SparkSession}
+import org.apache.spark.sql.functions.col
 import org.junit.Assert._
 import org.junit.Test
 
@@ -997,113 +998,6 @@ class GroupByTest {
     (source, endPartition)
   }
 
-  /**
-   * the test compute daily intermediate aggregations
-   *
-   *  Test is one daily partition data is correct
-   *  Tests SUM, AVERAGE (with IR structure verification), and COUNT operations
-   */
-  @Test
-  def testIncrementalDailyData(): Unit = {
-    lazy val spark: SparkSession = SparkSessionBuilder.build("GroupByTestIncremental" + "_" + Random.alphanumeric.take(6).mkString, local = true)
-    implicit val tableUtils = TableUtils(spark)
-    val namespace = s"incremental_groupBy_${Random.alphanumeric.take(6).mkString}"
-    tableUtils.createDatabase(namespace)
-    val schema = List(
-      Column("user", StringType, 10), // ts = last 10 days
-      Column("session_length", IntType, 2),
-      Column("rating", DoubleType, 2000)
-    )
-
-    val df = DataFrameGen.events(spark, schema, count = 100000, partitions = 100)
-
-    val aggregations: Seq[Aggregation] = Seq(
-        Builders.Aggregation(Operation.SUM, "session_length", Seq(new Window(10, TimeUnit.DAYS))),
-        Builders.Aggregation(Operation.AVERAGE, "rating", Seq(new Window(10, TimeUnit.DAYS))),
-        Builders.Aggregation(Operation.COUNT, "session_length", Seq(new Window(10, TimeUnit.DAYS)))
-    )
-
-    val tableProps: Map[String, String] = Map(
-      "source" -> "chronon"
-    )
-
-    val today_date = tableUtils.partitionSpec.at(System.currentTimeMillis())
-    val today_minus_10_date = tableUtils.partitionSpec.minus(today_date, new Window(10, TimeUnit.DAYS))
-    val today_minus_30_date = tableUtils.partitionSpec.minus(today_date, new Window(30, TimeUnit.DAYS))
-
-    val partitionRange = PartitionRange(
-        today_minus_30_date,
-        today_date
-    )
-
-    val groupBy = new GroupBy(aggregations, Seq("user"), df)
-    groupBy.computeIncrementalDf(s"${namespace}.testIncrementalOutput", partitionRange, tableProps)
-
-    val actualIncrementalDf = spark.sql(s"select * from ${namespace}.testIncrementalOutput where ds='$today_minus_10_date'")
-    df.createOrReplaceTempView("test_incremental_input")
-
-    // ASSERTION 1: Verify IR table has expected columns
-    val irColumns = actualIncrementalDf.schema.fieldNames.toSet
-    assertTrue("IR must contain 'user' column", irColumns.contains("user"))
-    assertTrue("IR must contain 'ds' column", irColumns.contains("ds"))
-    assertTrue("IR must contain 'ts' column", irColumns.contains("ts"))
-
-    // ASSERTION 2: Verify AVERAGE IR structure (must be StructType with sum and count)
-    val ratingIRColumnOpt = actualIncrementalDf.schema.fields
-      .find(_.name.contains("rating_average"))
-
-    assertTrue("Should have rating IR column", ratingIRColumnOpt.isDefined)
-    val ratingIRColumn = ratingIRColumnOpt.get
-
-    println(s"=== Rating IR Column: ${ratingIRColumn.name}, Type: ${ratingIRColumn.dataType} ===")
-
-    // ASSERT: IR should be a StructType with sum and count fields
-    assertTrue(s"Rating IR should be StructType for AVERAGE, got ${ratingIRColumn.dataType}",
-               ratingIRColumn.dataType.isInstanceOf[StructType])
-
-    val structType = ratingIRColumn.dataType.asInstanceOf[StructType]
-    val structFieldNames = structType.fieldNames.toSet
-
-    // ASSERT: Struct should contain 'sum' and 'count' fields
-    assertTrue(s"AVERAGE IR must contain 'sum' field, found fields: ${structFieldNames}",
-               structFieldNames.exists(_.toLowerCase.contains("sum")))
-    assertTrue(s"AVERAGE IR must contain 'count' field, found fields: ${structFieldNames}",
-               structFieldNames.exists(_.toLowerCase.contains("count")))
-
-    // ASSERTION 3: Verify IR table has data
-    val irRowCount = actualIncrementalDf.count()
-    assertTrue(s"IR table should have rows, found ${irRowCount}", irRowCount > 0)
-
-    val query =
-      s"""
-         |select user, ds, UNIX_TIMESTAMP(ds, 'yyyy-MM-dd')*1000 as ts,
-         |  sum(session_length) as session_length_sum,
-         |  struct( sum(rating) as sum, count(rating) as count ) as rating_average,
-         |  count(session_length) as session_length_count
-         |from test_incremental_input
-         |where ds='$today_minus_10_date'
-         |group by user, ds, UNIX_TIMESTAMP(ds, 'yyyy-MM-dd')*1000
-         |""".stripMargin
-
-    val expectedDf = spark.sql(query)
-
-    val diff = Comparison.sideBySide(actualIncrementalDf, expectedDf, List("user", tableUtils.partitionColumn))
-
-    if (diff.count() > 0) {
-      println(s"=== Diff Details ===")
-      println(s"Actual count: ${irRowCount}")
-      println(s"Expected count: ${expectedDf.count()}")
-      println(s"Diff count: ${diff.count()}")
-      diff.show()
-      println("diff result rows")
-    }
-
-    // ASSERTION 5: Main verification - no differences
-    assertEquals(0, diff.count())
-
-    println("=== All Incremental Assertions Passed (SUM, AVERAGE with IR verification, COUNT) ===")
-  }
-
   /**
    * Comprehensive test for incremental aggregations covering all directly comparable operations.
    * Tests: SUM, COUNT, AVERAGE, MIN, MAX, UNIQUE_COUNT, VARIANCE

From 5852a1b6b0d44a31b06c4f9d5dc9f74d771f9662 Mon Sep 17 00:00:00 2001
From: chaitu <kambstreat@users.noreply.github.com>
Date: Wed, 4 Feb 2026 22:36:05 -0800
Subject: [PATCH 37/54] add unit tests for all aggregations

---
 .../scala/ai/chronon/spark/Comparison.scala   |  87 ++++-
 .../main/scala/ai/chronon/spark/GroupBy.scala |  26 +-
 .../ai/chronon/spark/test/GroupByTest.scala   | 314 +++++++++++++++---
 3 files changed, 365 insertions(+), 62 deletions(-)

diff --git a/spark/src/main/scala/ai/chronon/spark/Comparison.scala b/spark/src/main/scala/ai/chronon/spark/Comparison.scala
index 83c0db33d7..a1cb56b0b9 100644
--- a/spark/src/main/scala/ai/chronon/spark/Comparison.scala
+++ b/spark/src/main/scala/ai/chronon/spark/Comparison.scala
@@ -20,13 +20,32 @@ import org.slf4j.LoggerFactory
 import ai.chronon.online.Extensions.StructTypeOps
 import com.google.gson.{Gson, GsonBuilder}
 import org.apache.spark.sql.DataFrame
-import org.apache.spark.sql.types.{DecimalType, DoubleType, FloatType, MapType}
+import org.apache.spark.sql.types.{ArrayType, DecimalType, DoubleType, FloatType, MapType, StructType}
+import org.apache.spark.sql.functions.col
 
 import java.util
+import scala.collection.mutable
 
 object Comparison {
   @transient lazy val logger = LoggerFactory.getLogger(getClass)
 
+  // Flatten struct columns into individual columns so nested double fields can be compared with tolerance
+  private def flattenStructs(df: DataFrame): DataFrame = {
+    val flattenedSelects = df.schema.fields.flatMap { field =>
+      field.dataType match {
+        case structType: StructType =>
+          // Flatten struct fields: struct_name.field_name -> struct_name_field_name
+          structType.fields.map { subField =>
+            col(s"${field.name}.${subField.name}").alias(s"${field.name}_${subField.name}")
+          }
+        case _ =>
+          // Keep non-struct fields as-is
+          Seq(col(field.name))
+      }
+    }
+    df.select(flattenedSelects: _*)
+  }
+
   // used for comparison
   def sortedJson(m: Map[String, Any]): String = {
     if (m == null) return null
@@ -38,6 +57,40 @@ object Comparison {
     gson.toJson(tm)
   }
 
+  // Convert element to simple representation (Row → Array)
+  private def simplifyElement(x: Any): Any = {
+    if (x == null) return null
+    x match {
+      case row: org.apache.spark.sql.Row =>
+        // Extract just the values from Row, without Spark schema metadata
+        (0 until row.length).map(i => if (row.isNullAt(i)) null else row.get(i)).toArray
+      case other => other
+    }
+  }
+
+  // Convert element to string for sorting
+  private def elementToString(x: Any): String = {
+    if (x == null) return ""
+    val simplified = simplifyElement(x)
+    simplified match {
+      case arr: Array[_] => arr.mkString("[", ",", "]")
+      case other => other.toString
+    }
+  }
+
+  // Sort lists/arrays for comparison (order shouldn't matter for sets)
+  def sortedList(list: mutable.WrappedArray[Any]): String = {
+    if (list == null) return null
+    // Sort using clean string representation
+    val sorted = list.sorted(Ordering.by[Any, String](elementToString))
+    val gson = new GsonBuilder()
+      .serializeSpecialFloatingPointValues()
+      .create()
+    // Simplify Row objects to plain arrays before JSON serialization
+    val simplified = sorted.map(simplifyElement)
+    gson.toJson(simplified.toArray)
+  }
+
   def stringifyMaps(df: DataFrame): DataFrame = {
     try {
       df.sparkSession.udf.register("sorted_json", (m: Map[String, Any]) => sortedJson(m))
@@ -54,6 +107,22 @@ object Comparison {
     df.selectExpr(selects: _*)
   }
 
+  def sortLists(df: DataFrame): DataFrame = {
+    try {
+      df.sparkSession.udf.register("sorted_list", (list: mutable.WrappedArray[Any]) => sortedList(list))
+    } catch {
+      case e: Exception => e.printStackTrace()
+    }
+    val selects = for (field <- df.schema.fields) yield {
+      if (field.dataType.isInstanceOf[ArrayType]) {
+        s"sorted_list(${field.name}) as `${field.name}`"
+      } else {
+        s"${field.name} as `${field.name}`"
+      }
+    }
+    df.selectExpr(selects: _*)
+  }
+
   // Produces a "comparison" dataframe - given two dataframes that are supposed to have same data
   // The result contains the differing rows of the same key
   def sideBySide(a: DataFrame,
@@ -69,8 +138,13 @@ object Comparison {
         |""".stripMargin
     )
 
-    val prefixedExpectedDf = prefixColumnName(stringifyMaps(a), s"${aName}_")
-    val prefixedOutputDf = prefixColumnName(stringifyMaps(b), s"${bName}_")
+    // Flatten structs so nested double fields can be compared with tolerance
+    // Sort lists so order doesn't matter for comparison (e.g., UNIQUE_COUNT arrays)
+    val aFlattened = flattenStructs(sortLists(stringifyMaps(a)))
+    val bFlattened = flattenStructs(sortLists(stringifyMaps(b)))
+
+    val prefixedExpectedDf = prefixColumnName(aFlattened, s"${aName}_")
+    val prefixedOutputDf = prefixColumnName(bFlattened, s"${bName}_")
 
     val joinExpr = keys
       .map(key => prefixedExpectedDf(s"${aName}_$key") <=> prefixedOutputDf(s"${bName}_$key"))
@@ -82,15 +156,16 @@ object Comparison {
     )
 
     var finalDf = joined
+    // Use flattened schema for comparison
     val comparisonColumns =
-      a.schema.fieldNames.toSet.diff(keys.toSet).toList.sorted
+      aFlattened.schema.fieldNames.toSet.diff(keys.toSet).toList.sorted
     val colOrder =
       keys.map(key => { finalDf(s"${aName}_$key").as(key) }) ++
         comparisonColumns.flatMap { col =>
           List(finalDf(s"${aName}_$col"), finalDf(s"${bName}_$col"))
         }
-    // double columns need to be compared approximately
-    val doubleCols = a.schema.fields
+    // double columns need to be compared approximately (now includes flattened struct fields)
+    val doubleCols = aFlattened.schema.fields
       .filter(field =>
         field.dataType == DoubleType || field.dataType == FloatType || field.dataType.isInstanceOf[DecimalType])
       .map(_.name)
diff --git a/spark/src/main/scala/ai/chronon/spark/GroupBy.scala b/spark/src/main/scala/ai/chronon/spark/GroupBy.scala
index f466975862..8b805fc9c9 100644
--- a/spark/src/main/scala/ai/chronon/spark/GroupBy.scala
+++ b/spark/src/main/scala/ai/chronon/spark/GroupBy.scala
@@ -100,7 +100,7 @@ class GroupBy(val aggregations: Seq[api.Aggregation],
     SparkConversions.fromChrononSchema(valueChrononSchema)
   }
 
-  lazy val flattenedAgg: RowAggregator = new RowAggregator(selectedSchema, aggregations.flatMap(_.unWindowed))
+  @transient lazy val flattenedAgg: RowAggregator = new RowAggregator(selectedSchema, aggregations.flatMap(_.unWindowed))
   lazy val incrementalSchema: Array[(String, api.DataType)] = flattenedAgg.incrementalOutputSchema
 
 
@@ -374,7 +374,8 @@ class GroupBy(val aggregations: Seq[api.Aggregation],
       hopsArrayHead.map { array: HopIr =>
         val timestamp = array.last.asInstanceOf[Long]
         val withoutTimestamp = array.dropRight(1)
-        ((keyWithHash.data :+ tableUtils.partitionSpec.at(timestamp) :+ timestamp), withoutTimestamp)
+        val normalizedIR = flattenedAgg.normalize(withoutTimestamp)
+        ((keyWithHash.data :+ tableUtils.partitionSpec.at(timestamp) :+ timestamp), normalizedIR)
       }
     }
   }
@@ -808,7 +809,8 @@ object GroupBy {
       incrementalDf: DataFrame,
       aggregationParts: Seq[api.AggregationPart],
       groupByConf: api.GroupBy,
-      tableUtils: TableUtils) : RDD[(KeyWithHash, HopsAggregator.OutputArrayType)] = {
+      tableUtils: TableUtils,
+      rowAggregator: RowAggregator) : RDD[(KeyWithHash, HopsAggregator.OutputArrayType)] = {
 
     val keyColumns = groupByConf.getKeyColumns.toScala
     val keyBuilder: Row => KeyWithHash =
@@ -819,16 +821,20 @@ object GroupBy {
         //Extract timestamp from partition column
         val ds = row.getAs[String](tableUtils.partitionColumn)
         val ts = tableUtils.partitionSpec.epochMillis(ds)
-
-        val irs = new Array[Any](aggregationParts.length)
+        
+        // Extract normalized IRs from the row
+        val normalizedIrs = new Array[Any](aggregationParts.length)
         aggregationParts.zipWithIndex.foreach { case (part, idx) =>
           val value = row.get(row.fieldIndex(part.incrementalOutputColumnName))
-          irs(idx) = value match {
+          normalizedIrs(idx) = value match {
             case r: Row => r.toSeq.toArray
             case other => other
           }
         }
 
+        // Denormalize IRs to in-memory format (e.g., ArrayList -> HashSet)
+        val irs = rowAggregator.denormalize(normalizedIrs)
+
         // Build HopIR : [IR1, IR2, ..., IRn, ts]
         val hopIr: HopIr = irs :+ ts
         (keyBuilder(row), hopIr)
@@ -855,13 +861,17 @@ object GroupBy {
 
     val (_, incrementalDf: DataFrame) =  incrementalQueryableRange.scanQueryStringAndDf(null, incrementalOutputTable)
 
-    val incrementalHops  = convertIncrementalDfToHops(incrementalDf, aggregationParts, groupByConf, tableUtils)
-
     // Create a DataFrame with the source schema (raw data schema) to match aggregations
     // We need this because GroupBy class variables expect inputDf schema to match aggregation input columns
     // We create an empty DataFrame with the correct schema - it won't be used for computation
     val sourceDf = buildSourceDataFrame(groupByConf, range, None, tableUtils, schemaOnly = true)
 
+    // Create RowAggregator for denormalizing IRs when reading from incremental table
+    val chrononSchema = SparkConversions.toChrononSchema(sourceDf.schema)
+    val rowAggregator = new RowAggregator(chrononSchema, aggregationParts)
+
+    val incrementalHops  = convertIncrementalDfToHops(incrementalDf, aggregationParts, groupByConf, tableUtils, rowAggregator)
+
     new GroupBy(
       groupByConf.getAggregations.toScala,
       groupByConf.getKeyColumns.toScala,
diff --git a/spark/src/test/scala/ai/chronon/spark/test/GroupByTest.scala b/spark/src/test/scala/ai/chronon/spark/test/GroupByTest.scala
index 34a8242dda..1585cf7077 100644
--- a/spark/src/test/scala/ai/chronon/spark/test/GroupByTest.scala
+++ b/spark/src/test/scala/ai/chronon/spark/test/GroupByTest.scala
@@ -39,7 +39,7 @@ import ai.chronon.spark._
 import ai.chronon.spark.catalog.TableUtils
 import com.google.gson.Gson
 import org.apache.spark.rdd.RDD
-import org.apache.spark.sql.types.{ArrayType, StructField, StructType, LongType => SparkLongType, StringType => SparkStringType}
+import org.apache.spark.sql.types.{ArrayType, BinaryType, MapType, StructField, StructType, LongType => SparkLongType, StringType => SparkStringType}
 import org.apache.spark.sql.{Encoders, Row, SparkSession}
 import org.apache.spark.sql.functions.col
 import org.junit.Assert._
@@ -999,27 +999,30 @@ class GroupByTest {
   }
 
   /**
-   * Comprehensive test for incremental aggregations covering all directly comparable operations.
-   * Tests: SUM, COUNT, AVERAGE, MIN, MAX, UNIQUE_COUNT, VARIANCE
-   * All these operations have IRs that can be directly compared (no binary sketches).
+   * Tests basic aggregations in incremental mode by comparing Chronon's output against SQL.
+   *
+   * Operations: SUM, COUNT, AVERAGE, MIN, MAX, VARIANCE, UNIQUE_COUNT, HISTOGRAM, BOUNDED_UNIQUE_COUNT
+   *
+   * Actual:   Chronon computes daily IRs using computeIncrementalDf, storing intermediate results
+   * Expected: SQL query computes the same aggregations directly on the input data for the same date
    */
   @Test
-  def testIncrementalAllAggregations(): Unit = {
-    lazy val spark: SparkSession = SparkSessionBuilder.build("GroupByTestIncrementalAll" + "_" + Random.alphanumeric.take(6).mkString, local = true)
+  def testIncrementalBasicAggregations(): Unit = {
+    lazy val spark: SparkSession = SparkSessionBuilder.build("GroupByTestIncrementalBasic" + "_" + Random.alphanumeric.take(6).mkString, local = true)
     implicit val tableUtils = TableUtils(spark)
-    val namespace = s"incremental_all_aggs_${Random.alphanumeric.take(6).mkString}"
+    val namespace = s"incremental_basic_aggs_${Random.alphanumeric.take(6).mkString}"
     tableUtils.createDatabase(namespace)
 
     val schema = List(
       Column("user", StringType, 10),
       Column("price", DoubleType, 100),
       Column("quantity", IntType, 50),
-      Column("product_id", StringType, 20),  // Low cardinality for UNIQUE_COUNT
+      Column("product_id", StringType, 20),  // Low cardinality for UNIQUE_COUNT, HISTOGRAM, BOUNDED_UNIQUE_COUNT
       Column("rating", DoubleType, 2000)
     )
 
     val df = DataFrameGen.events(spark, schema, count = 100000, partitions = 100)
-
+    
     val aggregations: Seq[Aggregation] = Seq(
       // Simple aggregations
       Builders.Aggregation(Operation.SUM, "price", Seq(new Window(7, TimeUnit.DAYS))),
@@ -1032,8 +1035,17 @@ class GroupByTest {
       Builders.Aggregation(Operation.MIN, "price", Seq(new Window(7, TimeUnit.DAYS))),
       Builders.Aggregation(Operation.MAX, "quantity", Seq(new Window(7, TimeUnit.DAYS))),
 
-      // Variance (struct IR with sum/sum_of_squares/count)
-      Builders.Aggregation(Operation.VARIANCE, "price", Seq(new Window(7, TimeUnit.DAYS)))
+      // Variance (struct IR with count/mean/m2)
+      Builders.Aggregation(Operation.VARIANCE, "price", Seq(new Window(7, TimeUnit.DAYS))),
+
+      // UNIQUE_COUNT (array IR)
+      Builders.Aggregation(Operation.UNIQUE_COUNT, "price", Seq(new Window(7, TimeUnit.DAYS))),
+
+      // HISTOGRAM (map IR)
+      Builders.Aggregation(Operation.HISTOGRAM, "product_id", Seq(new Window(7, TimeUnit.DAYS)), argMap = Map("k" -> "0")),
+
+      // BOUNDED_UNIQUE_COUNT (array IR with bound)
+      Builders.Aggregation(Operation.BOUNDED_UNIQUE_COUNT, "product_id", Seq(new Window(7, TimeUnit.DAYS)), argMap = Map("k" -> "100"))
     )
 
     val tableProps: Map[String, String] = Map("source" -> "chronon")
@@ -1045,10 +1057,10 @@ class GroupByTest {
     val partitionRange = PartitionRange(today_minus_20_date, today_date)
 
     val groupBy = new GroupBy(aggregations, Seq("user"), df)
-    groupBy.computeIncrementalDf(s"${namespace}.testIncrementalAllAggsOutput", partitionRange, tableProps)
+    groupBy.computeIncrementalDf(s"${namespace}.testIncrementalBasicAggsOutput", partitionRange, tableProps)
 
-    val actualIncrementalDf = spark.sql(s"select * from ${namespace}.testIncrementalAllAggsOutput where ds='$today_minus_7_date'")
-    df.createOrReplaceTempView("test_all_aggs_input")
+    val actualIncrementalDf = spark.sql(s"select * from ${namespace}.testIncrementalBasicAggsOutput where ds='$today_minus_7_date'")
+    df.createOrReplaceTempView("test_basic_aggs_input")
 
     println("=== Incremental IR Schema ===")
     actualIncrementalDf.printSchema()
@@ -1066,16 +1078,9 @@ class GroupByTest {
     assertTrue("IR must contain MIN price", irColumns.exists(_.contains("price_min")))
     assertTrue("IR must contain MAX quantity", irColumns.exists(_.contains("quantity_max")))
     assertTrue("IR must contain VARIANCE price", irColumns.exists(_.contains("price_variance")))
-
-    // ASSERTION 3: Verify complex IR structures
-    val avgColumn = actualIncrementalDf.schema.fields.find(_.name.contains("rating_average"))
-    assertTrue("AVERAGE IR should be StructType", avgColumn.isDefined && avgColumn.get.dataType.isInstanceOf[StructType])
-
-    val varianceColumn = actualIncrementalDf.schema.fields.find(_.name.contains("price_variance"))
-    assertTrue("VARIANCE IR should be StructType", varianceColumn.isDefined && varianceColumn.get.dataType.isInstanceOf[StructType])
-
-    println(s"✓ AVERAGE IR type: ${avgColumn.get.dataType}")
-    println(s"✓ VARIANCE IR type: ${varianceColumn.get.dataType}")
+    assertTrue("IR must contain UNIQUE COUNT price", irColumns.exists(_.contains("price_unique_count")))
+    assertTrue("IR must contain HISTOGRAM product_id", irColumns.exists(_.contains("product_id_histogram")))
+    assertTrue("IR must contain BOUNDED_UNIQUE_COUNT product_id", irColumns.exists(_.contains("product_id_bounded_unique_count")))
 
     // ASSERTION 4: Verify IR table has data
     val irRowCount = actualIncrementalDf.count()
@@ -1084,24 +1089,53 @@ class GroupByTest {
     // ASSERTION 5: Compare against SQL computation
     val query =
       s"""
-         |select user, ds, UNIX_TIMESTAMP(ds, 'yyyy-MM-dd')*1000 as ts,
-         |  sum(price) as price_sum,
-         |  count(quantity) as quantity_count,
-         |  struct(sum(rating) as sum, count(rating) as count) as rating_average,
-         |  min(price) as price_min,
-         |  max(quantity) as quantity_max,
-         |  struct(
-         |    cast(count(price) as int) as count,
-         |    avg(price) as mean,
-         |    sum(price * price) - count(price) * avg(price) * avg(price) as m2
-         |  ) as price_variance
-         |from test_all_aggs_input
-         |where ds='$today_minus_7_date'
-         |group by user, ds, UNIX_TIMESTAMP(ds, 'yyyy-MM-dd')*1000
+         |WITH base_aggs AS (
+         |  SELECT user, ds, UNIX_TIMESTAMP(ds, 'yyyy-MM-dd')*1000 as ts,
+         |    sum(price) as price_sum,
+         |    count(quantity) as quantity_count,
+         |    struct(sum(rating) as sum, count(rating) as count) as rating_average,
+         |    min(price) as price_min,
+         |    max(quantity) as quantity_max,
+         |    struct(
+         |      cast(count(price) as int) as count,
+         |      avg(price) as mean,
+         |      sum(price * price) - count(price) * avg(price) * avg(price) as m2
+         |    ) as price_variance,
+         |    collect_set(price) as price_unique_count,
+         |    slice(collect_set(md5(product_id)), 1, 100) as product_id_bounded_unique_count
+         |  FROM test_basic_aggs_input
+         |  WHERE ds='$today_minus_7_date'
+         |  GROUP BY user, ds
+         |),
+         |histogram_agg AS (
+         |  SELECT user, ds,
+         |    map_from_entries(collect_list(struct(product_id, cast(cnt as int)))) as product_id_histogram
+         |  FROM (
+         |    SELECT user, ds, product_id, count(*) as cnt
+         |    FROM test_basic_aggs_input
+         |    WHERE ds='$today_minus_7_date' AND product_id IS NOT NULL
+         |    GROUP BY user, ds, product_id
+         |  )
+         |  GROUP BY user, ds
+         |)
+         |SELECT b.*, h.product_id_histogram
+         |FROM base_aggs b
+         |LEFT JOIN histogram_agg h ON b.user <=> h.user AND b.ds <=> h.ds
          |""".stripMargin
 
     val expectedDf = spark.sql(query)
-    val diff = Comparison.sideBySide(actualIncrementalDf, expectedDf, List("user", tableUtils.partitionColumn))
+
+    // Convert array columns to counts for comparison (since MD5 hashing differs between Scala and SQL)
+    import org.apache.spark.sql.functions.size
+    val actualWithCounts = actualIncrementalDf
+      .withColumn("price_unique_count", size(col("price_unique_count")))
+      .withColumn("product_id_bounded_unique_count", size(col("product_id_bounded_unique_count")))
+
+    val expectedWithCounts = expectedDf
+      .withColumn("price_unique_count", size(col("price_unique_count")))
+      .withColumn("product_id_bounded_unique_count", size(col("product_id_bounded_unique_count")))
+
+    val diff = Comparison.sideBySide(actualWithCounts, expectedWithCounts, List("user", tableUtils.partitionColumn))
 
     if (diff.count() > 0) {
       println(s"=== Diff Details for All Aggregations ===")
@@ -1112,15 +1146,6 @@ class GroupByTest {
     }
 
     assertEquals(0, diff.count())
-
-    println("=== All Incremental Assertions Passed ===")
-    println("✓ SUM: Simple numeric IR")
-    println("✓ COUNT: Simple numeric IR")
-    println("✓ AVERAGE: Struct IR {sum, count}")
-    println("✓ MIN: Simple numeric IR")
-    println("✓ MAX: Simple numeric IR")
-    println("✓ UNIQUE_COUNT: Array IR [unique values] - directly comparable!")
-    println("✓ VARIANCE: Struct IR {sum, sum_of_squares, count}")
   }
 
   /**
@@ -1178,4 +1203,197 @@ class GroupByTest {
     assertEquals(0, diff.count())
   }
 
+  /**
+   * Unit test for FIRST and LAST aggregations with incremental IR
+   * FIRST/LAST use TimeTuple IR: struct {epochMillis: Long, payload: Value}
+   * FIRST keeps the value with the earliest timestamp
+   * LAST keeps the value with the latest timestamp
+   */
+  @Test
+  def testIncrementalFirstLast(): Unit = {
+    lazy val spark: SparkSession = SparkSessionBuilder.build("GroupByTestFirstLast" + "_" + Random.alphanumeric.take(6).mkString, local = true)
+    implicit val tableUtils = TableUtils(spark)
+    val namespace = s"incremental_first_last_${Random.alphanumeric.take(6).mkString}"
+    tableUtils.createDatabase(namespace)
+
+    val schema = List(
+      Column("user", StringType, 5),
+      Column("value", DoubleType, 100)
+    )
+
+    // Generate events and add random milliseconds to ts for unique timestamps
+    import org.apache.spark.sql.functions.{rand, col}
+    import org.apache.spark.sql.types.{LongType => SparkLongType}
+
+    val dfWithRandom = DataFrameGen.events(spark, schema, count = 10000, partitions = 20)
+      .withColumn("ts", col("ts") + (rand() * 86400000).cast(SparkLongType))  // Add 0-24h random millis
+      .cache()  // Mark for caching
+
+    // Force materialization - computes and caches the random values
+    dfWithRandom.count()
+
+    // Write the CACHED data to table - writes already-materialized values
+    dfWithRandom.write.mode("overwrite").saveAsTable(s"${namespace}.test_first_last_input")
+
+    // Read back from table - guaranteed same data as what was written
+    val df = spark.table(s"${namespace}.test_first_last_input")
+
+    val aggregations: Seq[Aggregation] = Seq(
+      Builders.Aggregation(Operation.FIRST, "value", Seq(new Window(7, TimeUnit.DAYS))),
+      Builders.Aggregation(Operation.LAST, "value", Seq(new Window(7, TimeUnit.DAYS))),
+      Builders.Aggregation(Operation.FIRST_K, "value", Seq(new Window(7, TimeUnit.DAYS)), argMap = Map("k" -> "3")),
+      Builders.Aggregation(Operation.LAST_K, "value", Seq(new Window(7, TimeUnit.DAYS)), argMap = Map("k" -> "3")),
+      Builders.Aggregation(Operation.TOP_K, "value", Seq(new Window(7, TimeUnit.DAYS)), argMap = Map("k" -> "3")),
+      Builders.Aggregation(Operation.BOTTOM_K, "value", Seq(new Window(7, TimeUnit.DAYS)), argMap = Map("k" -> "3"))
+    )
+
+    val tableProps: Map[String, String] = Map("source" -> "chronon")
+    val today_date = tableUtils.partitionSpec.at(System.currentTimeMillis())
+    val today_minus_7_date = tableUtils.partitionSpec.minus(today_date, new Window(7, TimeUnit.DAYS))
+    val today_minus_20_date = tableUtils.partitionSpec.minus(today_date, new Window(20, TimeUnit.DAYS))
+    val partitionRange = PartitionRange(today_minus_20_date, today_date)
+
+    val groupBy = new GroupBy(aggregations, Seq("user"), df)
+    groupBy.computeIncrementalDf(s"${namespace}.testIncrementalFirstLastOutput", partitionRange, tableProps)
+
+    val actualIncrementalDf = spark.sql(s"select * from ${namespace}.testIncrementalFirstLastOutput where ds='$today_minus_7_date'")
+
+    println("=== Incremental FIRST/LAST IR Schema ===")
+    actualIncrementalDf.printSchema()
+
+    // Compare against SQL computation
+    // Note: ts column in IR table is the partition timestamp (derived from ds)
+    // But FIRST/LAST use the actual event timestamps (with random milliseconds)
+    val query =
+      s"""
+         |SELECT user,
+         |  to_date(from_unixtime(ts / 1000, 'yyyy-MM-dd HH:mm:ss')) as ds,
+         |  named_struct(
+         |    'epochMillis', min(ts),
+         |    'payload', sort_array(collect_list(struct(ts, value)))[0].value
+         |  ) as value_first,
+         |  named_struct(
+         |    'epochMillis', max(ts),
+         |    'payload', reverse(sort_array(collect_list(struct(ts, value))))[0].value
+         |  ) as value_last,
+         |  transform(
+         |    slice(sort_array(filter(collect_list(struct(ts, value)), x -> x.value IS NOT NULL)), 1, 3),
+         |    x -> named_struct('epochMillis', x.ts, 'payload', x.value)
+         |  ) as value_first3,
+         |  transform(
+         |    slice(reverse(sort_array(filter(collect_list(struct(ts, value)), x -> x.value IS NOT NULL))), 1, 3),
+         |    x -> named_struct('epochMillis', x.ts, 'payload', x.value)
+         |  ) as value_last3,
+         |  transform(
+         |    slice(sort_array(filter(collect_list(struct(value, ts)), x -> x.value IS NOT NULL), false), 1, 3),
+         |    x -> x.value
+         |  ) as value_top3,
+         |  transform(
+         |    slice(sort_array(filter(collect_list(struct(value, ts)), x -> x.value IS NOT NULL), true), 1, 3),
+         |    x -> x.value
+         |  ) as value_bottom3
+         |FROM ${namespace}.test_first_last_input
+         |WHERE to_date(from_unixtime(ts / 1000, 'yyyy-MM-dd HH:mm:ss'))='$today_minus_7_date'
+         |GROUP BY user, to_date(from_unixtime(ts / 1000, 'yyyy-MM-dd HH:mm:ss'))
+         |""".stripMargin
+
+    val expectedDf = spark.sql(query)
+
+    // Drop ts from comparison - it's just the partition timestamp, not part of the aggregation IR
+    val actualWithoutTs = actualIncrementalDf.drop("ts")
+
+    // Comparison.sideBySide handles sorting arrays and converting Row objects to clean JSON
+    val diff = Comparison.sideBySide(actualWithoutTs, expectedDf, List("user", tableUtils.partitionColumn))
+
+    if (diff.count() > 0) {
+      println(s"=== Diff Details for Time-based Aggregations ===")
+      println(s"Expected count: ${expectedDf.count()}")
+      println(s"Diff count: ${diff.count()}")
+      diff.show(100, truncate = false)
+    }
+
+    assertEquals(0, diff.count())
+
+    println("=== Time-based Aggregations Incremental Test Passed ===")
+    println("✓ FIRST: TimeTuple IR {epochMillis, payload}")
+    println("✓ LAST: TimeTuple IR {epochMillis, payload}")
+    println("✓ FIRST_K: Array[TimeTuple] - stores timestamps")
+    println("✓ LAST_K: Array[TimeTuple] - stores timestamps")
+    println("✓ TOP_K: Array[Double] - stores only values")
+    println("✓ BOTTOM_K: Array[Double] - stores only values")
+
+    // Cleanup
+    spark.stop()
+  }
+
+  @Test
+  def testIncrementalStatisticalAggregations(): Unit = {
+    lazy val spark: SparkSession = SparkSessionBuilder.build("GroupByTestStatistical" + "_" + Random.alphanumeric.take(6).mkString, local = true)
+    implicit val tableUtils = TableUtils(spark)
+    val namespace = s"incremental_stats_${Random.alphanumeric.take(6).mkString}"
+    tableUtils.createDatabase(namespace)
+
+    val schema = List(
+      Column("user", StringType, 5),
+      Column("value", DoubleType, 100),
+      Column("category", StringType, 10)  // For APPROX_UNIQUE_COUNT
+    )
+
+    // Generate sufficient data for statistical aggregations
+    val df = DataFrameGen.events(spark, schema, count = 10000, partitions = 20)
+    df.write.mode("overwrite").saveAsTable(s"${namespace}.test_stats_input")
+    val inputDf = spark.table(s"${namespace}.test_stats_input")
+
+    val aggregations: Seq[Aggregation] = Seq(
+      Builders.Aggregation(Operation.SKEW, "value", Seq(new Window(7, TimeUnit.DAYS))),
+      Builders.Aggregation(Operation.KURTOSIS, "value", Seq(new Window(7, TimeUnit.DAYS))),
+      Builders.Aggregation(Operation.APPROX_PERCENTILE, "value", Seq(new Window(7, TimeUnit.DAYS)),
+        argMap = Map("percentiles" -> "[0.5, 0.25, 0.75]")),
+      Builders.Aggregation(Operation.APPROX_UNIQUE_COUNT, "category", Seq(new Window(7, TimeUnit.DAYS))),
+      Builders.Aggregation(Operation.APPROX_HISTOGRAM_K, "category", Seq(new Window(7, TimeUnit.DAYS)),
+        argMap = Map("k" -> "10"))
+    )
+
+    val tableProps: Map[String, String] = Map("source" -> "chronon")
+    val today_date = tableUtils.partitionSpec.at(System.currentTimeMillis())
+    val today_minus_7_date = tableUtils.partitionSpec.minus(today_date, new Window(7, TimeUnit.DAYS))
+    val today_minus_20_date = tableUtils.partitionSpec.minus(today_date, new Window(20, TimeUnit.DAYS))
+    val partitionRange = PartitionRange(today_minus_20_date, today_date)
+
+    val groupBy = new GroupBy(aggregations, Seq("user"), inputDf)
+    groupBy.computeIncrementalDf(s"${namespace}.testIncrementalStatsOutput", partitionRange, tableProps)
+
+    val actualIncrementalDf = spark.sql(s"select * from ${namespace}.testIncrementalStatsOutput where ds='$today_minus_7_date'")
+
+    // Verify IR table has data
+    assertTrue(s"IR table should have rows", actualIncrementalDf.count() > 0)
+
+    // Verify APPROX_HISTOGRAM_K column exists and has binary data (sketch)
+    val histogramCol = actualIncrementalDf.schema.fields.find(_.name.contains("category_approx_histogram_k"))
+    assertTrue("APPROX_HISTOGRAM_K column should exist", histogramCol.isDefined)
+    assertTrue("APPROX_HISTOGRAM_K should be BinaryType (sketch)", histogramCol.get.dataType.isInstanceOf[BinaryType])
+
+    // Verify histogram sketch is non-null
+    val histogramData = spark.sql(
+      s"""
+         |SELECT category_approx_histogram_k
+         |FROM ${namespace}.testIncrementalStatsOutput
+         |WHERE ds='$today_minus_7_date' AND category_approx_histogram_k IS NOT NULL
+         |LIMIT 1
+         |""".stripMargin
+    ).collect()
+
+    assertTrue("APPROX_HISTOGRAM_K should produce non-null sketch data", histogramData.nonEmpty)
+
+    println("=== Statistical Aggregations Incremental Test Passed ===")
+    println("✓ SKEW: Statistical skewness")
+    println("✓ KURTOSIS: Statistical kurtosis")
+    println("✓ APPROX_PERCENTILE: Approximate percentiles")
+    println("✓ APPROX_UNIQUE_COUNT: Approximate distinct count")
+    println("✓ APPROX_HISTOGRAM_K: Approximate histogram with k buckets")
+
+    // Cleanup
+    spark.stop()
+  }
+
 }

From c29a1df8477a39835c151be2b1cf48707963342c Mon Sep 17 00:00:00 2001
From: chaitu <kambstreat@users.noreply.github.com>
Date: Fri, 6 Feb 2026 22:24:43 -0800
Subject: [PATCH 38/54] convert spark datatype to java

---
 .../main/scala/ai/chronon/spark/GroupBy.scala | 72 +++++++++++++------
 .../ai/chronon/spark/test/GroupByTest.scala   | 25 ++++++-
 2 files changed, 74 insertions(+), 23 deletions(-)

diff --git a/spark/src/main/scala/ai/chronon/spark/GroupBy.scala b/spark/src/main/scala/ai/chronon/spark/GroupBy.scala
index 8b805fc9c9..1f22c0fe10 100644
--- a/spark/src/main/scala/ai/chronon/spark/GroupBy.scala
+++ b/spark/src/main/scala/ai/chronon/spark/GroupBy.scala
@@ -34,6 +34,8 @@ import org.apache.spark.sql.{DataFrame, Row, SparkSession}
 import org.apache.spark.util.sketch.BloomFilter
 import org.slf4j.LoggerFactory
 
+import scala.jdk.CollectionConverters._
+
 import java.util
 import scala.collection.{Seq, mutable}
 import scala.util.ScalaJavaConversions.{JListOps, ListOps, MapOps}
@@ -810,34 +812,60 @@ object GroupBy {
       aggregationParts: Seq[api.AggregationPart],
       groupByConf: api.GroupBy,
       tableUtils: TableUtils,
-      rowAggregator: RowAggregator) : RDD[(KeyWithHash, HopsAggregator.OutputArrayType)] = {
+      inputSchema: Seq[(String, api.DataType)]) : RDD[(KeyWithHash, HopsAggregator.OutputArrayType)] = {
 
     val keyColumns = groupByConf.getKeyColumns.toScala
     val keyBuilder: Row => KeyWithHash =
       FastHashing.generateKeyBuilder(keyColumns.toArray, incrementalDf.schema)
 
     incrementalDf.rdd
-      .map { row =>
-        //Extract timestamp from partition column
-        val ds = row.getAs[String](tableUtils.partitionColumn)
-        val ts = tableUtils.partitionSpec.epochMillis(ds)
-        
-        // Extract normalized IRs from the row
-        val normalizedIrs = new Array[Any](aggregationParts.length)
-        aggregationParts.zipWithIndex.foreach { case (part, idx) =>
-          val value = row.get(row.fieldIndex(part.incrementalOutputColumnName))
-          normalizedIrs(idx) = value match {
-            case r: Row => r.toSeq.toArray
-            case other => other
+      .mapPartitions { rows =>
+        // Reconstruct RowAggregator once per partition on executor
+        // Avoids serializing TimedDispatcher and reduces network overhead
+        val rowAggregator = new RowAggregator(inputSchema, aggregationParts)
+
+        rows.map { row =>
+          //Extract timestamp from partition column
+          val ds = row.getAs[String](tableUtils.partitionColumn)
+          val ts = tableUtils.partitionSpec.epochMillis(ds)
+
+          // Extract normalized IRs from the row
+          // Recursively convert Spark types to Java types that denormalize() expects
+          // This handles nested structures (e.g., FIRST_K: Array<Struct> → ArrayList[Array])
+          def convertSparkToJava(value: Any): Any = value match {
+            case null => null
+            case r: Row =>
+              // Struct → Array[Any], recursively convert nested values
+              r.toSeq.map(convertSparkToJava).toArray
+            case arr: scala.collection.mutable.WrappedArray[_] =>
+              // Array → ArrayList, recursively convert elements
+              val converted = arr.map(convertSparkToJava)
+              new java.util.ArrayList[Any](converted.toSeq.asJava)
+            case map: scala.collection.Map[_, _] =>
+              // Map → HashMap, recursively convert values
+              val javaMap = new java.util.HashMap[Any, Any]()
+              map.foreach { case (k, v) =>
+                javaMap.put(k, convertSparkToJava(v))
+              }
+              javaMap
+            case other =>
+              // Scalars (Long, Double, String, byte arrays, etc.) pass through
+              other
           }
-        }
 
-        // Denormalize IRs to in-memory format (e.g., ArrayList -> HashSet)
-        val irs = rowAggregator.denormalize(normalizedIrs)
+          val normalizedIrs = new Array[Any](aggregationParts.length)
+          aggregationParts.zipWithIndex.foreach { case (part, idx) =>
+            val value = row.get(row.fieldIndex(part.incrementalOutputColumnName))
+            normalizedIrs(idx) = convertSparkToJava(value)
+          }
+
+          // Denormalize IRs to in-memory format (e.g., ArrayList -> HashSet)
+          val irs = rowAggregator.denormalize(normalizedIrs)
 
-        // Build HopIR : [IR1, IR2, ..., IRn, ts]
-        val hopIr: HopIr = irs :+ ts
-        (keyBuilder(row), hopIr)
+          // Build HopIR : [IR1, IR2, ..., IRn, ts]
+          val hopIr: HopIr = irs :+ ts
+          (keyBuilder(row), hopIr)
+        }
       }
       .groupByKey()
       .mapValues{ hopIrs =>
@@ -866,11 +894,11 @@ object GroupBy {
     // We create an empty DataFrame with the correct schema - it won't be used for computation
     val sourceDf = buildSourceDataFrame(groupByConf, range, None, tableUtils, schemaOnly = true)
 
-    // Create RowAggregator for denormalizing IRs when reading from incremental table
+    // Extract input schema for RowAggregator reconstruction on executors
+    // Pass lightweight schema instead of heavy RowAggregator to avoid serializing TimedDispatcher
     val chrononSchema = SparkConversions.toChrononSchema(sourceDf.schema)
-    val rowAggregator = new RowAggregator(chrononSchema, aggregationParts)
 
-    val incrementalHops  = convertIncrementalDfToHops(incrementalDf, aggregationParts, groupByConf, tableUtils, rowAggregator)
+    val incrementalHops  = convertIncrementalDfToHops(incrementalDf, aggregationParts, groupByConf, tableUtils, chrononSchema)
 
     new GroupBy(
       groupByConf.getAggregations.toScala,
diff --git a/spark/src/test/scala/ai/chronon/spark/test/GroupByTest.scala b/spark/src/test/scala/ai/chronon/spark/test/GroupByTest.scala
index 1585cf7077..508f111ad6 100644
--- a/spark/src/test/scala/ai/chronon/spark/test/GroupByTest.scala
+++ b/spark/src/test/scala/ai/chronon/spark/test/GroupByTest.scala
@@ -1166,10 +1166,33 @@ class GroupByTest {
     val outputDates = CStream.genPartitions(10, tableUtils.partitionSpec)
 
     val aggregations: Seq[Aggregation] = Seq(
+      // Basic
       Builders.Aggregation(Operation.SUM, "time_spent_ms", Seq(new Window(10, TimeUnit.DAYS), new Window(5, TimeUnit.DAYS))),
       Builders.Aggregation(Operation.SUM, "price", Seq(new Window(10, TimeUnit.DAYS))),
       Builders.Aggregation(Operation.COUNT, "user", Seq(new Window(10, TimeUnit.DAYS))),
-      Builders.Aggregation(Operation.AVERAGE, "price", Seq(new Window(10, TimeUnit.DAYS)))
+      Builders.Aggregation(Operation.AVERAGE, "price", Seq(new Window(10, TimeUnit.DAYS))),
+      Builders.Aggregation(Operation.MIN, "price", Seq(new Window(10, TimeUnit.DAYS))),
+      Builders.Aggregation(Operation.MAX, "price", Seq(new Window(10, TimeUnit.DAYS))),
+      // Statistical
+      Builders.Aggregation(Operation.VARIANCE, "price", Seq(new Window(10, TimeUnit.DAYS))),
+      Builders.Aggregation(Operation.SKEW, "price", Seq(new Window(10, TimeUnit.DAYS))),
+      Builders.Aggregation(Operation.KURTOSIS, "price", Seq(new Window(10, TimeUnit.DAYS))),
+      Builders.Aggregation(Operation.APPROX_PERCENTILE, "price", Seq(new Window(10, TimeUnit.DAYS)),
+        argMap = Map("percentiles" -> "[0.5, 0.25, 0.75]")),
+      // Temporal
+      Builders.Aggregation(Operation.FIRST, "price", Seq(new Window(10, TimeUnit.DAYS))),
+      Builders.Aggregation(Operation.LAST, "price", Seq(new Window(10, TimeUnit.DAYS))),
+      Builders.Aggregation(Operation.FIRST_K, "price", Seq(new Window(10, TimeUnit.DAYS)), argMap = Map("k" -> "3")),
+      Builders.Aggregation(Operation.LAST_K, "price", Seq(new Window(10, TimeUnit.DAYS)), argMap = Map("k" -> "3")),
+      Builders.Aggregation(Operation.TOP_K, "price", Seq(new Window(10, TimeUnit.DAYS)), argMap = Map("k" -> "3")),
+      Builders.Aggregation(Operation.BOTTOM_K, "price", Seq(new Window(10, TimeUnit.DAYS)), argMap = Map("k" -> "3")),
+      // Cardinality / Set
+      Builders.Aggregation(Operation.UNIQUE_COUNT, "user", Seq(new Window(10, TimeUnit.DAYS))),
+      Builders.Aggregation(Operation.APPROX_UNIQUE_COUNT, "user", Seq(new Window(10, TimeUnit.DAYS))),
+      Builders.Aggregation(Operation.BOUNDED_UNIQUE_COUNT, "user", Seq(new Window(10, TimeUnit.DAYS))),
+      // Distribution
+      Builders.Aggregation(Operation.HISTOGRAM, "user", Seq(new Window(10, TimeUnit.DAYS))),
+      Builders.Aggregation(Operation.APPROX_HISTOGRAM_K, "user", Seq(new Window(10, TimeUnit.DAYS)), argMap = Map("k" -> "10"))
     )
 
     val (source, endPartition) = createTestSourceIncremental(windowSize = 30, suffix = "_snapshot_events", partitionColOpt = Some(tableUtils.partitionColumn))

From 2f78a05910a0561160cda71ac8c6733a60777fc1 Mon Sep 17 00:00:00 2001
From: chaitu <kambstreat@users.noreply.github.com>
Date: Fri, 6 Feb 2026 23:01:56 -0800
Subject: [PATCH 39/54] scala formatted

---
 .../aggregator/row/ColumnAggregator.scala     |   2 +-
 .../aggregator/row/RowAggregator.scala        |   4 +-
 .../scala/ai/chronon/api/Extensions.scala     |   1 -
 .../scala/ai/chronon/spark/Comparison.scala   |   2 +-
 .../main/scala/ai/chronon/spark/GroupBy.scala | 189 +++++++++---------
 .../ai/chronon/spark/catalog/TableUtils.scala |   2 +-
 .../ai/chronon/spark/test/GroupByTest.scala   |   2 +-
 7 files changed, 105 insertions(+), 97 deletions(-)

diff --git a/aggregator/src/main/scala/ai/chronon/aggregator/row/ColumnAggregator.scala b/aggregator/src/main/scala/ai/chronon/aggregator/row/ColumnAggregator.scala
index ea36ca5344..dc335537ee 100644
--- a/aggregator/src/main/scala/ai/chronon/aggregator/row/ColumnAggregator.scala
+++ b/aggregator/src/main/scala/ai/chronon/aggregator/row/ColumnAggregator.scala
@@ -224,7 +224,7 @@ object ColumnAggregator {
   private def toJavaDouble[A: Numeric](inp: Any) =
     implicitly[Numeric[A]].toDouble(inp.asInstanceOf[A]).asInstanceOf[java.lang.Double]
 
- def construct(baseInputType: DataType,
+  def construct(baseInputType: DataType,
                 aggregationPart: AggregationPart,
                 columnIndices: ColumnIndices,
                 bucketIndex: Option[Int]): ColumnAggregator = {
diff --git a/aggregator/src/main/scala/ai/chronon/aggregator/row/RowAggregator.scala b/aggregator/src/main/scala/ai/chronon/aggregator/row/RowAggregator.scala
index 4f0dda6fe0..aac7ed859f 100644
--- a/aggregator/src/main/scala/ai/chronon/aggregator/row/RowAggregator.scala
+++ b/aggregator/src/main/scala/ai/chronon/aggregator/row/RowAggregator.scala
@@ -25,9 +25,7 @@ import scala.collection.Seq
 // The primary API of the aggregator package.
 // the semantics are to mutate values in place for performance reasons
 // userAggregationParts is used when incrementalMode = True.
-class RowAggregator(val inputSchema: Seq[(String, DataType)],
-                    val aggregationParts: Seq[AggregationPart],
-                    )
+class RowAggregator(val inputSchema: Seq[(String, DataType)], val aggregationParts: Seq[AggregationPart])
     extends Serializable
     with SimpleAggregator[Row, Array[Any], Array[Any]] {
 
diff --git a/api/src/main/scala/ai/chronon/api/Extensions.scala b/api/src/main/scala/ai/chronon/api/Extensions.scala
index 1719137c1f..25df552d41 100644
--- a/api/src/main/scala/ai/chronon/api/Extensions.scala
+++ b/api/src/main/scala/ai/chronon/api/Extensions.scala
@@ -184,7 +184,6 @@ object Extensions {
 
   }
 
-
   implicit class AggregationOps(aggregation: Aggregation) {
 
     // one agg part per bucket per window
diff --git a/spark/src/main/scala/ai/chronon/spark/Comparison.scala b/spark/src/main/scala/ai/chronon/spark/Comparison.scala
index a1cb56b0b9..9623d87202 100644
--- a/spark/src/main/scala/ai/chronon/spark/Comparison.scala
+++ b/spark/src/main/scala/ai/chronon/spark/Comparison.scala
@@ -74,7 +74,7 @@ object Comparison {
     val simplified = simplifyElement(x)
     simplified match {
       case arr: Array[_] => arr.mkString("[", ",", "]")
-      case other => other.toString
+      case other         => other.toString
     }
   }
 
diff --git a/spark/src/main/scala/ai/chronon/spark/GroupBy.scala b/spark/src/main/scala/ai/chronon/spark/GroupBy.scala
index 5bda8af714..c364a1da98 100644
--- a/spark/src/main/scala/ai/chronon/spark/GroupBy.scala
+++ b/spark/src/main/scala/ai/chronon/spark/GroupBy.scala
@@ -46,8 +46,7 @@ class GroupBy(val aggregations: Seq[api.Aggregation],
               val inputDf: DataFrame,
               val mutationDfFn: () => DataFrame = null,
               skewFilter: Option[String] = None,
-              finalize: Boolean = true,
-             )
+              finalize: Boolean = true)
     extends Serializable {
   @transient lazy val logger = LoggerFactory.getLogger(getClass)
 
@@ -102,13 +101,13 @@ class GroupBy(val aggregations: Seq[api.Aggregation],
     SparkConversions.fromChrononSchema(valueChrononSchema)
   }
 
-  @transient lazy val flattenedAgg: RowAggregator = new RowAggregator(selectedSchema, aggregations.flatMap(_.unWindowed))
+  @transient lazy val flattenedAgg: RowAggregator =
+    new RowAggregator(selectedSchema, aggregations.flatMap(_.unWindowed))
   lazy val incrementalSchema: Array[(String, api.DataType)] = flattenedAgg.incrementalOutputSchema
 
-
   @transient
   protected[spark] lazy val windowAggregator: RowAggregator =
-        new RowAggregator(selectedSchema, aggregations.flatMap(_.unpack))
+    new RowAggregator(selectedSchema, aggregations.flatMap(_.unpack))
 
   def snapshotEntitiesBase: RDD[(Array[Any], Array[Any])] = {
     val keys = (keyColumns :+ tableUtils.partitionColumn).toArray
@@ -370,23 +369,26 @@ class GroupBy(val aggregations: Seq[api.Aggregation],
     toDf(outputRdd, Seq(Constants.TimeColumn -> LongType, tableUtils.partitionColumn -> StringType))
   }
 
-  def flattenOutputArrayType(hopsArrays: RDD[(KeyWithHash, HopsAggregator.OutputArrayType)]): RDD[(Array[Any], Array[Any])] = {
-    hopsArrays.flatMap { case (keyWithHash: KeyWithHash, hopsArray: HopsAggregator.OutputArrayType) =>
-      val hopsArrayHead: Array[HopIr] = hopsArray.headOption.get
-      hopsArrayHead.map { array: HopIr =>
-        val timestamp = array.last.asInstanceOf[Long]
-        val withoutTimestamp = array.dropRight(1)
-        val normalizedIR = flattenedAgg.normalize(withoutTimestamp)
-        ((keyWithHash.data :+ tableUtils.partitionSpec.at(timestamp) :+ timestamp), normalizedIR)
-      }
+  def flattenOutputArrayType(
+      hopsArrays: RDD[(KeyWithHash, HopsAggregator.OutputArrayType)]): RDD[(Array[Any], Array[Any])] = {
+    hopsArrays.flatMap {
+      case (keyWithHash: KeyWithHash, hopsArray: HopsAggregator.OutputArrayType) =>
+        val hopsArrayHead: Array[HopIr] = hopsArray.headOption.get
+        hopsArrayHead.map { array: HopIr =>
+          val timestamp = array.last.asInstanceOf[Long]
+          val withoutTimestamp = array.dropRight(1)
+          val normalizedIR = flattenedAgg.normalize(withoutTimestamp)
+          ((keyWithHash.data :+ tableUtils.partitionSpec.at(timestamp) :+ timestamp), normalizedIR)
+        }
     }
   }
 
   def convertHopsToDf(hops: RDD[(KeyWithHash, HopsAggregator.OutputArrayType)],
-                             schema: Array[(String, ai.chronon.api.DataType)]
-                            ): DataFrame = {
+                      schema: Array[(String, ai.chronon.api.DataType)]): DataFrame = {
     val hopsDf = flattenOutputArrayType(hops)
-    toDf(hopsDf, Seq(tableUtils.partitionColumn -> StringType, Constants.TimeColumn -> LongType), Some(SparkConversions.fromChrononSchema(schema)))
+    toDf(hopsDf,
+         Seq(tableUtils.partitionColumn -> StringType, Constants.TimeColumn -> LongType),
+         Some(SparkConversions.fromChrononSchema(schema)))
   }
 
   // convert raw data into IRs, collected by hopSizes
@@ -412,7 +414,8 @@ class GroupBy(val aggregations: Seq[api.Aggregation],
   }
 
   protected[spark] def toDf(aggregateRdd: RDD[(Array[Any], Array[Any])],
-                            additionalFields: Seq[(String, DataType)], schema: Option[StructType] = None): DataFrame = {
+                            additionalFields: Seq[(String, DataType)],
+                            schema: Option[StructType] = None): DataFrame = {
     val finalKeySchema = StructType(keySchema ++ additionalFields.map { case (name, typ) => StructField(name, typ) })
     KvRdd(aggregateRdd, finalKeySchema, schema.getOrElse(postAggSchema)).toFlatDf
   }
@@ -425,19 +428,17 @@ class GroupBy(val aggregations: Seq[api.Aggregation],
     }
 
   /**
-   * computes incremental daily table
-   * @param incrementalOutputTable output of the incremental data stored here
-   * @param range date range to calculate daily aggregatiosn
-   * @param tableProps
-   */
-    def computeIncrementalDf(incrementalOutputTable: String,
-                             range: PartitionRange,
-                             tableProps: Map[String, String]) = {
-
-      val hops = hopsAggregate(range.toTimePoints.min, DailyResolution)
-      val hopsDf: DataFrame = convertHopsToDf(hops, incrementalSchema)
-      hopsDf.save(incrementalOutputTable, tableProps)
-    }
+    * computes incremental daily table
+    * @param incrementalOutputTable output of the incremental data stored here
+    * @param range date range to calculate daily aggregatiosn
+    * @param tableProps
+    */
+  def computeIncrementalDf(incrementalOutputTable: String, range: PartitionRange, tableProps: Map[String, String]) = {
+
+    val hops = hopsAggregate(range.toTimePoints.min, DailyResolution)
+    val hopsDf: DataFrame = convertHopsToDf(hops, incrementalSchema)
+    hopsDf.save(incrementalOutputTable, tableProps)
+  }
 }
 
 // TODO: truncate queryRange for caching
@@ -513,9 +514,12 @@ object GroupBy {
            incrementalMode: Boolean = false): GroupBy = {
     logger.info(s"\n----[Processing GroupBy: ${groupByConfOld.metaData.name}]----")
     val groupByConf = replaceJoinSource(groupByConfOld, queryRange, tableUtils, computeDependency, showDf)
-    val sourceQueryWindow: Option[Window]  = if (incrementalMode) Some(new Window(queryRange.daysBetween, TimeUnit.DAYS)) else groupByConf.maxWindow
-    val backfillQueryRange: PartitionRange = if (incrementalMode) PartitionRange(queryRange.end, queryRange.end)(tableUtils) else queryRange
-    val inputDf = buildSourceDataFrame(groupByConf, backfillQueryRange, sourceQueryWindow, tableUtils, schemaOnly = false)
+    val sourceQueryWindow: Option[Window] =
+      if (incrementalMode) Some(new Window(queryRange.daysBetween, TimeUnit.DAYS)) else groupByConf.maxWindow
+    val backfillQueryRange: PartitionRange =
+      if (incrementalMode) PartitionRange(queryRange.end, queryRange.end)(tableUtils) else queryRange
+    val inputDf =
+      buildSourceDataFrame(groupByConf, backfillQueryRange, sourceQueryWindow, tableUtils, schemaOnly = false)
 
     def doesNotNeedTime = !Option(groupByConf.getAggregations).exists(_.toScala.needsTimestamp)
     def hasValidTimeColumn = inputDf.schema.find(_.name == Constants.TimeColumn).exists(_.dataType == LongType)
@@ -593,11 +597,10 @@ object GroupBy {
     }
 
     new GroupBy(Option(groupByConf.getAggregations).map(_.toScala).orNull,
-        keyColumns,
-        nullFiltered,
-        mutationDfFn,
-        finalize = finalizeValue,
-    )
+                keyColumns,
+                nullFiltered,
+                mutationDfFn,
+                finalize = finalizeValue)
   }
 
   def getIntersectedRange(source: api.Source,
@@ -776,11 +779,11 @@ object GroupBy {
     * @param tableUtils
     */
   def computeIncrementalDf(
-                            groupByConf: api.GroupBy,
-                            range: PartitionRange,
-                            tableUtils: TableUtils,
-                            incrementalOutputTable: String,
-                          ): (PartitionRange, Seq[api.AggregationPart]) = {
+      groupByConf: api.GroupBy,
+      range: PartitionRange,
+      tableUtils: TableUtils,
+      incrementalOutputTable: String
+  ): (PartitionRange, Seq[api.AggregationPart]) = {
 
     val tableProps: Map[String, String] = Option(groupByConf.metaData.tableProperties)
       .map(_.toScala)
@@ -799,17 +802,19 @@ object GroupBy {
       skipFirstHole = false
     )
 
-    val incrementalGroupByAggParts  = partitionRangeHoles.map { holes =>
-      val incrementalAggregationParts = holes.map{ hole =>
-        logger.info(s"Filling hole in incremental table: $hole")
-        val incrementalGroupByBackfill =
-          from(groupByConf, hole, tableUtils, computeDependency = true, incrementalMode = true)
-        incrementalGroupByBackfill.computeIncrementalDf(incrementalOutputTable, hole, tableProps)
-        incrementalGroupByBackfill.flattenedAgg.aggregationParts
-      }
+    val incrementalGroupByAggParts = partitionRangeHoles
+      .map { holes =>
+        val incrementalAggregationParts = holes.map { hole =>
+          logger.info(s"Filling hole in incremental table: $hole")
+          val incrementalGroupByBackfill =
+            from(groupByConf, hole, tableUtils, computeDependency = true, incrementalMode = true)
+          incrementalGroupByBackfill.computeIncrementalDf(incrementalOutputTable, hole, tableProps)
+          incrementalGroupByBackfill.flattenedAgg.aggregationParts
+        }
 
-      incrementalAggregationParts.headOption.getOrElse(Seq.empty)
-    }.getOrElse(Seq.empty)
+        incrementalAggregationParts.headOption.getOrElse(Seq.empty)
+      }
+      .getOrElse(Seq.empty)
 
     (incrementalQueryableRange, incrementalGroupByAggParts)
   }
@@ -819,7 +824,7 @@ object GroupBy {
       aggregationParts: Seq[api.AggregationPart],
       groupByConf: api.GroupBy,
       tableUtils: TableUtils,
-      inputSchema: Seq[(String, api.DataType)]) : RDD[(KeyWithHash, HopsAggregator.OutputArrayType)] = {
+      inputSchema: Seq[(String, api.DataType)]): RDD[(KeyWithHash, HopsAggregator.OutputArrayType)] = {
 
     val keyColumns = groupByConf.getKeyColumns.toScala
     val keyBuilder: Row => KeyWithHash =
@@ -839,31 +844,34 @@ object GroupBy {
           // Extract normalized IRs from the row
           // Recursively convert Spark types to Java types that denormalize() expects
           // This handles nested structures (e.g., FIRST_K: Array<Struct> → ArrayList[Array])
-          def convertSparkToJava(value: Any): Any = value match {
-            case null => null
-            case r: Row =>
-              // Struct → Array[Any], recursively convert nested values
-              r.toSeq.map(convertSparkToJava).toArray
-            case arr: scala.collection.mutable.WrappedArray[_] =>
-              // Array → ArrayList, recursively convert elements
-              val converted = arr.map(convertSparkToJava)
-              new java.util.ArrayList[Any](converted.toSeq.asJava)
-            case map: scala.collection.Map[_, _] =>
-              // Map → HashMap, recursively convert values
-              val javaMap = new java.util.HashMap[Any, Any]()
-              map.foreach { case (k, v) =>
-                javaMap.put(k, convertSparkToJava(v))
-              }
-              javaMap
-            case other =>
-              // Scalars (Long, Double, String, byte arrays, etc.) pass through
-              other
-          }
+          def convertSparkToJava(value: Any): Any =
+            value match {
+              case null   => null
+              case r: Row =>
+                // Struct → Array[Any], recursively convert nested values
+                r.toSeq.map(convertSparkToJava).toArray
+              case arr: scala.collection.mutable.WrappedArray[_] =>
+                // Array → ArrayList, recursively convert elements
+                val converted = arr.map(convertSparkToJava)
+                new java.util.ArrayList[Any](converted.toSeq.asJava)
+              case map: scala.collection.Map[_, _] =>
+                // Map → HashMap, recursively convert values
+                val javaMap = new java.util.HashMap[Any, Any]()
+                map.foreach {
+                  case (k, v) =>
+                    javaMap.put(k, convertSparkToJava(v))
+                }
+                javaMap
+              case other =>
+                // Scalars (Long, Double, String, byte arrays, etc.) pass through
+                other
+            }
 
           val normalizedIrs = new Array[Any](aggregationParts.length)
-          aggregationParts.zipWithIndex.foreach { case (part, idx) =>
-            val value = row.get(row.fieldIndex(part.incrementalOutputColumnName))
-            normalizedIrs(idx) = convertSparkToJava(value)
+          aggregationParts.zipWithIndex.foreach {
+            case (part, idx) =>
+              val value = row.get(row.fieldIndex(part.incrementalOutputColumnName))
+              normalizedIrs(idx) = convertSparkToJava(value)
           }
 
           // Denormalize IRs to in-memory format (e.g., ArrayList -> HashSet)
@@ -875,7 +883,7 @@ object GroupBy {
         }
       }
       .groupByKey()
-      .mapValues{ hopIrs =>
+      .mapValues { hopIrs =>
         //Convert to HopsAggregator.OutputArrayType: Array[Array[HopIr]]
         val sortedHops = hopIrs.toArray.sortBy(_.last.asInstanceOf[Long])
         Array(sortedHops)
@@ -883,18 +891,19 @@ object GroupBy {
   }
 
   def fromIncrementalDf(
-                       groupByConf: api.GroupBy,
-                       range: PartitionRange,
-                       tableUtils: TableUtils,
-                     ): GroupBy = {
+      groupByConf: api.GroupBy,
+      range: PartitionRange,
+      tableUtils: TableUtils
+  ): GroupBy = {
 
     val incrementalOutputTable = groupByConf.metaData.incrementalOutputTable
     assert(incrementalOutputTable != null,
-      s"incrementalOutputTable is not set for GroupBy: ${groupByConf.metaData.name}")
+           s"incrementalOutputTable is not set for GroupBy: ${groupByConf.metaData.name}")
 
-    val (incrementalQueryableRange, aggregationParts) = computeIncrementalDf(groupByConf, range, tableUtils, incrementalOutputTable)
+    val (incrementalQueryableRange, aggregationParts) =
+      computeIncrementalDf(groupByConf, range, tableUtils, incrementalOutputTable)
 
-    val (_, incrementalDf: DataFrame) =  incrementalQueryableRange.scanQueryStringAndDf(null, incrementalOutputTable)
+    val (_, incrementalDf: DataFrame) = incrementalQueryableRange.scanQueryStringAndDf(null, incrementalOutputTable)
 
     // Create a DataFrame with the source schema (raw data schema) to match aggregations
     // We need this because GroupBy class variables expect inputDf schema to match aggregation input columns
@@ -905,16 +914,18 @@ object GroupBy {
     // Pass lightweight schema instead of heavy RowAggregator to avoid serializing TimedDispatcher
     val chrononSchema = SparkConversions.toChrononSchema(sourceDf.schema)
 
-    val incrementalHops  = convertIncrementalDfToHops(incrementalDf, aggregationParts, groupByConf, tableUtils, chrononSchema)
+    val incrementalHops =
+      convertIncrementalDfToHops(incrementalDf, aggregationParts, groupByConf, tableUtils, chrononSchema)
 
     new GroupBy(
       groupByConf.getAggregations.toScala,
       groupByConf.getKeyColumns.toScala,
       sourceDf, // Use source schema, not incremental schema
-      () => null,
+      () => null
     ) {
       // Override hopsAggregate to return precomputed hops instead of computing from raw data
-      override def hopsAggregate(minQueryTs: Long, resolution: Resolution): RDD[(KeyWithHash, HopsAggregator.OutputArrayType)] = {
+      override def hopsAggregate(minQueryTs: Long,
+                                 resolution: Resolution): RDD[(KeyWithHash, HopsAggregator.OutputArrayType)] = {
         incrementalHops
       }
     }
diff --git a/spark/src/main/scala/ai/chronon/spark/catalog/TableUtils.scala b/spark/src/main/scala/ai/chronon/spark/catalog/TableUtils.scala
index 331b437f7e..fca8d723be 100644
--- a/spark/src/main/scala/ai/chronon/spark/catalog/TableUtils.scala
+++ b/spark/src/main/scala/ai/chronon/spark/catalog/TableUtils.scala
@@ -909,7 +909,7 @@ case class TableUtils(sparkSession: SparkSession) {
           .map(partitionSpec.shift(_, inputToOutputShift))
       }
       .getOrElse(fillablePartitions)
-    
+
     val inputMissing = fillablePartitions -- allInputExisting
     val missingPartitions = outputMissing -- inputMissing
     val missingChunks = chunk(missingPartitions)
diff --git a/spark/src/test/scala/ai/chronon/spark/test/GroupByTest.scala b/spark/src/test/scala/ai/chronon/spark/test/GroupByTest.scala
index 75e4600430..fb0c3727e6 100644
--- a/spark/src/test/scala/ai/chronon/spark/test/GroupByTest.scala
+++ b/spark/src/test/scala/ai/chronon/spark/test/GroupByTest.scala
@@ -961,7 +961,7 @@ class GroupByTest {
     """)
     assertTrue("Should be able to filter GroupBy results", filteredResult.count() >= 0)
   }
-  
+
 
   private def createTestSourceIncremental(windowSize: Int = 365,
                                suffix: String = "",

From b296d70c3e813425cb8d5b49a19198d9b891104f Mon Sep 17 00:00:00 2001
From: chaitu <kambstreat@users.noreply.github.com>
Date: Fri, 6 Feb 2026 23:07:26 -0800
Subject: [PATCH 40/54] fix for scala 2.13

---
 spark/src/main/scala/ai/chronon/spark/Comparison.scala | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/spark/src/main/scala/ai/chronon/spark/Comparison.scala b/spark/src/main/scala/ai/chronon/spark/Comparison.scala
index 9623d87202..378505cd42 100644
--- a/spark/src/main/scala/ai/chronon/spark/Comparison.scala
+++ b/spark/src/main/scala/ai/chronon/spark/Comparison.scala
@@ -37,7 +37,7 @@ object Comparison {
           // Flatten struct fields: struct_name.field_name -> struct_name_field_name
           structType.fields.map { subField =>
             col(s"${field.name}.${subField.name}").alias(s"${field.name}_${subField.name}")
-          }
+          }.toSeq
         case _ =>
           // Keep non-struct fields as-is
           Seq(col(field.name))

From 3202e7da25cc9931cf98cf51276e14f34c0bd276 Mon Sep 17 00:00:00 2001
From: chaitu <kambstreat@users.noreply.github.com>
Date: Sat, 7 Feb 2026 09:56:35 -0800
Subject: [PATCH 41/54] fix for scala 2.13

---
 spark/src/main/scala/ai/chronon/spark/Comparison.scala | 5 +++--
 1 file changed, 3 insertions(+), 2 deletions(-)

diff --git a/spark/src/main/scala/ai/chronon/spark/Comparison.scala b/spark/src/main/scala/ai/chronon/spark/Comparison.scala
index 378505cd42..071d92a8df 100644
--- a/spark/src/main/scala/ai/chronon/spark/Comparison.scala
+++ b/spark/src/main/scala/ai/chronon/spark/Comparison.scala
@@ -79,7 +79,8 @@ object Comparison {
   }
 
   // Sort lists/arrays for comparison (order shouldn't matter for sets)
-  def sortedList(list: mutable.WrappedArray[Any]): String = {
+  // Use Seq[Any] for Scala 2.13 compatibility (WrappedArray in 2.11/2.12, ArraySeq in 2.13)
+  def sortedList(list: Seq[Any]): String = {
     if (list == null) return null
     // Sort using clean string representation
     val sorted = list.sorted(Ordering.by[Any, String](elementToString))
@@ -109,7 +110,7 @@ object Comparison {
 
   def sortLists(df: DataFrame): DataFrame = {
     try {
-      df.sparkSession.udf.register("sorted_list", (list: mutable.WrappedArray[Any]) => sortedList(list))
+      df.sparkSession.udf.register("sorted_list", (list: Seq[Any]) => sortedList(list))
     } catch {
       case e: Exception => e.printStackTrace()
     }

From fa2fcba19ddd33da4d656c2aee2731bc12c325bc Mon Sep 17 00:00:00 2001
From: chaitu <kambstreat@users.noreply.github.com>
Date: Sat, 7 Feb 2026 13:31:58 -0800
Subject: [PATCH 42/54] change unit test

---
 .../scala/ai/chronon/spark/Comparison.scala   | 58 +------------------
 .../ai/chronon/spark/test/GroupByTest.scala   | 31 +++++-----
 2 files changed, 20 insertions(+), 69 deletions(-)

diff --git a/spark/src/main/scala/ai/chronon/spark/Comparison.scala b/spark/src/main/scala/ai/chronon/spark/Comparison.scala
index 071d92a8df..b515c31c5e 100644
--- a/spark/src/main/scala/ai/chronon/spark/Comparison.scala
+++ b/spark/src/main/scala/ai/chronon/spark/Comparison.scala
@@ -37,7 +37,7 @@ object Comparison {
           // Flatten struct fields: struct_name.field_name -> struct_name_field_name
           structType.fields.map { subField =>
             col(s"${field.name}.${subField.name}").alias(s"${field.name}_${subField.name}")
-          }.toSeq
+          }
         case _ =>
           // Keep non-struct fields as-is
           Seq(col(field.name))
@@ -57,41 +57,6 @@ object Comparison {
     gson.toJson(tm)
   }
 
-  // Convert element to simple representation (Row → Array)
-  private def simplifyElement(x: Any): Any = {
-    if (x == null) return null
-    x match {
-      case row: org.apache.spark.sql.Row =>
-        // Extract just the values from Row, without Spark schema metadata
-        (0 until row.length).map(i => if (row.isNullAt(i)) null else row.get(i)).toArray
-      case other => other
-    }
-  }
-
-  // Convert element to string for sorting
-  private def elementToString(x: Any): String = {
-    if (x == null) return ""
-    val simplified = simplifyElement(x)
-    simplified match {
-      case arr: Array[_] => arr.mkString("[", ",", "]")
-      case other         => other.toString
-    }
-  }
-
-  // Sort lists/arrays for comparison (order shouldn't matter for sets)
-  // Use Seq[Any] for Scala 2.13 compatibility (WrappedArray in 2.11/2.12, ArraySeq in 2.13)
-  def sortedList(list: Seq[Any]): String = {
-    if (list == null) return null
-    // Sort using clean string representation
-    val sorted = list.sorted(Ordering.by[Any, String](elementToString))
-    val gson = new GsonBuilder()
-      .serializeSpecialFloatingPointValues()
-      .create()
-    // Simplify Row objects to plain arrays before JSON serialization
-    val simplified = sorted.map(simplifyElement)
-    gson.toJson(simplified.toArray)
-  }
-
   def stringifyMaps(df: DataFrame): DataFrame = {
     try {
       df.sparkSession.udf.register("sorted_json", (m: Map[String, Any]) => sortedJson(m))
@@ -108,22 +73,6 @@ object Comparison {
     df.selectExpr(selects: _*)
   }
 
-  def sortLists(df: DataFrame): DataFrame = {
-    try {
-      df.sparkSession.udf.register("sorted_list", (list: Seq[Any]) => sortedList(list))
-    } catch {
-      case e: Exception => e.printStackTrace()
-    }
-    val selects = for (field <- df.schema.fields) yield {
-      if (field.dataType.isInstanceOf[ArrayType]) {
-        s"sorted_list(${field.name}) as `${field.name}`"
-      } else {
-        s"${field.name} as `${field.name}`"
-      }
-    }
-    df.selectExpr(selects: _*)
-  }
-
   // Produces a "comparison" dataframe - given two dataframes that are supposed to have same data
   // The result contains the differing rows of the same key
   def sideBySide(a: DataFrame,
@@ -140,9 +89,8 @@ object Comparison {
     )
 
     // Flatten structs so nested double fields can be compared with tolerance
-    // Sort lists so order doesn't matter for comparison (e.g., UNIQUE_COUNT arrays)
-    val aFlattened = flattenStructs(sortLists(stringifyMaps(a)))
-    val bFlattened = flattenStructs(sortLists(stringifyMaps(b)))
+    val aFlattened = flattenStructs(stringifyMaps(a))
+    val bFlattened = flattenStructs(stringifyMaps(b))
 
     val prefixedExpectedDf = prefixColumnName(aFlattened, s"${aName}_")
     val prefixedOutputDf = prefixColumnName(bFlattened, s"${bName}_")
diff --git a/spark/src/test/scala/ai/chronon/spark/test/GroupByTest.scala b/spark/src/test/scala/ai/chronon/spark/test/GroupByTest.scala
index fb0c3727e6..7e0b33f75c 100644
--- a/spark/src/test/scala/ai/chronon/spark/test/GroupByTest.scala
+++ b/spark/src/test/scala/ai/chronon/spark/test/GroupByTest.scala
@@ -1039,13 +1039,13 @@ class GroupByTest {
       Builders.Aggregation(Operation.VARIANCE, "price", Seq(new Window(7, TimeUnit.DAYS))),
 
       // UNIQUE_COUNT (array IR)
-      Builders.Aggregation(Operation.UNIQUE_COUNT, "price", Seq(new Window(7, TimeUnit.DAYS))),
+      //Builders.Aggregation(Operation.UNIQUE_COUNT, "price", Seq(new Window(7, TimeUnit.DAYS))),
 
       // HISTOGRAM (map IR)
       Builders.Aggregation(Operation.HISTOGRAM, "product_id", Seq(new Window(7, TimeUnit.DAYS)), argMap = Map("k" -> "0")),
 
       // BOUNDED_UNIQUE_COUNT (array IR with bound)
-      Builders.Aggregation(Operation.BOUNDED_UNIQUE_COUNT, "product_id", Seq(new Window(7, TimeUnit.DAYS)), argMap = Map("k" -> "100"))
+      //Builders.Aggregation(Operation.BOUNDED_UNIQUE_COUNT, "product_id", Seq(new Window(7, TimeUnit.DAYS)), argMap = Map("k" -> "100"))
     )
 
     val tableProps: Map[String, String] = Map("source" -> "chronon")
@@ -1078,14 +1078,18 @@ class GroupByTest {
     assertTrue("IR must contain MIN price", irColumns.exists(_.contains("price_min")))
     assertTrue("IR must contain MAX quantity", irColumns.exists(_.contains("quantity_max")))
     assertTrue("IR must contain VARIANCE price", irColumns.exists(_.contains("price_variance")))
-    assertTrue("IR must contain UNIQUE COUNT price", irColumns.exists(_.contains("price_unique_count")))
+    //assertTrue("IR must contain UNIQUE COUNT price", irColumns.exists(_.contains("price_unique_count")))
     assertTrue("IR must contain HISTOGRAM product_id", irColumns.exists(_.contains("product_id_histogram")))
-    assertTrue("IR must contain BOUNDED_UNIQUE_COUNT product_id", irColumns.exists(_.contains("product_id_bounded_unique_count")))
+    //assertTrue("IR must contain BOUNDED_UNIQUE_COUNT product_id", irColumns.exists(_.contains("product_id_bounded_unique_count")))
 
     // ASSERTION 4: Verify IR table has data
     val irRowCount = actualIncrementalDf.count()
     assertTrue(s"IR table should have rows, found ${irRowCount}", irRowCount > 0)
 
+
+    //    collect_set(price) as price_unique_count,
+    //    slice(collect_set(md5(product_id)), 1, 100) as product_id_bounded_unique_count
+
     // ASSERTION 5: Compare against SQL computation
     val query =
       s"""
@@ -1100,9 +1104,7 @@ class GroupByTest {
          |      cast(count(price) as int) as count,
          |      avg(price) as mean,
          |      sum(price * price) - count(price) * avg(price) * avg(price) as m2
-         |    ) as price_variance,
-         |    collect_set(price) as price_unique_count,
-         |    slice(collect_set(md5(product_id)), 1, 100) as product_id_bounded_unique_count
+         |    ) as price_variance
          |  FROM test_basic_aggs_input
          |  WHERE ds='$today_minus_7_date'
          |  GROUP BY user, ds
@@ -1127,15 +1129,16 @@ class GroupByTest {
 
     // Convert array columns to counts for comparison (since MD5 hashing differs between Scala and SQL)
     import org.apache.spark.sql.functions.size
-    val actualWithCounts = actualIncrementalDf
-      .withColumn("price_unique_count", size(col("price_unique_count")))
-      .withColumn("product_id_bounded_unique_count", size(col("product_id_bounded_unique_count")))
+    //val actualWithCounts = actualIncrementalDf
+    //  .withColumn("price_unique_count", size(col("price_unique_count")))
+    //  .withColumn("product_id_bounded_unique_count", size(col("product_id_bounded_unique_count")))
 
-    val expectedWithCounts = expectedDf
-      .withColumn("price_unique_count", size(col("price_unique_count")))
-      .withColumn("product_id_bounded_unique_count", size(col("product_id_bounded_unique_count")))
+    //val expectedWithCounts = expectedDf
+    //  .withColumn("price_unique_count", size(col("price_unique_count")))
+    //  .withColumn("product_id_bounded_unique_count", size(col("product_id_bounded_unique_count")))
 
-    val diff = Comparison.sideBySide(actualWithCounts, expectedWithCounts, List("user", tableUtils.partitionColumn))
+    //val diff = Comparison.sideBySide(actualWithCounts, expectedWithCounts, List("user", tableUtils.partitionColumn))
+    val diff = Comparison.sideBySide(actualIncrementalDf, expectedDf, List("user", tableUtils.partitionColumn))
 
     if (diff.count() > 0) {
       println(s"=== Diff Details for All Aggregations ===")

From b0e90bfdba8f74d0554ad9ce6dc2fa04c85d87ab Mon Sep 17 00:00:00 2001
From: chaitu <kambstreat@users.noreply.github.com>
Date: Sat, 7 Feb 2026 13:53:47 -0800
Subject: [PATCH 43/54] add toSeq for scala 2.13 compatibility

---
 spark/src/main/scala/ai/chronon/spark/Comparison.scala | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/spark/src/main/scala/ai/chronon/spark/Comparison.scala b/spark/src/main/scala/ai/chronon/spark/Comparison.scala
index b515c31c5e..2e5032deab 100644
--- a/spark/src/main/scala/ai/chronon/spark/Comparison.scala
+++ b/spark/src/main/scala/ai/chronon/spark/Comparison.scala
@@ -31,13 +31,13 @@ object Comparison {
 
   // Flatten struct columns into individual columns so nested double fields can be compared with tolerance
   private def flattenStructs(df: DataFrame): DataFrame = {
-    val flattenedSelects = df.schema.fields.flatMap { field =>
+    val flattenedSelects = df.schema.fields.toSeq.flatMap { field =>
       field.dataType match {
         case structType: StructType =>
           // Flatten struct fields: struct_name.field_name -> struct_name_field_name
           structType.fields.map { subField =>
             col(s"${field.name}.${subField.name}").alias(s"${field.name}_${subField.name}")
-          }
+          }.toSeq
         case _ =>
           // Keep non-struct fields as-is
           Seq(col(field.name))

From 7138a66771eac60a3e364f580946736aa45e9346 Mon Sep 17 00:00:00 2001
From: chaitu <kambstreat@users.noreply.github.com>
Date: Sun, 8 Feb 2026 16:08:46 -0800
Subject: [PATCH 44/54] fix last/first tests

---
 .../ai/chronon/spark/test/GroupByTest.scala   | 19 +++++++++++++++----
 1 file changed, 15 insertions(+), 4 deletions(-)

diff --git a/spark/src/test/scala/ai/chronon/spark/test/GroupByTest.scala b/spark/src/test/scala/ai/chronon/spark/test/GroupByTest.scala
index 7e0b33f75c..82b90f79ae 100644
--- a/spark/src/test/scala/ai/chronon/spark/test/GroupByTest.scala
+++ b/spark/src/test/scala/ai/chronon/spark/test/GroupByTest.scala
@@ -1282,10 +1282,19 @@ class GroupByTest {
     val groupBy = new GroupBy(aggregations, Seq("user"), df)
     groupBy.computeIncrementalDf(s"${namespace}.testIncrementalFirstLastOutput", partitionRange, tableProps)
 
-    val actualIncrementalDf = spark.sql(s"select * from ${namespace}.testIncrementalFirstLastOutput where ds='$today_minus_7_date'")
+    val rawIncrementalDf = spark.sql(s"select * from ${namespace}.testIncrementalFirstLastOutput where ds='$today_minus_7_date'")
 
     println("=== Incremental FIRST/LAST IR Schema ===")
-    actualIncrementalDf.printSchema()
+    rawIncrementalDf.printSchema()
+
+    // Sort array columns in raw IRs to match SQL output ordering
+    // Raw IRs store unsorted arrays for mergeability, but we need to sort them for comparison
+    import org.apache.spark.sql.functions.{sort_array, col}
+    val actualIncrementalDf = rawIncrementalDf
+      .withColumn("value_first3", sort_array(col("value_first3")))
+      .withColumn("value_last3", sort_array(col("value_last3")))
+      .withColumn("value_top3", sort_array(col("value_top3")))
+      .withColumn("value_bottom3", sort_array(col("value_bottom3")))
 
     // Compare against SQL computation
     // Note: ts column in IR table is the partition timestamp (derived from ds)
@@ -1307,11 +1316,13 @@ class GroupByTest {
          |    x -> named_struct('epochMillis', x.ts, 'payload', x.value)
          |  ) as value_first3,
          |  transform(
-         |    slice(reverse(sort_array(filter(collect_list(struct(ts, value)), x -> x.value IS NOT NULL))), 1, 3),
+         |    slice(sort_array(filter(collect_list(struct(ts, value)), x -> x.value IS NOT NULL)),
+         |          greatest(-size(sort_array(filter(collect_list(struct(ts, value)), x -> x.value IS NOT NULL))), -3), 3),
          |    x -> named_struct('epochMillis', x.ts, 'payload', x.value)
          |  ) as value_last3,
          |  transform(
-         |    slice(sort_array(filter(collect_list(struct(value, ts)), x -> x.value IS NOT NULL), false), 1, 3),
+         |    slice(sort_array(filter(collect_list(struct(value, ts)), x -> x.value IS NOT NULL), true),
+         |          greatest(-size(sort_array(filter(collect_list(struct(value, ts)), x -> x.value IS NOT NULL))), -3), 3),
          |    x -> x.value
          |  ) as value_top3,
          |  transform(

From 325302c5763a1ba00189ebc098386d6f9b3de0d9 Mon Sep 17 00:00:00 2001
From: chaitu <kambstreat@users.noreply.github.com>
Date: Mon, 9 Feb 2026 10:21:38 -0800
Subject: [PATCH 45/54] add new test file for incremental aggregations

---
 .../spark/test/GroupByIncrementalTest.scala   | 516 ++++++++++++++++++
 .../ai/chronon/spark/test/GroupByTest.scala   | 469 ----------------
 2 files changed, 516 insertions(+), 469 deletions(-)
 create mode 100644 spark/src/test/scala/ai/chronon/spark/test/GroupByIncrementalTest.scala

diff --git a/spark/src/test/scala/ai/chronon/spark/test/GroupByIncrementalTest.scala b/spark/src/test/scala/ai/chronon/spark/test/GroupByIncrementalTest.scala
new file mode 100644
index 0000000000..26a5071418
--- /dev/null
+++ b/spark/src/test/scala/ai/chronon/spark/test/GroupByIncrementalTest.scala
@@ -0,0 +1,516 @@
+/*
+ *    Copyright (C) 2023 The Chronon Authors.
+ *
+ *    Licensed under the Apache License, Version 2.0 (the "License");
+ *    you may not use this file except in compliance with the License.
+ *    You may obtain a copy of the License at
+ *
+ *        http://www.apache.org/licenses/LICENSE-2.0
+ *
+ *    Unless required by applicable law or agreed to in writing, software
+ *    distributed under the License is distributed on an "AS IS" BASIS,
+ *    WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ *    See the License for the specific language governing permissions and
+ *    limitations under the License.
+ */
+
+package ai.chronon.spark.test
+
+import ai.chronon.aggregator.test.{CStream, Column}
+import ai.chronon.api.Extensions._
+import ai.chronon.api.{
+  Aggregation,
+  Builders,
+  DoubleType,
+  IntType,
+  LongType,
+  Operation,
+  Source,
+  StringType,
+  TimeUnit,
+  Window
+}
+import ai.chronon.spark.Extensions._
+import ai.chronon.spark._
+import ai.chronon.spark.catalog.TableUtils
+import org.apache.spark.rdd.RDD
+import org.apache.spark.sql.types.{BinaryType, StructField, StructType, LongType => SparkLongType, StringType => SparkStringType}
+import org.apache.spark.sql.{Row, SparkSession}
+import org.apache.spark.sql.functions.col
+import org.junit.Assert._
+import org.junit.Test
+
+import scala.util.Random
+
+class GroupByIncrementalTest {
+
+  private def createTestSourceIncremental(windowSize: Int = 365,
+                               suffix: String = "",
+                               partitionColOpt: Option[String] = None): (Source, String) = {
+    lazy val spark: SparkSession =
+      SparkSessionBuilder.build("GroupByIncrementalTest" + "_" + Random.alphanumeric.take(6).mkString, local = true)
+    implicit val tableUtils = TableUtils(spark)
+    val today = tableUtils.partitionSpec.at(System.currentTimeMillis())
+    val startPartition = tableUtils.partitionSpec.minus(today, new Window(windowSize, TimeUnit.DAYS))
+    val endPartition = tableUtils.partitionSpec.at(System.currentTimeMillis())
+    val sourceSchema = List(
+      Column("user", StringType, 10000),
+      Column("item", StringType, 100),
+      Column("time_spent_ms", LongType, 5000),
+      Column("price", DoubleType, 100)
+    )
+    val namespace = "chronon_incremental_test"
+    val sourceTable = s"$namespace.test_group_by_steps$suffix"
+
+    tableUtils.createDatabase(namespace)
+    val genDf =
+      DataFrameGen.events(spark, sourceSchema, count = 1000, partitions = 200, partitionColOpt = partitionColOpt)
+    partitionColOpt match {
+      case Some(partitionCol) => genDf.save(sourceTable, partitionColumns = Seq(partitionCol))
+      case None               => genDf.save(sourceTable)
+    }
+
+    val source = Builders.Source.events(
+      query = Builders.Query(selects = Builders.Selects("ts", "user", "time_spent_ms", "price", "item"),
+        startPartition = startPartition,
+        partitionColumn = partitionColOpt.orNull),
+      table = sourceTable
+    )
+    (source, endPartition)
+  }
+
+  /**
+   * Tests basic aggregations in incremental mode by comparing Chronon's output against SQL.
+   *
+   * Operations: SUM, COUNT, AVERAGE, MIN, MAX, VARIANCE, UNIQUE_COUNT, HISTOGRAM, BOUNDED_UNIQUE_COUNT
+   *
+   * Actual:   Chronon computes daily IRs using computeIncrementalDf, storing intermediate results
+   * Expected: SQL query computes the same aggregations directly on the input data for the same date
+   */
+  @Test
+  def testIncrementalBasicAggregations(): Unit = {
+    lazy val spark: SparkSession = SparkSessionBuilder.build("GroupByTestIncrementalBasic" + "_" + Random.alphanumeric.take(6).mkString, local = true)
+    implicit val tableUtils = TableUtils(spark)
+    val namespace = s"incremental_basic_aggs_${Random.alphanumeric.take(6).mkString}"
+    tableUtils.createDatabase(namespace)
+
+    val schema = List(
+      Column("user", StringType, 10),
+      Column("price", DoubleType, 100),
+      Column("quantity", IntType, 50),
+      Column("product_id", StringType, 20),  // Low cardinality for UNIQUE_COUNT, HISTOGRAM, BOUNDED_UNIQUE_COUNT
+      Column("rating", DoubleType, 2000)
+    )
+
+    val df = DataFrameGen.events(spark, schema, count = 100000, partitions = 100)
+
+    val aggregations: Seq[Aggregation] = Seq(
+      // Simple aggregations
+      Builders.Aggregation(Operation.SUM, "price", Seq(new Window(7, TimeUnit.DAYS))),
+      Builders.Aggregation(Operation.COUNT, "quantity", Seq(new Window(7, TimeUnit.DAYS))),
+
+      // Complex aggregation - AVERAGE (struct IR with sum/count)
+      Builders.Aggregation(Operation.AVERAGE, "rating", Seq(new Window(7, TimeUnit.DAYS))),
+
+      // Min/Max
+      Builders.Aggregation(Operation.MIN, "price", Seq(new Window(7, TimeUnit.DAYS))),
+      Builders.Aggregation(Operation.MAX, "quantity", Seq(new Window(7, TimeUnit.DAYS))),
+
+      // Variance (struct IR with count/mean/m2)
+      Builders.Aggregation(Operation.VARIANCE, "price", Seq(new Window(7, TimeUnit.DAYS))),
+
+      // UNIQUE_COUNT (array IR)
+      //Builders.Aggregation(Operation.UNIQUE_COUNT, "price", Seq(new Window(7, TimeUnit.DAYS))),
+
+      // HISTOGRAM (map IR)
+      Builders.Aggregation(Operation.HISTOGRAM, "product_id", Seq(new Window(7, TimeUnit.DAYS)), argMap = Map("k" -> "0")),
+
+      // BOUNDED_UNIQUE_COUNT (array IR with bound)
+      //Builders.Aggregation(Operation.BOUNDED_UNIQUE_COUNT, "product_id", Seq(new Window(7, TimeUnit.DAYS)), argMap = Map("k" -> "100"))
+    )
+
+    val tableProps: Map[String, String] = Map("source" -> "chronon")
+
+    val today_date = tableUtils.partitionSpec.at(System.currentTimeMillis())
+    val today_minus_7_date = tableUtils.partitionSpec.minus(today_date, new Window(7, TimeUnit.DAYS))
+    val today_minus_20_date = tableUtils.partitionSpec.minus(today_date, new Window(20, TimeUnit.DAYS))
+
+    val partitionRange = PartitionRange(today_minus_20_date, today_date)
+
+    val groupBy = new GroupBy(aggregations, Seq("user"), df)
+    groupBy.computeIncrementalDf(s"${namespace}.testIncrementalBasicAggsOutput", partitionRange, tableProps)
+
+    val actualIncrementalDf = spark.sql(s"select * from ${namespace}.testIncrementalBasicAggsOutput where ds='$today_minus_7_date'")
+    df.createOrReplaceTempView("test_basic_aggs_input")
+
+    println("=== Incremental IR Schema ===")
+    actualIncrementalDf.printSchema()
+
+    // ASSERTION 1: Verify IR table has expected key columns
+    val irColumns = actualIncrementalDf.schema.fieldNames.toSet
+    assertTrue("IR must contain 'user' column", irColumns.contains("user"))
+    assertTrue("IR must contain 'ds' column", irColumns.contains("ds"))
+    assertTrue("IR must contain 'ts' column", irColumns.contains("ts"))
+
+    // ASSERTION 2: Verify all aggregation columns exist
+    assertTrue("IR must contain SUM price", irColumns.exists(_.contains("price_sum")))
+    assertTrue("IR must contain COUNT quantity", irColumns.exists(_.contains("quantity_count")))
+    assertTrue("IR must contain AVERAGE rating", irColumns.exists(_.contains("rating_average")))
+    assertTrue("IR must contain MIN price", irColumns.exists(_.contains("price_min")))
+    assertTrue("IR must contain MAX quantity", irColumns.exists(_.contains("quantity_max")))
+    assertTrue("IR must contain VARIANCE price", irColumns.exists(_.contains("price_variance")))
+    //assertTrue("IR must contain UNIQUE COUNT price", irColumns.exists(_.contains("price_unique_count")))
+    assertTrue("IR must contain HISTOGRAM product_id", irColumns.exists(_.contains("product_id_histogram")))
+    //assertTrue("IR must contain BOUNDED_UNIQUE_COUNT product_id", irColumns.exists(_.contains("product_id_bounded_unique_count")))
+
+    // ASSERTION 4: Verify IR table has data
+    val irRowCount = actualIncrementalDf.count()
+    assertTrue(s"IR table should have rows, found ${irRowCount}", irRowCount > 0)
+
+
+    //    collect_set(price) as price_unique_count,
+    //    slice(collect_set(md5(product_id)), 1, 100) as product_id_bounded_unique_count
+
+    // ASSERTION 5: Compare against SQL computation
+    val query =
+      s"""
+         |WITH base_aggs AS (
+         |  SELECT user, ds, UNIX_TIMESTAMP(ds, 'yyyy-MM-dd')*1000 as ts,
+         |    sum(price) as price_sum,
+         |    count(quantity) as quantity_count,
+         |    struct(sum(rating) as sum, count(rating) as count) as rating_average,
+         |    min(price) as price_min,
+         |    max(quantity) as quantity_max,
+         |    struct(
+         |      cast(count(price) as int) as count,
+         |      avg(price) as mean,
+         |      sum(price * price) - count(price) * avg(price) * avg(price) as m2
+         |    ) as price_variance
+         |  FROM test_basic_aggs_input
+         |  WHERE ds='$today_minus_7_date'
+         |  GROUP BY user, ds
+         |),
+         |histogram_agg AS (
+         |  SELECT user, ds,
+         |    map_from_entries(collect_list(struct(product_id, cast(cnt as int)))) as product_id_histogram
+         |  FROM (
+         |    SELECT user, ds, product_id, count(*) as cnt
+         |    FROM test_basic_aggs_input
+         |    WHERE ds='$today_minus_7_date' AND product_id IS NOT NULL
+         |    GROUP BY user, ds, product_id
+         |  )
+         |  GROUP BY user, ds
+         |)
+         |SELECT b.*, h.product_id_histogram
+         |FROM base_aggs b
+         |LEFT JOIN histogram_agg h ON b.user <=> h.user AND b.ds <=> h.ds
+         |""".stripMargin
+
+    val expectedDf = spark.sql(query)
+
+    // Convert array columns to counts for comparison (since MD5 hashing differs between Scala and SQL)
+    import org.apache.spark.sql.functions.size
+    //val actualWithCounts = actualIncrementalDf
+    //  .withColumn("price_unique_count", size(col("price_unique_count")))
+    //  .withColumn("product_id_bounded_unique_count", size(col("product_id_bounded_unique_count")))
+
+    //val expectedWithCounts = expectedDf
+    //  .withColumn("price_unique_count", size(col("price_unique_count")))
+    //  .withColumn("product_id_bounded_unique_count", size(col("product_id_bounded_unique_count")))
+
+    //val diff = Comparison.sideBySide(actualWithCounts, expectedWithCounts, List("user", tableUtils.partitionColumn))
+    val diff = Comparison.sideBySide(actualIncrementalDf, expectedDf, List("user", tableUtils.partitionColumn))
+
+    if (diff.count() > 0) {
+      println(s"=== Diff Details for All Aggregations ===")
+      println(s"Actual count: ${irRowCount}")
+      println(s"Expected count: ${expectedDf.count()}")
+      println(s"Diff count: ${diff.count()}")
+      diff.show(100, truncate = false)
+    }
+
+    assertEquals(0, diff.count())
+  }
+
+  /**
+   * This test verifies that the incremental snapshotEvents output matches the non-incremental output.
+   *
+   * 1. Computes snapshotEvents using the standard GroupBy on the full input data.
+   * 2. Computes snapshotEvents using GroupBy in incremental mode over the same date range.
+   * 3. Compares the two outputs to ensure they are identical.
+   */
+  @Test
+  def testSnapshotIncrementalEvents(): Unit = {
+    lazy val spark: SparkSession = SparkSessionBuilder.build("GroupByTest" + "_" + Random.alphanumeric.take(6).mkString, local = true)
+    implicit val tableUtils = TableUtils(spark)
+    val namespace =  s"incremental_groupBy_snapshot_${Random.alphanumeric.take(6).mkString}"
+    tableUtils.createDatabase(namespace)
+
+
+    val outputDates = CStream.genPartitions(10, tableUtils.partitionSpec)
+
+    val aggregations: Seq[Aggregation] = Seq(
+      // Basic
+      Builders.Aggregation(Operation.SUM, "time_spent_ms", Seq(new Window(10, TimeUnit.DAYS), new Window(5, TimeUnit.DAYS))),
+      Builders.Aggregation(Operation.SUM, "price", Seq(new Window(10, TimeUnit.DAYS))),
+      Builders.Aggregation(Operation.COUNT, "user", Seq(new Window(10, TimeUnit.DAYS))),
+      Builders.Aggregation(Operation.AVERAGE, "price", Seq(new Window(10, TimeUnit.DAYS))),
+      Builders.Aggregation(Operation.MIN, "price", Seq(new Window(10, TimeUnit.DAYS))),
+      Builders.Aggregation(Operation.MAX, "price", Seq(new Window(10, TimeUnit.DAYS))),
+      // Statistical
+      Builders.Aggregation(Operation.VARIANCE, "price", Seq(new Window(10, TimeUnit.DAYS))),
+      Builders.Aggregation(Operation.SKEW, "price", Seq(new Window(10, TimeUnit.DAYS))),
+      Builders.Aggregation(Operation.KURTOSIS, "price", Seq(new Window(10, TimeUnit.DAYS))),
+      Builders.Aggregation(Operation.APPROX_PERCENTILE, "price", Seq(new Window(10, TimeUnit.DAYS)),
+        argMap = Map("percentiles" -> "[0.5, 0.25, 0.75]")),
+      // Temporal
+      Builders.Aggregation(Operation.FIRST, "price", Seq(new Window(10, TimeUnit.DAYS))),
+      Builders.Aggregation(Operation.LAST, "price", Seq(new Window(10, TimeUnit.DAYS))),
+      Builders.Aggregation(Operation.FIRST_K, "price", Seq(new Window(10, TimeUnit.DAYS)), argMap = Map("k" -> "3")),
+      Builders.Aggregation(Operation.LAST_K, "price", Seq(new Window(10, TimeUnit.DAYS)), argMap = Map("k" -> "3")),
+      Builders.Aggregation(Operation.TOP_K, "price", Seq(new Window(10, TimeUnit.DAYS)), argMap = Map("k" -> "3")),
+      Builders.Aggregation(Operation.BOTTOM_K, "price", Seq(new Window(10, TimeUnit.DAYS)), argMap = Map("k" -> "3")),
+      // Cardinality / Set
+      Builders.Aggregation(Operation.UNIQUE_COUNT, "user", Seq(new Window(10, TimeUnit.DAYS))),
+      Builders.Aggregation(Operation.APPROX_UNIQUE_COUNT, "user", Seq(new Window(10, TimeUnit.DAYS))),
+      Builders.Aggregation(Operation.BOUNDED_UNIQUE_COUNT, "user", Seq(new Window(10, TimeUnit.DAYS))),
+      // Distribution
+      Builders.Aggregation(Operation.HISTOGRAM, "user", Seq(new Window(10, TimeUnit.DAYS))),
+      Builders.Aggregation(Operation.APPROX_HISTOGRAM_K, "user", Seq(new Window(10, TimeUnit.DAYS)), argMap = Map("k" -> "10"))
+    )
+
+    val (source, endPartition) = createTestSourceIncremental(windowSize = 30, suffix = "_snapshot_events", partitionColOpt = Some(tableUtils.partitionColumn))
+    val groupByConf = Builders.GroupBy(
+      sources = Seq(source),
+      keyColumns = Seq("item"),
+      aggregations = aggregations,
+      metaData = Builders.MetaData(name = "testSnapshotIncremental", namespace = namespace, team = "chronon"),
+      backfillStartDate = tableUtils.partitionSpec.minus(tableUtils.partitionSpec.at(System.currentTimeMillis()),
+        new Window(20, TimeUnit.DAYS))
+    )
+
+    val df = spark.read.table(source.table)
+
+    val groupBy = new GroupBy(aggregations, Seq("item"), df.filter("item is not null"))
+    val actualDf = groupBy.snapshotEvents(PartitionRange(outputDates.min, outputDates.max))
+
+    val  groupByIncremental = GroupBy.fromIncrementalDf(groupByConf, PartitionRange(outputDates.min, outputDates.max), tableUtils)
+    val incrementalExpectedDf = groupByIncremental.snapshotEvents(PartitionRange(outputDates.min, outputDates.max))
+
+    val outputDatesRdd: RDD[Row] = spark.sparkContext.parallelize(outputDates.map(Row(_)))
+    val outputDatesDf = spark.createDataFrame(outputDatesRdd, StructType(Seq(StructField("ds", SparkStringType))))
+    val datesViewName = "test_group_by_snapshot_events_output_range"
+    outputDatesDf.createOrReplaceTempView(datesViewName)
+
+    val diff = Comparison.sideBySide(actualDf, incrementalExpectedDf, List("item", tableUtils.partitionColumn))
+    if (diff.count() > 0) {
+      diff.show()
+      println("=== Diff result rows ===")
+    }
+    assertEquals(0, diff.count())
+  }
+
+  /**
+   * Unit test for FIRST and LAST aggregations with incremental IR
+   * FIRST/LAST use TimeTuple IR: struct {epochMillis: Long, payload: Value}
+   * FIRST keeps the value with the earliest timestamp
+   * LAST keeps the value with the latest timestamp
+   */
+  @Test
+  def testIncrementalFirstLast(): Unit = {
+    lazy val spark: SparkSession = SparkSessionBuilder.build("GroupByTestFirstLast" + "_" + Random.alphanumeric.take(6).mkString, local = true)
+    implicit val tableUtils = TableUtils(spark)
+    val namespace = s"incremental_first_last_${Random.alphanumeric.take(6).mkString}"
+    tableUtils.createDatabase(namespace)
+
+    val schema = List(
+      Column("user", StringType, 5),
+      Column("value", DoubleType, 100)
+    )
+
+    // Generate events and add random milliseconds to ts for unique timestamps
+    import org.apache.spark.sql.functions.{rand, col}
+    import org.apache.spark.sql.types.{LongType => SparkLongType}
+
+    val dfWithRandom = DataFrameGen.events(spark, schema, count = 10000, partitions = 20)
+      .withColumn("ts", col("ts") + (rand() * 86400000).cast(SparkLongType))  // Add 0-24h random millis
+      .cache()  // Mark for caching
+
+    // Force materialization - computes and caches the random values
+    dfWithRandom.count()
+
+    // Write the CACHED data to table - writes already-materialized values
+    dfWithRandom.write.mode("overwrite").saveAsTable(s"${namespace}.test_first_last_input")
+
+    // Read back from table - guaranteed same data as what was written
+    val df = spark.table(s"${namespace}.test_first_last_input")
+
+    val aggregations: Seq[Aggregation] = Seq(
+      Builders.Aggregation(Operation.FIRST, "value", Seq(new Window(7, TimeUnit.DAYS))),
+      Builders.Aggregation(Operation.LAST, "value", Seq(new Window(7, TimeUnit.DAYS))),
+      Builders.Aggregation(Operation.FIRST_K, "value", Seq(new Window(7, TimeUnit.DAYS)), argMap = Map("k" -> "3")),
+      Builders.Aggregation(Operation.LAST_K, "value", Seq(new Window(7, TimeUnit.DAYS)), argMap = Map("k" -> "3")),
+      Builders.Aggregation(Operation.TOP_K, "value", Seq(new Window(7, TimeUnit.DAYS)), argMap = Map("k" -> "3")),
+      Builders.Aggregation(Operation.BOTTOM_K, "value", Seq(new Window(7, TimeUnit.DAYS)), argMap = Map("k" -> "3"))
+    )
+
+    val tableProps: Map[String, String] = Map("source" -> "chronon")
+    val today_date = tableUtils.partitionSpec.at(System.currentTimeMillis())
+    val today_minus_7_date = tableUtils.partitionSpec.minus(today_date, new Window(7, TimeUnit.DAYS))
+    val today_minus_20_date = tableUtils.partitionSpec.minus(today_date, new Window(20, TimeUnit.DAYS))
+    val partitionRange = PartitionRange(today_minus_20_date, today_date)
+
+    val groupBy = new GroupBy(aggregations, Seq("user"), df)
+    groupBy.computeIncrementalDf(s"${namespace}.testIncrementalFirstLastOutput", partitionRange, tableProps)
+
+    val rawIncrementalDf = spark.sql(s"select * from ${namespace}.testIncrementalFirstLastOutput where ds='$today_minus_7_date'")
+
+    println("=== Incremental FIRST/LAST IR Schema ===")
+    rawIncrementalDf.printSchema()
+
+    // Sort array columns in raw IRs to match SQL output ordering
+    // Raw IRs store unsorted arrays for mergeability, but we need to sort them for comparison
+    import org.apache.spark.sql.functions.{sort_array, col}
+    val actualIncrementalDf = rawIncrementalDf
+      .withColumn("value_first3", sort_array(col("value_first3")))
+      .withColumn("value_last3", sort_array(col("value_last3")))
+      .withColumn("value_top3", sort_array(col("value_top3")))
+      .withColumn("value_bottom3", sort_array(col("value_bottom3")))
+
+    // Compare against SQL computation
+    // Note: ts column in IR table is the partition timestamp (derived from ds)
+    // But FIRST/LAST use the actual event timestamps (with random milliseconds)
+    val query =
+      s"""
+         |SELECT user,
+         |  to_date(from_unixtime(ts / 1000, 'yyyy-MM-dd HH:mm:ss')) as ds,
+         |  named_struct(
+         |    'epochMillis', min(ts),
+         |    'payload', sort_array(collect_list(struct(ts, value)))[0].value
+         |  ) as value_first,
+         |  named_struct(
+         |    'epochMillis', max(ts),
+         |    'payload', reverse(sort_array(collect_list(struct(ts, value))))[0].value
+         |  ) as value_last,
+         |  transform(
+         |    slice(sort_array(filter(collect_list(struct(ts, value)), x -> x.value IS NOT NULL)), 1, 3),
+         |    x -> named_struct('epochMillis', x.ts, 'payload', x.value)
+         |  ) as value_first3,
+         |  transform(
+         |    slice(sort_array(filter(collect_list(struct(ts, value)), x -> x.value IS NOT NULL)),
+         |          greatest(-size(sort_array(filter(collect_list(struct(ts, value)), x -> x.value IS NOT NULL))), -3), 3),
+         |    x -> named_struct('epochMillis', x.ts, 'payload', x.value)
+         |  ) as value_last3,
+         |  transform(
+         |    slice(sort_array(filter(collect_list(struct(value, ts)), x -> x.value IS NOT NULL), true),
+         |          greatest(-size(sort_array(filter(collect_list(struct(value, ts)), x -> x.value IS NOT NULL))), -3), 3),
+         |    x -> x.value
+         |  ) as value_top3,
+         |  transform(
+         |    slice(sort_array(filter(collect_list(struct(value, ts)), x -> x.value IS NOT NULL), true), 1, 3),
+         |    x -> x.value
+         |  ) as value_bottom3
+         |FROM ${namespace}.test_first_last_input
+         |WHERE to_date(from_unixtime(ts / 1000, 'yyyy-MM-dd HH:mm:ss'))='$today_minus_7_date'
+         |GROUP BY user, to_date(from_unixtime(ts / 1000, 'yyyy-MM-dd HH:mm:ss'))
+         |""".stripMargin
+
+    val expectedDf = spark.sql(query)
+
+    // Drop ts from comparison - it's just the partition timestamp, not part of the aggregation IR
+    val actualWithoutTs = actualIncrementalDf.drop("ts")
+
+    // Comparison.sideBySide handles sorting arrays and converting Row objects to clean JSON
+    val diff = Comparison.sideBySide(actualWithoutTs, expectedDf, List("user", tableUtils.partitionColumn))
+
+    if (diff.count() > 0) {
+      println(s"=== Diff Details for Time-based Aggregations ===")
+      println(s"Expected count: ${expectedDf.count()}")
+      println(s"Diff count: ${diff.count()}")
+      diff.show(100, truncate = false)
+    }
+
+    assertEquals(0, diff.count())
+
+    println("=== Time-based Aggregations Incremental Test Passed ===")
+    println("✓ FIRST: TimeTuple IR {epochMillis, payload}")
+    println("✓ LAST: TimeTuple IR {epochMillis, payload}")
+    println("✓ FIRST_K: Array[TimeTuple] - stores timestamps")
+    println("✓ LAST_K: Array[TimeTuple] - stores timestamps")
+    println("✓ TOP_K: Array[Double] - stores only values")
+    println("✓ BOTTOM_K: Array[Double] - stores only values")
+
+    // Cleanup
+    spark.stop()
+  }
+
+  @Test
+  def testIncrementalStatisticalAggregations(): Unit = {
+    lazy val spark: SparkSession = SparkSessionBuilder.build("GroupByTestStatistical" + "_" + Random.alphanumeric.take(6).mkString, local = true)
+    implicit val tableUtils = TableUtils(spark)
+    val namespace = s"incremental_stats_${Random.alphanumeric.take(6).mkString}"
+    tableUtils.createDatabase(namespace)
+
+    val schema = List(
+      Column("user", StringType, 5),
+      Column("value", DoubleType, 100),
+      Column("category", StringType, 10)  // For APPROX_UNIQUE_COUNT
+    )
+
+    // Generate sufficient data for statistical aggregations
+    val df = DataFrameGen.events(spark, schema, count = 10000, partitions = 20)
+    df.write.mode("overwrite").saveAsTable(s"${namespace}.test_stats_input")
+    val inputDf = spark.table(s"${namespace}.test_stats_input")
+
+    val aggregations: Seq[Aggregation] = Seq(
+      Builders.Aggregation(Operation.SKEW, "value", Seq(new Window(7, TimeUnit.DAYS))),
+      Builders.Aggregation(Operation.KURTOSIS, "value", Seq(new Window(7, TimeUnit.DAYS))),
+      Builders.Aggregation(Operation.APPROX_PERCENTILE, "value", Seq(new Window(7, TimeUnit.DAYS)),
+        argMap = Map("percentiles" -> "[0.5, 0.25, 0.75]")),
+      Builders.Aggregation(Operation.APPROX_UNIQUE_COUNT, "category", Seq(new Window(7, TimeUnit.DAYS))),
+      Builders.Aggregation(Operation.APPROX_HISTOGRAM_K, "category", Seq(new Window(7, TimeUnit.DAYS)),
+        argMap = Map("k" -> "10"))
+    )
+
+    val tableProps: Map[String, String] = Map("source" -> "chronon")
+    val today_date = tableUtils.partitionSpec.at(System.currentTimeMillis())
+    val today_minus_7_date = tableUtils.partitionSpec.minus(today_date, new Window(7, TimeUnit.DAYS))
+    val today_minus_20_date = tableUtils.partitionSpec.minus(today_date, new Window(20, TimeUnit.DAYS))
+    val partitionRange = PartitionRange(today_minus_20_date, today_date)
+
+    val groupBy = new GroupBy(aggregations, Seq("user"), inputDf)
+    groupBy.computeIncrementalDf(s"${namespace}.testIncrementalStatsOutput", partitionRange, tableProps)
+
+    val actualIncrementalDf = spark.sql(s"select * from ${namespace}.testIncrementalStatsOutput where ds='$today_minus_7_date'")
+
+    // Verify IR table has data
+    assertTrue(s"IR table should have rows", actualIncrementalDf.count() > 0)
+
+    // Verify APPROX_HISTOGRAM_K column exists and has binary data (sketch)
+    val histogramCol = actualIncrementalDf.schema.fields.find(_.name.contains("category_approx_histogram_k"))
+    assertTrue("APPROX_HISTOGRAM_K column should exist", histogramCol.isDefined)
+    assertTrue("APPROX_HISTOGRAM_K should be BinaryType (sketch)", histogramCol.get.dataType.isInstanceOf[BinaryType])
+
+    // Verify histogram sketch is non-null
+    val histogramData = spark.sql(
+      s"""
+         |SELECT category_approx_histogram_k
+         |FROM ${namespace}.testIncrementalStatsOutput
+         |WHERE ds='$today_minus_7_date' AND category_approx_histogram_k IS NOT NULL
+         |LIMIT 1
+         |""".stripMargin
+    ).collect()
+
+    assertTrue("APPROX_HISTOGRAM_K should produce non-null sketch data", histogramData.nonEmpty)
+
+    println("=== Statistical Aggregations Incremental Test Passed ===")
+    println("✓ SKEW: Statistical skewness")
+    println("✓ KURTOSIS: Statistical kurtosis")
+    println("✓ APPROX_PERCENTILE: Approximate percentiles")
+    println("✓ APPROX_UNIQUE_COUNT: Approximate distinct count")
+    println("✓ APPROX_HISTOGRAM_K: Approximate histogram with k buckets")
+
+    // Cleanup
+    spark.stop()
+  }
+}
diff --git a/spark/src/test/scala/ai/chronon/spark/test/GroupByTest.scala b/spark/src/test/scala/ai/chronon/spark/test/GroupByTest.scala
index 82b90f79ae..44a92f84c0 100644
--- a/spark/src/test/scala/ai/chronon/spark/test/GroupByTest.scala
+++ b/spark/src/test/scala/ai/chronon/spark/test/GroupByTest.scala
@@ -963,475 +963,6 @@ class GroupByTest {
   }
 
 
-  private def createTestSourceIncremental(windowSize: Int = 365,
-                               suffix: String = "",
-                               partitionColOpt: Option[String] = None): (Source, String) = {
-    lazy val spark: SparkSession =
-      SparkSessionBuilder.build("GroupByIncrementalTest" + "_" + Random.alphanumeric.take(6).mkString, local = true)
-    implicit val tableUtils = TableUtils(spark)
-    val today = tableUtils.partitionSpec.at(System.currentTimeMillis())
-    val startPartition = tableUtils.partitionSpec.minus(today, new Window(windowSize, TimeUnit.DAYS))
-    val endPartition = tableUtils.partitionSpec.at(System.currentTimeMillis())
-    val sourceSchema = List(
-      Column("user", StringType, 10000),
-      Column("item", StringType, 100),
-      Column("time_spent_ms", LongType, 5000),
-      Column("price", DoubleType, 100)
-    )
-    val namespace = "chronon_incremental_test"
-    val sourceTable = s"$namespace.test_group_by_steps$suffix"
-
-    tableUtils.createDatabase(namespace)
-    val genDf =
-      DataFrameGen.events(spark, sourceSchema, count = 1000, partitions = 200, partitionColOpt = partitionColOpt)
-    partitionColOpt match {
-      case Some(partitionCol) => genDf.save(sourceTable, partitionColumns = Seq(partitionCol))
-      case None               => genDf.save(sourceTable)
-    }
-
-    val source = Builders.Source.events(
-      query = Builders.Query(selects = Builders.Selects("ts", "user", "time_spent_ms", "price", "item"),
-        startPartition = startPartition,
-        partitionColumn = partitionColOpt.orNull),
-      table = sourceTable
-    )
-    (source, endPartition)
-  }
-
-  /**
-   * Tests basic aggregations in incremental mode by comparing Chronon's output against SQL.
-   *
-   * Operations: SUM, COUNT, AVERAGE, MIN, MAX, VARIANCE, UNIQUE_COUNT, HISTOGRAM, BOUNDED_UNIQUE_COUNT
-   *
-   * Actual:   Chronon computes daily IRs using computeIncrementalDf, storing intermediate results
-   * Expected: SQL query computes the same aggregations directly on the input data for the same date
-   */
-  @Test
-  def testIncrementalBasicAggregations(): Unit = {
-    lazy val spark: SparkSession = SparkSessionBuilder.build("GroupByTestIncrementalBasic" + "_" + Random.alphanumeric.take(6).mkString, local = true)
-    implicit val tableUtils = TableUtils(spark)
-    val namespace = s"incremental_basic_aggs_${Random.alphanumeric.take(6).mkString}"
-    tableUtils.createDatabase(namespace)
-
-    val schema = List(
-      Column("user", StringType, 10),
-      Column("price", DoubleType, 100),
-      Column("quantity", IntType, 50),
-      Column("product_id", StringType, 20),  // Low cardinality for UNIQUE_COUNT, HISTOGRAM, BOUNDED_UNIQUE_COUNT
-      Column("rating", DoubleType, 2000)
-    )
-
-    val df = DataFrameGen.events(spark, schema, count = 100000, partitions = 100)
-    
-    val aggregations: Seq[Aggregation] = Seq(
-      // Simple aggregations
-      Builders.Aggregation(Operation.SUM, "price", Seq(new Window(7, TimeUnit.DAYS))),
-      Builders.Aggregation(Operation.COUNT, "quantity", Seq(new Window(7, TimeUnit.DAYS))),
-
-      // Complex aggregation - AVERAGE (struct IR with sum/count)
-      Builders.Aggregation(Operation.AVERAGE, "rating", Seq(new Window(7, TimeUnit.DAYS))),
-
-      // Min/Max
-      Builders.Aggregation(Operation.MIN, "price", Seq(new Window(7, TimeUnit.DAYS))),
-      Builders.Aggregation(Operation.MAX, "quantity", Seq(new Window(7, TimeUnit.DAYS))),
-
-      // Variance (struct IR with count/mean/m2)
-      Builders.Aggregation(Operation.VARIANCE, "price", Seq(new Window(7, TimeUnit.DAYS))),
-
-      // UNIQUE_COUNT (array IR)
-      //Builders.Aggregation(Operation.UNIQUE_COUNT, "price", Seq(new Window(7, TimeUnit.DAYS))),
-
-      // HISTOGRAM (map IR)
-      Builders.Aggregation(Operation.HISTOGRAM, "product_id", Seq(new Window(7, TimeUnit.DAYS)), argMap = Map("k" -> "0")),
-
-      // BOUNDED_UNIQUE_COUNT (array IR with bound)
-      //Builders.Aggregation(Operation.BOUNDED_UNIQUE_COUNT, "product_id", Seq(new Window(7, TimeUnit.DAYS)), argMap = Map("k" -> "100"))
-    )
-
-    val tableProps: Map[String, String] = Map("source" -> "chronon")
-
-    val today_date = tableUtils.partitionSpec.at(System.currentTimeMillis())
-    val today_minus_7_date = tableUtils.partitionSpec.minus(today_date, new Window(7, TimeUnit.DAYS))
-    val today_minus_20_date = tableUtils.partitionSpec.minus(today_date, new Window(20, TimeUnit.DAYS))
-
-    val partitionRange = PartitionRange(today_minus_20_date, today_date)
-
-    val groupBy = new GroupBy(aggregations, Seq("user"), df)
-    groupBy.computeIncrementalDf(s"${namespace}.testIncrementalBasicAggsOutput", partitionRange, tableProps)
-
-    val actualIncrementalDf = spark.sql(s"select * from ${namespace}.testIncrementalBasicAggsOutput where ds='$today_minus_7_date'")
-    df.createOrReplaceTempView("test_basic_aggs_input")
-
-    println("=== Incremental IR Schema ===")
-    actualIncrementalDf.printSchema()
-
-    // ASSERTION 1: Verify IR table has expected key columns
-    val irColumns = actualIncrementalDf.schema.fieldNames.toSet
-    assertTrue("IR must contain 'user' column", irColumns.contains("user"))
-    assertTrue("IR must contain 'ds' column", irColumns.contains("ds"))
-    assertTrue("IR must contain 'ts' column", irColumns.contains("ts"))
-
-    // ASSERTION 2: Verify all aggregation columns exist
-    assertTrue("IR must contain SUM price", irColumns.exists(_.contains("price_sum")))
-    assertTrue("IR must contain COUNT quantity", irColumns.exists(_.contains("quantity_count")))
-    assertTrue("IR must contain AVERAGE rating", irColumns.exists(_.contains("rating_average")))
-    assertTrue("IR must contain MIN price", irColumns.exists(_.contains("price_min")))
-    assertTrue("IR must contain MAX quantity", irColumns.exists(_.contains("quantity_max")))
-    assertTrue("IR must contain VARIANCE price", irColumns.exists(_.contains("price_variance")))
-    //assertTrue("IR must contain UNIQUE COUNT price", irColumns.exists(_.contains("price_unique_count")))
-    assertTrue("IR must contain HISTOGRAM product_id", irColumns.exists(_.contains("product_id_histogram")))
-    //assertTrue("IR must contain BOUNDED_UNIQUE_COUNT product_id", irColumns.exists(_.contains("product_id_bounded_unique_count")))
-
-    // ASSERTION 4: Verify IR table has data
-    val irRowCount = actualIncrementalDf.count()
-    assertTrue(s"IR table should have rows, found ${irRowCount}", irRowCount > 0)
-
-
-    //    collect_set(price) as price_unique_count,
-    //    slice(collect_set(md5(product_id)), 1, 100) as product_id_bounded_unique_count
-
-    // ASSERTION 5: Compare against SQL computation
-    val query =
-      s"""
-         |WITH base_aggs AS (
-         |  SELECT user, ds, UNIX_TIMESTAMP(ds, 'yyyy-MM-dd')*1000 as ts,
-         |    sum(price) as price_sum,
-         |    count(quantity) as quantity_count,
-         |    struct(sum(rating) as sum, count(rating) as count) as rating_average,
-         |    min(price) as price_min,
-         |    max(quantity) as quantity_max,
-         |    struct(
-         |      cast(count(price) as int) as count,
-         |      avg(price) as mean,
-         |      sum(price * price) - count(price) * avg(price) * avg(price) as m2
-         |    ) as price_variance
-         |  FROM test_basic_aggs_input
-         |  WHERE ds='$today_minus_7_date'
-         |  GROUP BY user, ds
-         |),
-         |histogram_agg AS (
-         |  SELECT user, ds,
-         |    map_from_entries(collect_list(struct(product_id, cast(cnt as int)))) as product_id_histogram
-         |  FROM (
-         |    SELECT user, ds, product_id, count(*) as cnt
-         |    FROM test_basic_aggs_input
-         |    WHERE ds='$today_minus_7_date' AND product_id IS NOT NULL
-         |    GROUP BY user, ds, product_id
-         |  )
-         |  GROUP BY user, ds
-         |)
-         |SELECT b.*, h.product_id_histogram
-         |FROM base_aggs b
-         |LEFT JOIN histogram_agg h ON b.user <=> h.user AND b.ds <=> h.ds
-         |""".stripMargin
-
-    val expectedDf = spark.sql(query)
-
-    // Convert array columns to counts for comparison (since MD5 hashing differs between Scala and SQL)
-    import org.apache.spark.sql.functions.size
-    //val actualWithCounts = actualIncrementalDf
-    //  .withColumn("price_unique_count", size(col("price_unique_count")))
-    //  .withColumn("product_id_bounded_unique_count", size(col("product_id_bounded_unique_count")))
-
-    //val expectedWithCounts = expectedDf
-    //  .withColumn("price_unique_count", size(col("price_unique_count")))
-    //  .withColumn("product_id_bounded_unique_count", size(col("product_id_bounded_unique_count")))
-
-    //val diff = Comparison.sideBySide(actualWithCounts, expectedWithCounts, List("user", tableUtils.partitionColumn))
-    val diff = Comparison.sideBySide(actualIncrementalDf, expectedDf, List("user", tableUtils.partitionColumn))
-
-    if (diff.count() > 0) {
-      println(s"=== Diff Details for All Aggregations ===")
-      println(s"Actual count: ${irRowCount}")
-      println(s"Expected count: ${expectedDf.count()}")
-      println(s"Diff count: ${diff.count()}")
-      diff.show(100, truncate = false)
-    }
-
-    assertEquals(0, diff.count())
-  }
-
-  /**
-   * This test verifies that the incremental snapshotEvents output matches the non-incremental output.
-   *
-   * 1. Computes snapshotEvents using the standard GroupBy on the full input data.
-   * 2. Computes snapshotEvents using GroupBy in incremental mode over the same date range.
-   * 3. Compares the two outputs to ensure they are identical.
-   */
-  @Test
-  def testSnapshotIncrementalEvents(): Unit = {
-    lazy val spark: SparkSession = SparkSessionBuilder.build("GroupByTest" + "_" + Random.alphanumeric.take(6).mkString, local = true)
-    implicit val tableUtils = TableUtils(spark)
-    val namespace =  s"incremental_groupBy_snapshot_${Random.alphanumeric.take(6).mkString}"
-    tableUtils.createDatabase(namespace)
-
-
-    val outputDates = CStream.genPartitions(10, tableUtils.partitionSpec)
-
-    val aggregations: Seq[Aggregation] = Seq(
-      // Basic
-      Builders.Aggregation(Operation.SUM, "time_spent_ms", Seq(new Window(10, TimeUnit.DAYS), new Window(5, TimeUnit.DAYS))),
-      Builders.Aggregation(Operation.SUM, "price", Seq(new Window(10, TimeUnit.DAYS))),
-      Builders.Aggregation(Operation.COUNT, "user", Seq(new Window(10, TimeUnit.DAYS))),
-      Builders.Aggregation(Operation.AVERAGE, "price", Seq(new Window(10, TimeUnit.DAYS))),
-      Builders.Aggregation(Operation.MIN, "price", Seq(new Window(10, TimeUnit.DAYS))),
-      Builders.Aggregation(Operation.MAX, "price", Seq(new Window(10, TimeUnit.DAYS))),
-      // Statistical
-      Builders.Aggregation(Operation.VARIANCE, "price", Seq(new Window(10, TimeUnit.DAYS))),
-      Builders.Aggregation(Operation.SKEW, "price", Seq(new Window(10, TimeUnit.DAYS))),
-      Builders.Aggregation(Operation.KURTOSIS, "price", Seq(new Window(10, TimeUnit.DAYS))),
-      Builders.Aggregation(Operation.APPROX_PERCENTILE, "price", Seq(new Window(10, TimeUnit.DAYS)),
-        argMap = Map("percentiles" -> "[0.5, 0.25, 0.75]")),
-      // Temporal
-      Builders.Aggregation(Operation.FIRST, "price", Seq(new Window(10, TimeUnit.DAYS))),
-      Builders.Aggregation(Operation.LAST, "price", Seq(new Window(10, TimeUnit.DAYS))),
-      Builders.Aggregation(Operation.FIRST_K, "price", Seq(new Window(10, TimeUnit.DAYS)), argMap = Map("k" -> "3")),
-      Builders.Aggregation(Operation.LAST_K, "price", Seq(new Window(10, TimeUnit.DAYS)), argMap = Map("k" -> "3")),
-      Builders.Aggregation(Operation.TOP_K, "price", Seq(new Window(10, TimeUnit.DAYS)), argMap = Map("k" -> "3")),
-      Builders.Aggregation(Operation.BOTTOM_K, "price", Seq(new Window(10, TimeUnit.DAYS)), argMap = Map("k" -> "3")),
-      // Cardinality / Set
-      Builders.Aggregation(Operation.UNIQUE_COUNT, "user", Seq(new Window(10, TimeUnit.DAYS))),
-      Builders.Aggregation(Operation.APPROX_UNIQUE_COUNT, "user", Seq(new Window(10, TimeUnit.DAYS))),
-      Builders.Aggregation(Operation.BOUNDED_UNIQUE_COUNT, "user", Seq(new Window(10, TimeUnit.DAYS))),
-      // Distribution
-      Builders.Aggregation(Operation.HISTOGRAM, "user", Seq(new Window(10, TimeUnit.DAYS))),
-      Builders.Aggregation(Operation.APPROX_HISTOGRAM_K, "user", Seq(new Window(10, TimeUnit.DAYS)), argMap = Map("k" -> "10"))
-    )
-
-    val (source, endPartition) = createTestSourceIncremental(windowSize = 30, suffix = "_snapshot_events", partitionColOpt = Some(tableUtils.partitionColumn))
-    val groupByConf = Builders.GroupBy(
-      sources = Seq(source),
-      keyColumns = Seq("item"),
-      aggregations = aggregations,
-      metaData = Builders.MetaData(name = "testSnapshotIncremental", namespace = namespace, team = "chronon"),
-      backfillStartDate = tableUtils.partitionSpec.minus(tableUtils.partitionSpec.at(System.currentTimeMillis()),
-        new Window(20, TimeUnit.DAYS))
-    )
-
-    val df = spark.read.table(source.table)
-
-    val groupBy = new GroupBy(aggregations, Seq("item"), df.filter("item is not null"))
-    val actualDf = groupBy.snapshotEvents(PartitionRange(outputDates.min, outputDates.max))
-
-    val  groupByIncremental = GroupBy.fromIncrementalDf(groupByConf, PartitionRange(outputDates.min, outputDates.max), tableUtils)
-    val incrementalExpectedDf = groupByIncremental.snapshotEvents(PartitionRange(outputDates.min, outputDates.max))
-
-    val outputDatesRdd: RDD[Row] = spark.sparkContext.parallelize(outputDates.map(Row(_)))
-    val outputDatesDf = spark.createDataFrame(outputDatesRdd, StructType(Seq(StructField("ds", SparkStringType))))
-    val datesViewName = "test_group_by_snapshot_events_output_range"
-    outputDatesDf.createOrReplaceTempView(datesViewName)
-
-    val diff = Comparison.sideBySide(actualDf, incrementalExpectedDf, List("item", tableUtils.partitionColumn))
-    if (diff.count() > 0) {
-      diff.show()
-      println("=== Diff result rows ===")
-    }
-    assertEquals(0, diff.count())
-  }
-
-  /**
-   * Unit test for FIRST and LAST aggregations with incremental IR
-   * FIRST/LAST use TimeTuple IR: struct {epochMillis: Long, payload: Value}
-   * FIRST keeps the value with the earliest timestamp
-   * LAST keeps the value with the latest timestamp
-   */
-  @Test
-  def testIncrementalFirstLast(): Unit = {
-    lazy val spark: SparkSession = SparkSessionBuilder.build("GroupByTestFirstLast" + "_" + Random.alphanumeric.take(6).mkString, local = true)
-    implicit val tableUtils = TableUtils(spark)
-    val namespace = s"incremental_first_last_${Random.alphanumeric.take(6).mkString}"
-    tableUtils.createDatabase(namespace)
-
-    val schema = List(
-      Column("user", StringType, 5),
-      Column("value", DoubleType, 100)
-    )
-
-    // Generate events and add random milliseconds to ts for unique timestamps
-    import org.apache.spark.sql.functions.{rand, col}
-    import org.apache.spark.sql.types.{LongType => SparkLongType}
-
-    val dfWithRandom = DataFrameGen.events(spark, schema, count = 10000, partitions = 20)
-      .withColumn("ts", col("ts") + (rand() * 86400000).cast(SparkLongType))  // Add 0-24h random millis
-      .cache()  // Mark for caching
-
-    // Force materialization - computes and caches the random values
-    dfWithRandom.count()
-
-    // Write the CACHED data to table - writes already-materialized values
-    dfWithRandom.write.mode("overwrite").saveAsTable(s"${namespace}.test_first_last_input")
-
-    // Read back from table - guaranteed same data as what was written
-    val df = spark.table(s"${namespace}.test_first_last_input")
-
-    val aggregations: Seq[Aggregation] = Seq(
-      Builders.Aggregation(Operation.FIRST, "value", Seq(new Window(7, TimeUnit.DAYS))),
-      Builders.Aggregation(Operation.LAST, "value", Seq(new Window(7, TimeUnit.DAYS))),
-      Builders.Aggregation(Operation.FIRST_K, "value", Seq(new Window(7, TimeUnit.DAYS)), argMap = Map("k" -> "3")),
-      Builders.Aggregation(Operation.LAST_K, "value", Seq(new Window(7, TimeUnit.DAYS)), argMap = Map("k" -> "3")),
-      Builders.Aggregation(Operation.TOP_K, "value", Seq(new Window(7, TimeUnit.DAYS)), argMap = Map("k" -> "3")),
-      Builders.Aggregation(Operation.BOTTOM_K, "value", Seq(new Window(7, TimeUnit.DAYS)), argMap = Map("k" -> "3"))
-    )
-
-    val tableProps: Map[String, String] = Map("source" -> "chronon")
-    val today_date = tableUtils.partitionSpec.at(System.currentTimeMillis())
-    val today_minus_7_date = tableUtils.partitionSpec.minus(today_date, new Window(7, TimeUnit.DAYS))
-    val today_minus_20_date = tableUtils.partitionSpec.minus(today_date, new Window(20, TimeUnit.DAYS))
-    val partitionRange = PartitionRange(today_minus_20_date, today_date)
-
-    val groupBy = new GroupBy(aggregations, Seq("user"), df)
-    groupBy.computeIncrementalDf(s"${namespace}.testIncrementalFirstLastOutput", partitionRange, tableProps)
-
-    val rawIncrementalDf = spark.sql(s"select * from ${namespace}.testIncrementalFirstLastOutput where ds='$today_minus_7_date'")
-
-    println("=== Incremental FIRST/LAST IR Schema ===")
-    rawIncrementalDf.printSchema()
-
-    // Sort array columns in raw IRs to match SQL output ordering
-    // Raw IRs store unsorted arrays for mergeability, but we need to sort them for comparison
-    import org.apache.spark.sql.functions.{sort_array, col}
-    val actualIncrementalDf = rawIncrementalDf
-      .withColumn("value_first3", sort_array(col("value_first3")))
-      .withColumn("value_last3", sort_array(col("value_last3")))
-      .withColumn("value_top3", sort_array(col("value_top3")))
-      .withColumn("value_bottom3", sort_array(col("value_bottom3")))
-
-    // Compare against SQL computation
-    // Note: ts column in IR table is the partition timestamp (derived from ds)
-    // But FIRST/LAST use the actual event timestamps (with random milliseconds)
-    val query =
-      s"""
-         |SELECT user,
-         |  to_date(from_unixtime(ts / 1000, 'yyyy-MM-dd HH:mm:ss')) as ds,
-         |  named_struct(
-         |    'epochMillis', min(ts),
-         |    'payload', sort_array(collect_list(struct(ts, value)))[0].value
-         |  ) as value_first,
-         |  named_struct(
-         |    'epochMillis', max(ts),
-         |    'payload', reverse(sort_array(collect_list(struct(ts, value))))[0].value
-         |  ) as value_last,
-         |  transform(
-         |    slice(sort_array(filter(collect_list(struct(ts, value)), x -> x.value IS NOT NULL)), 1, 3),
-         |    x -> named_struct('epochMillis', x.ts, 'payload', x.value)
-         |  ) as value_first3,
-         |  transform(
-         |    slice(sort_array(filter(collect_list(struct(ts, value)), x -> x.value IS NOT NULL)),
-         |          greatest(-size(sort_array(filter(collect_list(struct(ts, value)), x -> x.value IS NOT NULL))), -3), 3),
-         |    x -> named_struct('epochMillis', x.ts, 'payload', x.value)
-         |  ) as value_last3,
-         |  transform(
-         |    slice(sort_array(filter(collect_list(struct(value, ts)), x -> x.value IS NOT NULL), true),
-         |          greatest(-size(sort_array(filter(collect_list(struct(value, ts)), x -> x.value IS NOT NULL))), -3), 3),
-         |    x -> x.value
-         |  ) as value_top3,
-         |  transform(
-         |    slice(sort_array(filter(collect_list(struct(value, ts)), x -> x.value IS NOT NULL), true), 1, 3),
-         |    x -> x.value
-         |  ) as value_bottom3
-         |FROM ${namespace}.test_first_last_input
-         |WHERE to_date(from_unixtime(ts / 1000, 'yyyy-MM-dd HH:mm:ss'))='$today_minus_7_date'
-         |GROUP BY user, to_date(from_unixtime(ts / 1000, 'yyyy-MM-dd HH:mm:ss'))
-         |""".stripMargin
-
-    val expectedDf = spark.sql(query)
-
-    // Drop ts from comparison - it's just the partition timestamp, not part of the aggregation IR
-    val actualWithoutTs = actualIncrementalDf.drop("ts")
-
-    // Comparison.sideBySide handles sorting arrays and converting Row objects to clean JSON
-    val diff = Comparison.sideBySide(actualWithoutTs, expectedDf, List("user", tableUtils.partitionColumn))
-
-    if (diff.count() > 0) {
-      println(s"=== Diff Details for Time-based Aggregations ===")
-      println(s"Expected count: ${expectedDf.count()}")
-      println(s"Diff count: ${diff.count()}")
-      diff.show(100, truncate = false)
-    }
-
-    assertEquals(0, diff.count())
-
-    println("=== Time-based Aggregations Incremental Test Passed ===")
-    println("✓ FIRST: TimeTuple IR {epochMillis, payload}")
-    println("✓ LAST: TimeTuple IR {epochMillis, payload}")
-    println("✓ FIRST_K: Array[TimeTuple] - stores timestamps")
-    println("✓ LAST_K: Array[TimeTuple] - stores timestamps")
-    println("✓ TOP_K: Array[Double] - stores only values")
-    println("✓ BOTTOM_K: Array[Double] - stores only values")
-
-    // Cleanup
-    spark.stop()
-  }
-
-  @Test
-  def testIncrementalStatisticalAggregations(): Unit = {
-    lazy val spark: SparkSession = SparkSessionBuilder.build("GroupByTestStatistical" + "_" + Random.alphanumeric.take(6).mkString, local = true)
-    implicit val tableUtils = TableUtils(spark)
-    val namespace = s"incremental_stats_${Random.alphanumeric.take(6).mkString}"
-    tableUtils.createDatabase(namespace)
-
-    val schema = List(
-      Column("user", StringType, 5),
-      Column("value", DoubleType, 100),
-      Column("category", StringType, 10)  // For APPROX_UNIQUE_COUNT
-    )
-
-    // Generate sufficient data for statistical aggregations
-    val df = DataFrameGen.events(spark, schema, count = 10000, partitions = 20)
-    df.write.mode("overwrite").saveAsTable(s"${namespace}.test_stats_input")
-    val inputDf = spark.table(s"${namespace}.test_stats_input")
-
-    val aggregations: Seq[Aggregation] = Seq(
-      Builders.Aggregation(Operation.SKEW, "value", Seq(new Window(7, TimeUnit.DAYS))),
-      Builders.Aggregation(Operation.KURTOSIS, "value", Seq(new Window(7, TimeUnit.DAYS))),
-      Builders.Aggregation(Operation.APPROX_PERCENTILE, "value", Seq(new Window(7, TimeUnit.DAYS)),
-        argMap = Map("percentiles" -> "[0.5, 0.25, 0.75]")),
-      Builders.Aggregation(Operation.APPROX_UNIQUE_COUNT, "category", Seq(new Window(7, TimeUnit.DAYS))),
-      Builders.Aggregation(Operation.APPROX_HISTOGRAM_K, "category", Seq(new Window(7, TimeUnit.DAYS)),
-        argMap = Map("k" -> "10"))
-    )
-
-    val tableProps: Map[String, String] = Map("source" -> "chronon")
-    val today_date = tableUtils.partitionSpec.at(System.currentTimeMillis())
-    val today_minus_7_date = tableUtils.partitionSpec.minus(today_date, new Window(7, TimeUnit.DAYS))
-    val today_minus_20_date = tableUtils.partitionSpec.minus(today_date, new Window(20, TimeUnit.DAYS))
-    val partitionRange = PartitionRange(today_minus_20_date, today_date)
-
-    val groupBy = new GroupBy(aggregations, Seq("user"), inputDf)
-    groupBy.computeIncrementalDf(s"${namespace}.testIncrementalStatsOutput", partitionRange, tableProps)
-
-    val actualIncrementalDf = spark.sql(s"select * from ${namespace}.testIncrementalStatsOutput where ds='$today_minus_7_date'")
-
-    // Verify IR table has data
-    assertTrue(s"IR table should have rows", actualIncrementalDf.count() > 0)
-
-    // Verify APPROX_HISTOGRAM_K column exists and has binary data (sketch)
-    val histogramCol = actualIncrementalDf.schema.fields.find(_.name.contains("category_approx_histogram_k"))
-    assertTrue("APPROX_HISTOGRAM_K column should exist", histogramCol.isDefined)
-    assertTrue("APPROX_HISTOGRAM_K should be BinaryType (sketch)", histogramCol.get.dataType.isInstanceOf[BinaryType])
-
-    // Verify histogram sketch is non-null
-    val histogramData = spark.sql(
-      s"""
-         |SELECT category_approx_histogram_k
-         |FROM ${namespace}.testIncrementalStatsOutput
-         |WHERE ds='$today_minus_7_date' AND category_approx_histogram_k IS NOT NULL
-         |LIMIT 1
-         |""".stripMargin
-    ).collect()
-
-    assertTrue("APPROX_HISTOGRAM_K should produce non-null sketch data", histogramData.nonEmpty)
-
-    println("=== Statistical Aggregations Incremental Test Passed ===")
-    println("✓ SKEW: Statistical skewness")
-    println("✓ KURTOSIS: Statistical kurtosis")
-    println("✓ APPROX_PERCENTILE: Approximate percentiles")
-    println("✓ APPROX_UNIQUE_COUNT: Approximate distinct count")
-    println("✓ APPROX_HISTOGRAM_K: Approximate histogram with k buckets")
-
-    // Cleanup
-    spark.stop()
-  }
 
 
   /**

From 40aa755b72ae0921ec8052e469193fed6df48ebf Mon Sep 17 00:00:00 2001
From: Pengyu Hou <3771747+pengyu-hou@users.noreply.github.com>
Date: Tue, 10 Feb 2026 22:46:40 -0800
Subject: [PATCH 46/54] fix failed ci

---
 api/py/ai/chronon/group_by.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/api/py/ai/chronon/group_by.py b/api/py/ai/chronon/group_by.py
index 373ca09cfe..87b1c30984 100644
--- a/api/py/ai/chronon/group_by.py
+++ b/api/py/ai/chronon/group_by.py
@@ -389,7 +389,7 @@ def GroupBy(
     derivations: Optional[List[ttypes.Derivation]] = None,
     deprecation_date: Optional[str] = None,
     description: Optional[str] = None,
-    is_incremental: Optional[bool] = False,
+    is_incremental: Optional[bool] = None,
     **kwargs,
 ) -> ttypes.GroupBy:
     """

From b847a4089125eea7a8194518d5386e31fe99ba86 Mon Sep 17 00:00:00 2001
From: chaitu <kambstreat@users.noreply.github.com>
Date: Thu, 12 Feb 2026 22:18:04 -0800
Subject: [PATCH 47/54] remove obsolete comment

---
 .../src/main/scala/ai/chronon/aggregator/row/RowAggregator.scala | 1 -
 1 file changed, 1 deletion(-)

diff --git a/aggregator/src/main/scala/ai/chronon/aggregator/row/RowAggregator.scala b/aggregator/src/main/scala/ai/chronon/aggregator/row/RowAggregator.scala
index aac7ed859f..1510516199 100644
--- a/aggregator/src/main/scala/ai/chronon/aggregator/row/RowAggregator.scala
+++ b/aggregator/src/main/scala/ai/chronon/aggregator/row/RowAggregator.scala
@@ -24,7 +24,6 @@ import scala.collection.Seq
 
 // The primary API of the aggregator package.
 // the semantics are to mutate values in place for performance reasons
-// userAggregationParts is used when incrementalMode = True.
 class RowAggregator(val inputSchema: Seq[(String, DataType)], val aggregationParts: Seq[AggregationPart])
     extends Serializable
     with SimpleAggregator[Row, Array[Any], Array[Any]] {

From b385cda60bcf375de38f4a590e2faab8f04ec3b8 Mon Sep 17 00:00:00 2001
From: chaitu <kambstreat@users.noreply.github.com>
Date: Thu, 12 Feb 2026 22:20:38 -0800
Subject: [PATCH 48/54] revert to master files tor spark_submit.sh and
 teams.json

---
 api/py/test/sample/scripts/spark_submit.sh | 9 +--------
 api/py/test/sample/teams.json              | 2 +-
 2 files changed, 2 insertions(+), 9 deletions(-)

diff --git a/api/py/test/sample/scripts/spark_submit.sh b/api/py/test/sample/scripts/spark_submit.sh
index ef048a5532..45102e8843 100644
--- a/api/py/test/sample/scripts/spark_submit.sh
+++ b/api/py/test/sample/scripts/spark_submit.sh
@@ -28,14 +28,13 @@
 
 set -euxo pipefail
 CHRONON_WORKING_DIR=${CHRONON_TMPDIR:-/tmp}/${USER}
-echo $CHRONON_WORKING_DIR
 mkdir -p ${CHRONON_WORKING_DIR}
 export TEST_NAME="${APP_NAME}_${USER}_test"
 unset PYSPARK_DRIVER_PYTHON
 unset PYSPARK_PYTHON
 unset SPARK_HOME
 unset SPARK_CONF_DIR
-export LOG4J_FILE="${CHRONON_WORKING_DIR}/log4j.properties"
+export LOG4J_FILE="${CHRONON_WORKING_DIR}/log4j_file"
 cat > ${LOG4J_FILE} << EOF
 log4j.rootLogger=INFO, stdout
 log4j.appender.stdout=org.apache.log4j.ConsoleAppender
@@ -48,9 +47,6 @@ EOF
 $SPARK_SUBMIT_PATH \
 --driver-java-options " -Dlog4j.configuration=file:${LOG4J_FILE}" \
 --conf "spark.executor.extraJavaOptions= -XX:ParallelGCThreads=4 -XX:+UseParallelGC -XX:+UseCompressedOops" \
---conf "spark.driver.extraJavaOptions=-agentlib:jdwp=transport=dt_socket,server=y,suspend=y,address=5005 -Dlog4j.configuration=file:${LOG4J_FILE}" \
---conf "spark.sql.warehouse.dir=/home/chaitu/projects/chronon/spark-warehouse" \
---conf "javax.jdo.option.ConnectionURL=jdbc:derby:;databaseName=/home/chaitu/projects/chronon/hive-metastore/metastore_db;create=true" \
 --conf spark.sql.shuffle.partitions=${PARALLELISM:-4000} \
 --conf spark.dynamicAllocation.maxExecutors=${MAX_EXECUTORS:-1000} \
 --conf spark.default.parallelism=${PARALLELISM:-4000} \
@@ -81,6 +77,3 @@ tee ${CHRONON_WORKING_DIR}/${APP_NAME}_spark.log
 
 
 
-
-#--conf "spark.driver.extraJavaOptions=-agentlib:jdwp=transport=dt_socket,server=y,suspend=y,address=5005 -Dlog4j.rootLogger=INFO,console" \
-
diff --git a/api/py/test/sample/teams.json b/api/py/test/sample/teams.json
index a60502b65d..39f7a25559 100644
--- a/api/py/test/sample/teams.json
+++ b/api/py/test/sample/teams.json
@@ -5,7 +5,7 @@
         },
         "common_env": {
             "VERSION": "latest",
-            "SPARK_SUBMIT_PATH": "spark-submit",
+            "SPARK_SUBMIT_PATH": "[TODO]/path/to/spark-submit",
             "JOB_MODE": "local[*]",
             "HADOOP_DIR": "[STREAMING-TODO]/path/to/folder/containing",
             "CHRONON_ONLINE_CLASS": "[ONLINE-TODO]your.online.class",

From 464395b959a1ec35de17c0e80db52c8b64940379 Mon Sep 17 00:00:00 2001
From: chaitu <kambstreat@users.noreply.github.com>
Date: Thu, 12 Feb 2026 22:23:52 -0800
Subject: [PATCH 49/54] remove log statements during debug

---
 spark/src/main/scala/ai/chronon/spark/catalog/TableUtils.scala | 3 ---
 1 file changed, 3 deletions(-)

diff --git a/spark/src/main/scala/ai/chronon/spark/catalog/TableUtils.scala b/spark/src/main/scala/ai/chronon/spark/catalog/TableUtils.scala
index fca8d723be..94106a33a0 100644
--- a/spark/src/main/scala/ai/chronon/spark/catalog/TableUtils.scala
+++ b/spark/src/main/scala/ai/chronon/spark/catalog/TableUtils.scala
@@ -880,7 +880,6 @@ case class TableUtils(sparkSession: SparkSession) {
       outputPartitionRange
     }
     val outputExisting = partitions(outputTable)
-    logger.info(s"outputExisting : ${outputExisting}")
     // To avoid recomputing partitions removed by retention mechanisms we will not fill holes in the very beginning of the range
     // If a user fills a new partition in the newer end of the range, then we will never fill any partitions before that range.
     // We instead log a message saying why we won't fill the earliest hole.
@@ -889,8 +888,6 @@ case class TableUtils(sparkSession: SparkSession) {
     } else {
       validPartitionRange.start
     }
-
-    logger.info(s"Cutoff partition for skipping holes is set to $cutoffPartition")
     val fillablePartitions =
       if (skipFirstHole) {
         validPartitionRange.partitions.toSet.filter(_ >= cutoffPartition)

From 1002744c4c7b2a7629e1b49aed3b21f5cdfa83be Mon Sep 17 00:00:00 2001
From: chaitanya <1847554+kambstreat@users.noreply.github.com>
Date: Fri, 13 Feb 2026 09:31:34 -0800
Subject: [PATCH 50/54] Add daily_inc suffix to incremental table

Co-authored-by: Pengyu Hou <3771747+pengyu-hou@users.noreply.github.com>
Signed-off-by: chaitanya <1847554+kambstreat@users.noreply.github.com>
---
 api/src/main/scala/ai/chronon/api/Extensions.scala | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/api/src/main/scala/ai/chronon/api/Extensions.scala b/api/src/main/scala/ai/chronon/api/Extensions.scala
index 25df552d41..44f653d3ca 100644
--- a/api/src/main/scala/ai/chronon/api/Extensions.scala
+++ b/api/src/main/scala/ai/chronon/api/Extensions.scala
@@ -97,7 +97,7 @@ object Extensions {
     def cleanName: String = metaData.name.sanitize
 
     def outputTable = s"${metaData.outputNamespace}.${metaData.cleanName}"
-    def incrementalOutputTable = s"${metaData.outputNamespace}.${metaData.cleanName}_inc"
+    def incrementalOutputTable = s"${metaData.outputNamespace}.${metaData.cleanName}_daily_inc"
     def preModelTransformsTable = s"${metaData.outputNamespace}.${metaData.cleanName}_pre_mt"
     def outputLabelTable = s"${metaData.outputNamespace}.${metaData.cleanName}_labels"
     def outputFinalView = s"${metaData.outputNamespace}.${metaData.cleanName}_labeled"

From cd141aba0e5f8bde6f85a24864de3d10c32479a8 Mon Sep 17 00:00:00 2001
From: chaitu <kambstreat@users.noreply.github.com>
Date: Fri, 13 Feb 2026 15:43:09 -0800
Subject: [PATCH 51/54] fix bug in flatten function

---
 .../src/main/scala/ai/chronon/spark/GroupBy.scala  | 14 ++++++++------
 1 file changed, 8 insertions(+), 6 deletions(-)

diff --git a/spark/src/main/scala/ai/chronon/spark/GroupBy.scala b/spark/src/main/scala/ai/chronon/spark/GroupBy.scala
index c364a1da98..97199f45f6 100644
--- a/spark/src/main/scala/ai/chronon/spark/GroupBy.scala
+++ b/spark/src/main/scala/ai/chronon/spark/GroupBy.scala
@@ -373,12 +373,14 @@ class GroupBy(val aggregations: Seq[api.Aggregation],
       hopsArrays: RDD[(KeyWithHash, HopsAggregator.OutputArrayType)]): RDD[(Array[Any], Array[Any])] = {
     hopsArrays.flatMap {
       case (keyWithHash: KeyWithHash, hopsArray: HopsAggregator.OutputArrayType) =>
-        val hopsArrayHead: Array[HopIr] = hopsArray.headOption.get
-        hopsArrayHead.map { array: HopIr =>
-          val timestamp = array.last.asInstanceOf[Long]
-          val withoutTimestamp = array.dropRight(1)
-          val normalizedIR = flattenedAgg.normalize(withoutTimestamp)
-          ((keyWithHash.data :+ tableUtils.partitionSpec.at(timestamp) :+ timestamp), normalizedIR)
+        hopsArray.headOption match {
+          case Some(dailyHops) =>
+            dailyHops.map { hopIr =>
+              val timestamp = hopIr.last.asInstanceOf[Long]
+              val normalizedIR = flattenedAgg.normalize(hopIr.dropRight(1))
+              ((keyWithHash.data :+ tableUtils.partitionSpec.at(timestamp) :+ timestamp), normalizedIR)
+            }
+          case None => Iterator.empty
         }
     }
   }

From a9f3c787768b2d9aad34f678b87f0e61723ac76a Mon Sep 17 00:00:00 2001
From: chaitu <kambstreat@users.noreply.github.com>
Date: Mon, 16 Feb 2026 17:50:58 -0800
Subject: [PATCH 52/54] revert changes in GroupByTest

---
 .../test/scala/ai/chronon/spark/test/GroupByTest.scala    | 8 +-------
 1 file changed, 1 insertion(+), 7 deletions(-)

diff --git a/spark/src/test/scala/ai/chronon/spark/test/GroupByTest.scala b/spark/src/test/scala/ai/chronon/spark/test/GroupByTest.scala
index 44a92f84c0..c11ae88fa4 100644
--- a/spark/src/test/scala/ai/chronon/spark/test/GroupByTest.scala
+++ b/spark/src/test/scala/ai/chronon/spark/test/GroupByTest.scala
@@ -39,9 +39,8 @@ import ai.chronon.spark._
 import ai.chronon.spark.catalog.TableUtils
 import com.google.gson.Gson
 import org.apache.spark.rdd.RDD
-import org.apache.spark.sql.types.{ArrayType, BinaryType, MapType, StructField, StructType, LongType => SparkLongType, StringType => SparkStringType}
+import org.apache.spark.sql.types.{StructField, StructType, LongType => SparkLongType, StringType => SparkStringType}
 import org.apache.spark.sql.{Encoders, Row, SparkSession}
-import org.apache.spark.sql.functions.col
 import org.junit.Assert._
 import org.junit.Test
 
@@ -117,7 +116,6 @@ class GroupByTest {
 
     val groupBy = new GroupBy(aggregations, Seq("user"), df)
     val actualDf = groupBy.snapshotEvents(PartitionRange(outputDates.min, outputDates.max))
-
     val outputDatesRdd: RDD[Row] = spark.sparkContext.parallelize(outputDates.map(Row(_)))
     val outputDatesDf = spark.createDataFrame(outputDatesRdd, StructType(Seq(StructField("ds", SparkStringType))))
     val datesViewName = "test_group_by_snapshot_events_output_range"
@@ -453,7 +451,6 @@ class GroupByTest {
              additionalAgg = aggs)
   }
 
-
   private def createTestSource(windowSize: Int = 365,
                                suffix: String = "",
                                partitionColOpt: Option[String] = None): (Source, String) = {
@@ -962,9 +959,6 @@ class GroupByTest {
     assertTrue("Should be able to filter GroupBy results", filteredResult.count() >= 0)
   }
 
-
-
-
   /**
    * Test that GroupBy derivations without wildcards preserve infrastructure columns (keys, partition, time).
    * This validates the fix for the bug where derivations would drop necessary columns.

From 91d2768db8c18345cb8a5e62971e206f47d94598 Mon Sep 17 00:00:00 2001
From: chaitu <kambstreat@users.noreply.github.com>
Date: Sat, 28 Feb 2026 22:12:07 -0800
Subject: [PATCH 53/54] remove comment

---
 .../scala/ai/chronon/spark/test/GroupByIncrementalTest.scala     | 1 -
 1 file changed, 1 deletion(-)

diff --git a/spark/src/test/scala/ai/chronon/spark/test/GroupByIncrementalTest.scala b/spark/src/test/scala/ai/chronon/spark/test/GroupByIncrementalTest.scala
index 26a5071418..0c983f24ce 100644
--- a/spark/src/test/scala/ai/chronon/spark/test/GroupByIncrementalTest.scala
+++ b/spark/src/test/scala/ai/chronon/spark/test/GroupByIncrementalTest.scala
@@ -420,7 +420,6 @@ class GroupByIncrementalTest {
     // Drop ts from comparison - it's just the partition timestamp, not part of the aggregation IR
     val actualWithoutTs = actualIncrementalDf.drop("ts")
 
-    // Comparison.sideBySide handles sorting arrays and converting Row objects to clean JSON
     val diff = Comparison.sideBySide(actualWithoutTs, expectedDf, List("user", tableUtils.partitionColumn))
 
     if (diff.count() > 0) {

From 0dc387988649002ab0965298dd411b9bc6a11da2 Mon Sep 17 00:00:00 2001
From: Pengyu Hou <3771747+pengyu-hou@users.noreply.github.com>
Date: Fri, 3 Apr 2026 10:33:25 -0700
Subject: [PATCH 54/54] Strengthen incremental backfill test coverage and fix
 bugs in GroupBy.scala
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

  GroupBy.scala — bug fixes

  - Fix aggregationParts returning empty when no holes exist: Previously derived from the hole-filling loop, so when the incremental table was already up-to-date, convertIncrementalDfToHops received an empty list
  and produced wrong hops. Now derived directly from groupByConf unconditionally.
  - Fix maxWindow.get NPE: Replaced with getOrElse(throw ...) that gives a clear error message when incremental mode is used on a GroupBy with no windowed aggregations.
  - Remove unused imports: Operation and com.google.common.collect.Table.
  - Fix typo: "aggregatiosn" → "aggregations" in scaladoc.

  GroupByIncrementalTest.scala — test improvements

  - Enable UNIQUE_COUNT and BOUNDED_UNIQUE_COUNT: Uncommented both operations in testIncrementalBasicAggregations. Comparison converts array IRs to their sizes (since element ordering/MD5 hashing differs between
  Chronon and SQL) and validates against count(distinct ...) from SQL.
  - Rewrite testIncrementalStatisticalAggregations: Replaced schema-only checks with a full incremental-vs-non-incremental snapshotEvents comparison for SKEW, KURTOSIS, APPROX_PERCENTILE, APPROX_UNIQUE_COUNT, and
  APPROX_HISTOGRAM_K.
  - Remove redundant column-existence assertions: Dropped assertTrue("IR must contain X") checks that were superseded by the value comparison.
  - Remove unused BinaryType import.
---
 .../main/scala/ai/chronon/spark/GroupBy.scala |  33 ++--
 .../spark/test/GroupByIncrementalTest.scala   | 142 ++++++------------
 2 files changed, 65 insertions(+), 110 deletions(-)

diff --git a/spark/src/main/scala/ai/chronon/spark/GroupBy.scala b/spark/src/main/scala/ai/chronon/spark/GroupBy.scala
index a36f3339e3..139cc000ba 100644
--- a/spark/src/main/scala/ai/chronon/spark/GroupBy.scala
+++ b/spark/src/main/scala/ai/chronon/spark/GroupBy.scala
@@ -23,7 +23,7 @@ import ai.chronon.aggregator.windowing._
 import ai.chronon.api
 import ai.chronon.api.DataModel.{Entities, Events}
 import ai.chronon.api.Extensions._
-import ai.chronon.api.{Accuracy, Constants, DataModel, Operation, ParametricMacro, TimeUnit, Window}
+import ai.chronon.api.{Accuracy, Constants, DataModel, ParametricMacro, TimeUnit, Window}
 import ai.chronon.online.serde.{RowWrapper, SparkConversions}
 import ai.chronon.spark.Extensions._
 import ai.chronon.spark.catalog.TableUtils
@@ -39,7 +39,6 @@ import scala.jdk.CollectionConverters._
 import java.util
 import scala.collection.{Seq, mutable}
 import scala.util.ScalaJavaConversions.{JListOps, ListOps, MapOps}
-import _root_.com.google.common.collect.Table
 
 class GroupBy(val aggregations: Seq[api.Aggregation],
               val keyColumns: Seq[String],
@@ -432,7 +431,7 @@ class GroupBy(val aggregations: Seq[api.Aggregation],
   /**
     * computes incremental daily table
     * @param incrementalOutputTable output of the incremental data stored here
-    * @param range date range to calculate daily aggregatiosn
+    * @param range date range to calculate daily aggregations
     * @param tableProps
     */
   def computeIncrementalDf(incrementalOutputTable: String, range: PartitionRange, tableProps: Map[String, String]) = {
@@ -791,8 +790,13 @@ object GroupBy {
       .map(_.toScala)
       .orNull
 
+    val maxWindow = groupByConf.maxWindow.getOrElse(
+      throw new IllegalArgumentException(
+        s"GroupBy ${groupByConf.metaData.name} has no windowed aggregations. " +
+          "Incremental mode requires at least one windowed aggregation."))
+
     val incrementalQueryableRange = PartitionRange(
-      tableUtils.partitionSpec.minus(range.start, groupByConf.maxWindow.get),
+      tableUtils.partitionSpec.minus(range.start, maxWindow),
       range.end
     )(tableUtils)
 
@@ -804,21 +808,18 @@ object GroupBy {
       skipFirstHole = false
     )
 
-    val incrementalGroupByAggParts = partitionRangeHoles
-      .map { holes =>
-        val incrementalAggregationParts = holes.map { hole =>
-          logger.info(s"Filling hole in incremental table: $hole")
-          val incrementalGroupByBackfill =
-            from(groupByConf, hole, tableUtils, computeDependency = true, incrementalMode = true)
-          incrementalGroupByBackfill.computeIncrementalDf(incrementalOutputTable, hole, tableProps)
-          incrementalGroupByBackfill.flattenedAgg.aggregationParts
-        }
+    val aggregationParts = groupByConf.getAggregations.toScala.flatMap(_.unWindowed)
 
-        incrementalAggregationParts.headOption.getOrElse(Seq.empty)
+    partitionRangeHoles.foreach { holes =>
+      holes.foreach { hole =>
+        logger.info(s"Filling hole in incremental table: $hole")
+        val incrementalGroupByBackfill =
+          from(groupByConf, hole, tableUtils, computeDependency = true, incrementalMode = true)
+        incrementalGroupByBackfill.computeIncrementalDf(incrementalOutputTable, hole, tableProps)
       }
-      .getOrElse(Seq.empty)
+    }
 
-    (incrementalQueryableRange, incrementalGroupByAggParts)
+    (incrementalQueryableRange, aggregationParts)
   }
 
   private def convertIncrementalDfToHops(
diff --git a/spark/src/test/scala/ai/chronon/spark/test/GroupByIncrementalTest.scala b/spark/src/test/scala/ai/chronon/spark/test/GroupByIncrementalTest.scala
index 0c983f24ce..28ccde4a1f 100644
--- a/spark/src/test/scala/ai/chronon/spark/test/GroupByIncrementalTest.scala
+++ b/spark/src/test/scala/ai/chronon/spark/test/GroupByIncrementalTest.scala
@@ -34,7 +34,7 @@ import ai.chronon.spark.Extensions._
 import ai.chronon.spark._
 import ai.chronon.spark.catalog.TableUtils
 import org.apache.spark.rdd.RDD
-import org.apache.spark.sql.types.{BinaryType, StructField, StructType, LongType => SparkLongType, StringType => SparkStringType}
+import org.apache.spark.sql.types.{StructField, StructType, LongType => SparkLongType, StringType => SparkStringType}
 import org.apache.spark.sql.{Row, SparkSession}
 import org.apache.spark.sql.functions.col
 import org.junit.Assert._
@@ -119,14 +119,14 @@ class GroupByIncrementalTest {
       // Variance (struct IR with count/mean/m2)
       Builders.Aggregation(Operation.VARIANCE, "price", Seq(new Window(7, TimeUnit.DAYS))),
 
-      // UNIQUE_COUNT (array IR)
-      //Builders.Aggregation(Operation.UNIQUE_COUNT, "price", Seq(new Window(7, TimeUnit.DAYS))),
+      // UNIQUE_COUNT (array IR): IR = array<double> of distinct values
+      Builders.Aggregation(Operation.UNIQUE_COUNT, "price", Seq(new Window(7, TimeUnit.DAYS))),
 
       // HISTOGRAM (map IR)
       Builders.Aggregation(Operation.HISTOGRAM, "product_id", Seq(new Window(7, TimeUnit.DAYS)), argMap = Map("k" -> "0")),
 
-      // BOUNDED_UNIQUE_COUNT (array IR with bound)
-      //Builders.Aggregation(Operation.BOUNDED_UNIQUE_COUNT, "product_id", Seq(new Window(7, TimeUnit.DAYS)), argMap = Map("k" -> "100"))
+      // BOUNDED_UNIQUE_COUNT (array IR with bound): IR = array<string> of bounded distinct values (MD5-hashed)
+      Builders.Aggregation(Operation.BOUNDED_UNIQUE_COUNT, "product_id", Seq(new Window(7, TimeUnit.DAYS)), argMap = Map("k" -> "100"))
     )
 
     val tableProps: Map[String, String] = Map("source" -> "chronon")
@@ -143,35 +143,7 @@ class GroupByIncrementalTest {
     val actualIncrementalDf = spark.sql(s"select * from ${namespace}.testIncrementalBasicAggsOutput where ds='$today_minus_7_date'")
     df.createOrReplaceTempView("test_basic_aggs_input")
 
-    println("=== Incremental IR Schema ===")
-    actualIncrementalDf.printSchema()
-
-    // ASSERTION 1: Verify IR table has expected key columns
-    val irColumns = actualIncrementalDf.schema.fieldNames.toSet
-    assertTrue("IR must contain 'user' column", irColumns.contains("user"))
-    assertTrue("IR must contain 'ds' column", irColumns.contains("ds"))
-    assertTrue("IR must contain 'ts' column", irColumns.contains("ts"))
-
-    // ASSERTION 2: Verify all aggregation columns exist
-    assertTrue("IR must contain SUM price", irColumns.exists(_.contains("price_sum")))
-    assertTrue("IR must contain COUNT quantity", irColumns.exists(_.contains("quantity_count")))
-    assertTrue("IR must contain AVERAGE rating", irColumns.exists(_.contains("rating_average")))
-    assertTrue("IR must contain MIN price", irColumns.exists(_.contains("price_min")))
-    assertTrue("IR must contain MAX quantity", irColumns.exists(_.contains("quantity_max")))
-    assertTrue("IR must contain VARIANCE price", irColumns.exists(_.contains("price_variance")))
-    //assertTrue("IR must contain UNIQUE COUNT price", irColumns.exists(_.contains("price_unique_count")))
-    assertTrue("IR must contain HISTOGRAM product_id", irColumns.exists(_.contains("product_id_histogram")))
-    //assertTrue("IR must contain BOUNDED_UNIQUE_COUNT product_id", irColumns.exists(_.contains("product_id_bounded_unique_count")))
-
-    // ASSERTION 4: Verify IR table has data
-    val irRowCount = actualIncrementalDf.count()
-    assertTrue(s"IR table should have rows, found ${irRowCount}", irRowCount > 0)
-
-
-    //    collect_set(price) as price_unique_count,
-    //    slice(collect_set(md5(product_id)), 1, 100) as product_id_bounded_unique_count
-
-    // ASSERTION 5: Compare against SQL computation
+    // Compare against SQL computation
     val query =
       s"""
          |WITH base_aggs AS (
@@ -185,7 +157,9 @@ class GroupByIncrementalTest {
          |      cast(count(price) as int) as count,
          |      avg(price) as mean,
          |      sum(price * price) - count(price) * avg(price) * avg(price) as m2
-         |    ) as price_variance
+         |    ) as price_variance,
+         |    count(distinct price) as price_unique_count,
+         |    least(count(distinct product_id), 100) as product_id_bounded_unique_count
          |  FROM test_basic_aggs_input
          |  WHERE ds='$today_minus_7_date'
          |  GROUP BY user, ds
@@ -208,24 +182,22 @@ class GroupByIncrementalTest {
 
     val expectedDf = spark.sql(query)
 
-    // Convert array columns to counts for comparison (since MD5 hashing differs between Scala and SQL)
+    // Replace UNIQUE_COUNT and BOUNDED_UNIQUE_COUNT array columns with their sizes for comparison.
+    // SQL produces Long counts; size() returns Int — cast both to Long for type consistency.
     import org.apache.spark.sql.functions.size
-    //val actualWithCounts = actualIncrementalDf
-    //  .withColumn("price_unique_count", size(col("price_unique_count")))
-    //  .withColumn("product_id_bounded_unique_count", size(col("product_id_bounded_unique_count")))
+    val actualForComparison = actualIncrementalDf
+      .withColumn("price_unique_count", size(col("price_unique_count")).cast("long"))
+      .withColumn("product_id_bounded_unique_count", size(col("product_id_bounded_unique_count")).cast("long"))
 
-    //val expectedWithCounts = expectedDf
-    //  .withColumn("price_unique_count", size(col("price_unique_count")))
-    //  .withColumn("product_id_bounded_unique_count", size(col("product_id_bounded_unique_count")))
-
-    //val diff = Comparison.sideBySide(actualWithCounts, expectedWithCounts, List("user", tableUtils.partitionColumn))
-    val diff = Comparison.sideBySide(actualIncrementalDf, expectedDf, List("user", tableUtils.partitionColumn))
+    val diff = Comparison.sideBySide(actualForComparison, expectedDf, List("user", tableUtils.partitionColumn))
 
+    val irRowCount = actualIncrementalDf.count()
     if (diff.count() > 0) {
       println(s"=== Diff Details for All Aggregations ===")
       println(s"Actual count: ${irRowCount}")
       println(s"Expected count: ${expectedDf.count()}")
       println(s"Diff count: ${diff.count()}")
+      actualForComparison.show(10, truncate = false)
       diff.show(100, truncate = false)
     }
 
@@ -450,64 +422,46 @@ class GroupByIncrementalTest {
     val namespace = s"incremental_stats_${Random.alphanumeric.take(6).mkString}"
     tableUtils.createDatabase(namespace)
 
-    val schema = List(
-      Column("user", StringType, 5),
-      Column("value", DoubleType, 100),
-      Column("category", StringType, 10)  // For APPROX_UNIQUE_COUNT
-    )
-
-    // Generate sufficient data for statistical aggregations
-    val df = DataFrameGen.events(spark, schema, count = 10000, partitions = 20)
-    df.write.mode("overwrite").saveAsTable(s"${namespace}.test_stats_input")
-    val inputDf = spark.table(s"${namespace}.test_stats_input")
+    val outputDates = CStream.genPartitions(10, tableUtils.partitionSpec)
 
     val aggregations: Seq[Aggregation] = Seq(
-      Builders.Aggregation(Operation.SKEW, "value", Seq(new Window(7, TimeUnit.DAYS))),
-      Builders.Aggregation(Operation.KURTOSIS, "value", Seq(new Window(7, TimeUnit.DAYS))),
-      Builders.Aggregation(Operation.APPROX_PERCENTILE, "value", Seq(new Window(7, TimeUnit.DAYS)),
+      // Moment-based (IR = array<double> [n, m1, m2, m3, m4]); finalized to Double
+      Builders.Aggregation(Operation.SKEW, "price", Seq(new Window(10, TimeUnit.DAYS))),
+      Builders.Aggregation(Operation.KURTOSIS, "price", Seq(new Window(10, TimeUnit.DAYS))),
+      // Sketch-based (IR = binary KLL sketch); finalized to Array[Float]
+      Builders.Aggregation(Operation.APPROX_PERCENTILE, "price", Seq(new Window(10, TimeUnit.DAYS)),
         argMap = Map("percentiles" -> "[0.5, 0.25, 0.75]")),
-      Builders.Aggregation(Operation.APPROX_UNIQUE_COUNT, "category", Seq(new Window(7, TimeUnit.DAYS))),
-      Builders.Aggregation(Operation.APPROX_HISTOGRAM_K, "category", Seq(new Window(7, TimeUnit.DAYS)),
+      // Sketch-based (IR = binary CPC sketch); finalized to Long
+      Builders.Aggregation(Operation.APPROX_UNIQUE_COUNT, "user", Seq(new Window(10, TimeUnit.DAYS))),
+      // Sketch-based (IR = binary); finalized to Map[String, Long]
+      Builders.Aggregation(Operation.APPROX_HISTOGRAM_K, "user", Seq(new Window(10, TimeUnit.DAYS)),
         argMap = Map("k" -> "10"))
     )
 
-    val tableProps: Map[String, String] = Map("source" -> "chronon")
-    val today_date = tableUtils.partitionSpec.at(System.currentTimeMillis())
-    val today_minus_7_date = tableUtils.partitionSpec.minus(today_date, new Window(7, TimeUnit.DAYS))
-    val today_minus_20_date = tableUtils.partitionSpec.minus(today_date, new Window(20, TimeUnit.DAYS))
-    val partitionRange = PartitionRange(today_minus_20_date, today_date)
-
-    val groupBy = new GroupBy(aggregations, Seq("user"), inputDf)
-    groupBy.computeIncrementalDf(s"${namespace}.testIncrementalStatsOutput", partitionRange, tableProps)
-
-    val actualIncrementalDf = spark.sql(s"select * from ${namespace}.testIncrementalStatsOutput where ds='$today_minus_7_date'")
-
-    // Verify IR table has data
-    assertTrue(s"IR table should have rows", actualIncrementalDf.count() > 0)
-
-    // Verify APPROX_HISTOGRAM_K column exists and has binary data (sketch)
-    val histogramCol = actualIncrementalDf.schema.fields.find(_.name.contains("category_approx_histogram_k"))
-    assertTrue("APPROX_HISTOGRAM_K column should exist", histogramCol.isDefined)
-    assertTrue("APPROX_HISTOGRAM_K should be BinaryType (sketch)", histogramCol.get.dataType.isInstanceOf[BinaryType])
+    val (source, _) = createTestSourceIncremental(windowSize = 30, suffix = "_stats_events",
+      partitionColOpt = Some(tableUtils.partitionColumn))
+    val groupByConf = Builders.GroupBy(
+      sources = Seq(source),
+      keyColumns = Seq("item"),
+      aggregations = aggregations,
+      metaData = Builders.MetaData(name = "testIncrementalStats", namespace = namespace, team = "chronon"),
+      backfillStartDate = tableUtils.partitionSpec.minus(tableUtils.partitionSpec.at(System.currentTimeMillis()),
+        new Window(20, TimeUnit.DAYS))
+    )
 
-    // Verify histogram sketch is non-null
-    val histogramData = spark.sql(
-      s"""
-         |SELECT category_approx_histogram_k
-         |FROM ${namespace}.testIncrementalStatsOutput
-         |WHERE ds='$today_minus_7_date' AND category_approx_histogram_k IS NOT NULL
-         |LIMIT 1
-         |""".stripMargin
-    ).collect()
+    val df = spark.read.table(source.table)
+    val groupBy = new GroupBy(aggregations, Seq("item"), df.filter("item is not null"))
+    val nonIncrementalDf = groupBy.snapshotEvents(PartitionRange(outputDates.min, outputDates.max))
 
-    assertTrue("APPROX_HISTOGRAM_K should produce non-null sketch data", histogramData.nonEmpty)
+    val groupByIncremental = GroupBy.fromIncrementalDf(groupByConf, PartitionRange(outputDates.min, outputDates.max), tableUtils)
+    val incrementalDf = groupByIncremental.snapshotEvents(PartitionRange(outputDates.min, outputDates.max))
 
-    println("=== Statistical Aggregations Incremental Test Passed ===")
-    println("✓ SKEW: Statistical skewness")
-    println("✓ KURTOSIS: Statistical kurtosis")
-    println("✓ APPROX_PERCENTILE: Approximate percentiles")
-    println("✓ APPROX_UNIQUE_COUNT: Approximate distinct count")
-    println("✓ APPROX_HISTOGRAM_K: Approximate histogram with k buckets")
+    val diff = Comparison.sideBySide(nonIncrementalDf, incrementalDf, List("item", tableUtils.partitionColumn))
+    if (diff.count() > 0) {
+      println("=== Diff Details for Statistical Aggregations ===")
+      diff.show(100, truncate = false)
+    }
+    assertEquals(0, diff.count())
 
     // Cleanup
     spark.stop()