From 138441e1471aabac25f200a8b47e3adc82a225ad Mon Sep 17 00:00:00 2001
From: mcuadera <xrg9@cdc.gov>
Date: Thu, 13 Mar 2025 10:53:58 -0400
Subject: [PATCH 01/28] create create_raw_data_parquet fx

---
 NAMESPACE                      |  1 +
 R/dal.parquet.R                | 62 ++++++++++++++++++++++++++++++++++
 man/create_raw_data_parquet.Rd | 26 ++++++++++++++
 man/get_partition_cols.Rd      | 23 +++++++++++++
 4 files changed, 112 insertions(+)
 create mode 100644 R/dal.parquet.R
 create mode 100644 man/create_raw_data_parquet.Rd
 create mode 100644 man/get_partition_cols.Rd

diff --git a/NAMESPACE b/NAMESPACE
index 79a0de4e..58fb1a2e 100644
--- a/NAMESPACE
+++ b/NAMESPACE
@@ -15,6 +15,7 @@ export(create_emergence_group_gif)
 export(create_npafp_export)
 export(create_pop_check_export)
 export(create_pot_comp_clust_export)
+export(create_raw_data_parquet)
 export(create_stool_adequacy_export)
 export(ctry_data_errors)
 export(duplicate_check)
diff --git a/R/dal.parquet.R b/R/dal.parquet.R
new file mode 100644
index 00000000..fdf89960
--- /dev/null
+++ b/R/dal.parquet.R
@@ -0,0 +1,62 @@
+#' Convert raw data into a parquet hierarchal folder
+#'
+#' The function takes a `raw_data` object (output of [get_all_polio_data()]) and
+#' saves it into a parquet directory
+#' @param raw_data `list` A `raw_data` object.
+#' @param path `str` Path to export the parquet folder to.
+#'
+#' @returns None.
+#' @export
+#'
+#' @examples
+#' \dontrun{
+#' raw_data <- get_all_polio_data()
+#' create_raw_data_parquet(raw_data, "C:/Users/ABC1/Desktop/raw_data_parquet")
+#' }
+create_raw_data_parquet <- function(raw_data, path){
+  df_names <- names(raw_data)
+
+  for (i in df_names) {
+    switch()
+  }
+
+}
+
+# Private functions ----
+
+#' Gets the column used to partition a column
+#'
+#' @param name `str` Name of the column
+#'
+#' @return `chr` A character vector of columns to partition with.
+#' @keywords internal
+#'
+#' @examples
+#' \dontrun{
+#' get_partition_cols("afp")
+#' }
+get_partition_cols <- function(name) {
+  switch(name,
+         "afp" = c("place.admin.0", "yronset"),
+         "afp.dupe" = c("place.admin.0", "yronset"),
+         "afp.epi" = c("place.admin.0", "yronset"),
+         "para.case" = c("place.admin.0", "yronset"),
+         "es" = c("ADM0_NAME", "collect.yr"),
+         "es.dupe" = c("ADM0_NAME", "collect.yr"),
+         "sia" = c("place.admin.0", "yr.sia"),
+         "sia.dupe" = c("place.admin.0", "yr.sia"),
+         "pos" = c("place.admin.0", "yronset"),
+         "pos.dupe" = c("place.admin.0", "yronset"),
+         "other" = c("place.admin.0", "yronset"),
+         "other.dupe" = c("place.admin.0", "yronset"),
+         "dist.pop" = c("ADM0_NAME", "year"),
+         "prov.pop" = c("ADM0_NAME", "year"),
+         "ctry.pop" = c("ADM0_NAME", "year"),
+         "global.ctry" = c("ADM0_NAME"),
+         "global.prov" = c("ADM0_NAME"),
+         "global.dist" = c("ADM0_NAME"),
+         "roads" = c("continent"),
+         "cities" = c("CTRY_NAME"),
+         "metadata" = "download_time"
+         )
+}
diff --git a/man/create_raw_data_parquet.Rd b/man/create_raw_data_parquet.Rd
new file mode 100644
index 00000000..e65e2f63
--- /dev/null
+++ b/man/create_raw_data_parquet.Rd
@@ -0,0 +1,26 @@
+% Generated by roxygen2: do not edit by hand
+% Please edit documentation in R/dal.parquet.R
+\name{create_raw_data_parquet}
+\alias{create_raw_data_parquet}
+\title{Convert raw data into a parquet hierarchal folder}
+\usage{
+create_raw_data_parquet(raw_data, path)
+}
+\arguments{
+\item{raw_data}{\code{list} A \code{raw_data} object.}
+
+\item{path}{\code{str} Path to export the parquet folder to.}
+}
+\value{
+None.
+}
+\description{
+The function takes a \code{raw_data} object (output of \code{\link[=get_all_polio_data]{get_all_polio_data()}}) and
+saves it into a parquet directory
+}
+\examples{
+\dontrun{
+raw_data <- get_all_polio_data()
+create_raw_data_parquet(raw_data, "C:/Users/ABC1/Desktop/raw_data_parquet")
+}
+}
diff --git a/man/get_partition_cols.Rd b/man/get_partition_cols.Rd
new file mode 100644
index 00000000..cad472b2
--- /dev/null
+++ b/man/get_partition_cols.Rd
@@ -0,0 +1,23 @@
+% Generated by roxygen2: do not edit by hand
+% Please edit documentation in R/dal.parquet.R
+\name{get_partition_cols}
+\alias{get_partition_cols}
+\title{Gets the column used to partition a column}
+\usage{
+get_partition_cols(name)
+}
+\arguments{
+\item{name}{\code{str} Name of the column}
+}
+\value{
+\code{chr} A character vector of columns to partition with.
+}
+\description{
+Gets the column used to partition a column
+}
+\examples{
+\dontrun{
+get_partition_cols("afp")
+}
+}
+\keyword{internal}

From 6ca024be0a00f4881b335080825299e231d913df Mon Sep 17 00:00:00 2001
From: mcuadera <xrg9@cdc.gov>
Date: Fri, 14 Mar 2025 12:44:25 -0400
Subject: [PATCH 02/28] build local parquet

---
 NAMESPACE                           |   1 +
 R/dal.parquet.R                     | 176 +++++++++++++++++++++++++++-
 man/build_parquet_raw_data.Rd       |  35 ++++++
 man/build_parquet_raw_data_edav.Rd  |  24 ++++
 man/build_parquet_raw_data_local.Rd |  19 +++
 sirfunctions.Rproj                  |   1 -
 6 files changed, 249 insertions(+), 7 deletions(-)
 create mode 100644 man/build_parquet_raw_data.Rd
 create mode 100644 man/build_parquet_raw_data_edav.Rd
 create mode 100644 man/build_parquet_raw_data_local.Rd

diff --git a/NAMESPACE b/NAMESPACE
index 58fb1a2e..87202a8e 100644
--- a/NAMESPACE
+++ b/NAMESPACE
@@ -1,5 +1,6 @@
 # Generated by roxygen2: do not edit by hand
 
+export(build_parquet_raw_data)
 export(check_afp_guid_ctry_data)
 export(check_cache)
 export(check_missing_rows)
diff --git a/R/dal.parquet.R b/R/dal.parquet.R
index fdf89960..53036d5b 100644
--- a/R/dal.parquet.R
+++ b/R/dal.parquet.R
@@ -14,12 +14,97 @@
 #' create_raw_data_parquet(raw_data, "C:/Users/ABC1/Desktop/raw_data_parquet")
 #' }
 create_raw_data_parquet <- function(raw_data, path){
+  start <- Sys.time()
   df_names <- names(raw_data)
 
+  if (!dir.exists(path)) {
+    cli::cli_abort("Directory path does not exist.")
+  }
+
+  cli::cli_process_start("Creating raw_data parquet folder")
+  iter <- 1
   for (i in df_names) {
-    switch()
+    cli::cli_alert_info(paste0("Now processing: ", i))
+
+    if (i %in% c("global.prov", "global.dist")) {
+      raw_data[[i]] |>
+        dplyr::mutate(SHAPE = sf::st_as_text(SHAPE)) |>
+        arrow::write_dataset(path = file.path(path, i),
+                             partitioning = get_partition_cols(i))
+
+    } else if (i == "global.ctry") {
+      raw_data[[i]] |>
+        dplyr::mutate(Shape = sf::st_as_text(Shape)) |>
+        arrow::write_dataset(path = file.path(path, i),
+                             partitioning = get_partition_cols(i))
+    } else if (i %in% c("cities", "roads")) {
+      raw_data[[i]] |>
+        dplyr::mutate(geometry = sf::st_as_text(geometry)) |>
+        arrow::write_dataset(path = file.path(path, i),
+                             partitioning = get_partition_cols(i))
+
+    } else if (i == "metadata") {
+      raw_data[[i]] |>
+        dplyr::as_tibble() |>
+        arrow::write_dataset(path = file.path(path, i),
+                             partitioning = get_partition_cols(i))
+    } else {
+      raw_data[[i]] |> arrow::write_dataset(path = file.path(path, i),
+                                            partitioning = get_partition_cols(i))
+    }
+
+    cli::cli_alert_info(paste0(iter, "/", length(df_names), " processed."))
+    iter <- iter + 1
   }
 
+  cli::cli_process_done()
+  cli::cli_alert_success("raw_data parquet folder created!")
+  cli::cli_alert_info(paste0("Data processed in: ",
+                             round(difftime(Sys.time(), start, "mins"), 2),
+                             " mins."))
+}
+
+#' Recreate raw data from local parquet folder
+#'
+#' Recreates an output of [get_all_polio_data()] from a folder housing
+#' data in parquet format.
+#'
+#' @param path `str` Local path to the parquet folder
+#' @param from_edav `bool` Build using local files or files in EDAV?
+#' @param container `azcontainer` An instance of an Azure container to connect
+#' to. Pass [get_azure_storage_connection()] using defaults if not accessing
+#' using a service principal.
+#'
+#' @returns `list` A list containing connections to the folders associated with
+#' individual datasets in the original output of [get_all_polio_data()].
+#' @export
+#'
+#' @examples
+#' \dontrun{
+#' # Building raw_data locally
+#' parquet_path <- "C:/Users/ABC1/Desktop/parquet_folder"
+#' raw_data <- build_parquet_raw_data(parquet_path)
+#'
+#' # Build raw_data from EDAV
+#' raw_data <- build_parquet_raw_data(from_edav = TRUE)
+#' }
+build_parquet_raw_data <- function(path = NULL, from_edav = F, container = NULL) {
+
+  if (from_edav) {
+    # Fill in defaults only for arguments the caller left as NULL
+    if (is.null(path)) {
+      path <- "GID/PEB/SIR/Sandbox/parquet_sandbox"
+    }
+    if (is.null(container)) {
+      container <- get_azure_storage_connection()
+    }
+
+    raw_data <- build_parquet_raw_data_edav(path, container)
+  } else {
+    raw_data <- build_parquet_raw_data_local(path)
+  }
+
+  return(raw_data)
+}
 
 # Private functions ----
@@ -28,7 +113,7 @@ create_raw_data_parquet <- function(raw_data, path){
 #'
 #' @param name `str` Name of the column
 #'
-#' @return `chr` A character vector of columns to partition with.
+#' @returns `chr` A character vector of columns to partition with.
 #' @keywords internal
 #'
 #' @examples
@@ -49,14 +134,93 @@ get_partition_cols <- function(name) {
          "pos.dupe" = c("place.admin.0", "yronset"),
          "other" = c("place.admin.0", "yronset"),
          "other.dupe" = c("place.admin.0", "yronset"),
-         "dist.pop" = c("ADM0_NAME", "year"),
-         "prov.pop" = c("ADM0_NAME", "year"),
-         "ctry.pop" = c("ADM0_NAME", "year"),
+         "dist.pop" = c("ADM0_NAME"),
+         "prov.pop" = c("ADM0_NAME"),
+         "ctry.pop" = c("ADM0_NAME"),
          "global.ctry" = c("ADM0_NAME"),
          "global.prov" = c("ADM0_NAME"),
          "global.dist" = c("ADM0_NAME"),
          "roads" = c("continent"),
-         "cities" = c("CTRY_NAME"),
+         "cities" = c("CNTRY_NAME"),
          "metadata" = "download_time"
          )
 }
+
+#' Build raw_data using local parquet files
+#'
+#' @param path `str` A path to the parquet directory
+#'
+#' @returns `list` A list containing connections to the folders associated with
+#' individual datasets in the original output of [get_all_polio_data()].
+#' @keywords internal
+#'
+build_parquet_raw_data_local <- function(path = NULL) {
+
+  if (!dir.exists(path)) {
+    cli::cli_abort("Not a valid directory.")
+  }
+
+  valid_values <- c("afp", "afp.dupe", "afp.epi", "para.case", "es", "es.dupe",
+                    "sia", "sia.dupe", "pos", "pos.dupe", "other", "other.dupe",
+                    "dist.pop", "prov.pop", "ctry.pop", "global.ctry",
+                    "global.prov", "global.dist", "roads" , "cities", "metadata"
+                    )
+  data <- list.files(path)
+  data <- intersect(data, valid_values)
+
+  raw_data <- list()
+  for (i in data) {
+    raw_data[[i]] <- arrow::open_dataset(file.path(path, i))
+  }
+
+  return(raw_data)
+
+}
+
+#' Build raw_data using EDAV files
+#'
+#' @param path `str` Path to EDAV folder containing parquet files. This must
+#' be the absolute file path from the Blob endpoint of the container.
+#' @param container `azcontainer` An instance of an Azure container to connect
+#' to. Pass [get_azure_storage_connection()] using defaults if not accessing
+#' using a service principal.
+#'
+#' @returns `list` A list containing connections to the folders associated with
+#' individual datasets in the original output of [get_all_polio_data()].
+#' @keywords internal
+#'
+build_parquet_raw_data_edav <- function(path = NULL, container = NULL, ...) {
+
+  if (is.null(container)) {
+    container <- get_azure_storage_connection()
+  }
+
+  exist <- edav_io("exists.dir", default_dir = "",
+                   file_loc = path, azcontainer = container)
+  if (!exist) {
+    cli::cli_abort("The directory does not exist on EDAV.")
+  } else {
+    rm(exist)
+  }
+
+  cli::cli_process_start("Building raw_data from EDAV parquet files")
+  start <- Sys.time()
+
+  raw_data <- NULL
+  # Download files locally in the temp directory first
+  dest <- "C:/Users/XRG9/Desktop/test"
+  local_pq <- file.path(dest, basename(path))
+  AzureStor::multidownload_adls_file(container,
+                                     src = "GID/PEB/SIR/Sandbox/parquet_sandbox/*",
+                                     dest = local_pq,
+                                     recursive = TRUE,
+                                     overwrite = TRUE
+                                     )
+
+  raw_data <- build_parquet_raw_data_local(local_pq)
+  cli::cli_process_done()
+  cli::cli_process_start(paste0("Built in ", difftime(start, Sys.time(), "mins"), " mins."))
+
+  return(raw_data)
+
+}
diff --git a/man/build_parquet_raw_data.Rd b/man/build_parquet_raw_data.Rd
new file mode 100644
index 00000000..1eef21a7
--- /dev/null
+++ b/man/build_parquet_raw_data.Rd
@@ -0,0 +1,35 @@
+% Generated by roxygen2: do not edit by hand
+% Please edit documentation in R/dal.parquet.R
+\name{build_parquet_raw_data}
+\alias{build_parquet_raw_data}
+\title{Recreate raw data from local parquet folder}
+\usage{
+build_parquet_raw_data(path = NULL, from_edav = F, container = NULL)
+}
+\arguments{
+\item{path}{\code{str} Local path to the parquet folder}
+
+\item{from_edav}{\code{bool} Build using local files or files in EDAV?}
+
+\item{container}{\code{azcontainer} An instance of an Azure container to connect
+to. Pass \code{\link[=get_azure_storage_connection]{get_azure_storage_connection()}} using defaults if not accessing
+using a service principal.}
+}
+\value{
+\code{list} A list containing connections to the folders associated with
+individual datasets in the original output of \code{\link[=get_all_polio_data]{get_all_polio_data()}}.
+}
+\description{
+Recreates an output of \code{\link[=get_all_polio_data]{get_all_polio_data()}} from a folder housing
+data in parquet format.
+}
+\examples{
+\dontrun{
+# Building raw_data locally
+parquet_path <- "C:/Users/ABC1/Desktop/parquet_folder"
+raw_data <- build_parquet_raw_data(parquet_path)
+
+# Build raw_data from EDAV
+raw_data <- build_parquet_raw_data(from_edav = TRUE)
+}
+}
diff --git a/man/build_parquet_raw_data_edav.Rd b/man/build_parquet_raw_data_edav.Rd
new file mode 100644
index 00000000..a8af4023
--- /dev/null
+++ b/man/build_parquet_raw_data_edav.Rd
@@ -0,0 +1,24 @@
+% Generated by roxygen2: do not edit by hand
+% Please edit documentation in R/dal.parquet.R
+\name{build_parquet_raw_data_edav}
+\alias{build_parquet_raw_data_edav}
+\title{Build raw_data using EDAV files}
+\usage{
+build_parquet_raw_data_edav(path = NULL, container = NULL, ...)
+}
+\arguments{
+\item{path}{\code{str} Path to EDAV folder containing parquet files. This must
+be the absolute file path from the Blob endpoint of the container.}
+
+\item{container}{\code{azcontainer} An instance of an Azure container to connect
+to. Pass \code{\link[=get_azure_storage_connection]{get_azure_storage_connection()}} using defaults if not accessing
+using a service principal.}
+}
+\value{
+\code{list} A list containing connections to the folders associated with
+individual datasets in the original output of \code{\link[=get_all_polio_data]{get_all_polio_data()}}.
+}
+\description{
+Build raw_data using EDAV files
+}
+\keyword{internal}
diff --git a/man/build_parquet_raw_data_local.Rd b/man/build_parquet_raw_data_local.Rd
new file mode 100644
index 00000000..d6b7aba5
--- /dev/null
+++ b/man/build_parquet_raw_data_local.Rd
@@ -0,0 +1,19 @@
+% Generated by roxygen2: do not edit by hand
+% Please edit documentation in R/dal.parquet.R
+\name{build_parquet_raw_data_local}
+\alias{build_parquet_raw_data_local}
+\title{Build raw_data using local parquet files}
+\usage{
+build_parquet_raw_data_local(path = NULL)
+}
+\arguments{
+\item{path}{\code{str} A path to the parquet directory}
+}
+\value{
+\code{list} A list containing connections to the folders associated with
+individual datasets in the original output of \code{\link[=get_all_polio_data]{get_all_polio_data()}}.
+}
+\description{
+Build raw_data using local parquet files
+}
+\keyword{internal}
diff --git a/sirfunctions.Rproj b/sirfunctions.Rproj
index fd189303..69fafd4b 100644
--- a/sirfunctions.Rproj
+++ b/sirfunctions.Rproj
@@ -1,5 +1,4 @@
 Version: 1.0
-ProjectId: e9616991-2fba-4185-b9cb-72e1f1045eb4
 
 RestoreWorkspace: No
 SaveWorkspace: No

From fa59911fa74f1a1a9c9de7c80afafb27c2776479 Mon Sep 17 00:00:00 2001
From: mcuadera <xrg9@cdc.gov>
Date: Fri, 14 Mar 2025 14:23:09 -0400
Subject: [PATCH 03/28] create parquet folder upload fx

---
 NAMESPACE                     |  1 +
 R/dal.parquet.R               | 50 ++++++++++++++++++++++++++++++++---
 man/upload_parquet_to_edav.Rd | 28 ++++++++++++++++++++
 3 files changed, 76 insertions(+), 3 deletions(-)
 create mode 100644 man/upload_parquet_to_edav.Rd

diff --git a/NAMESPACE b/NAMESPACE
index 87202a8e..f0eee07f 100644
--- a/NAMESPACE
+++ b/NAMESPACE
@@ -85,6 +85,7 @@ export(send_teams_message)
 export(set_emergence_colors)
 export(test_EDAV_connection)
 export(upload_dr_to_github)
+export(upload_parquet_to_edav)
 export(upload_to_sharepoint)
 import(dplyr)
 import(ggplot2)
diff --git a/R/dal.parquet.R b/R/dal.parquet.R
index 53036d5b..bfc272d3 100644
--- a/R/dal.parquet.R
+++ b/R/dal.parquet.R
@@ -107,6 +107,50 @@ build_parquet_raw_data <- function(path = NULL, from_edav = F, container = NULL)
   return(raw_data)
 }
 
+#' Uploads a local parquet folder to EDAV
+#'
+#' Uploads a folder containing parquet files to EDAV
+#'
+#' @param src `str` Local path to the parquet folder.
+#' @param dest `str` EDAV endpoint.
+#' @param container `azcontainer` An instance of an Azure container.
+#'
+#' @returns None.
+#' @export
+#'
+#' @examples
+#' \dontrun{
+#' local_dir <- "C:/Users/ABC1/Desktop/parquet_folder"
+#' edav_dir <- "ABC/parquet_folder"
+#' upload_parquet_to_edav(local_dir, edav_dir)
+#' }
+upload_parquet_to_edav <- function(src, dest, container = NULL) {
+  if (is.null(container)) {
+    container <- get_azure_storage_connection()
+  }
+
+  repeat {
+    cli::cli_alert_info(paste0("Confirm upload to: ", dest, "/", basename(src), " (y/n)"))
+    response <- stringr::str_to_lower(stringr::str_trim(readline("> ")))
+    if (response == "y") {
+      break
+    } else if (response == "n") {
+      cli::cli_alert("Upload cancelled.")
+      return(invisible(NULL)) # stop here instead of prompting again
+    }
+    cli::cli_alert_warning("Invalid response. Try again.")
+  }
+
+  cli::cli_process_start("Uploading parquet folder to EDAV")
+  start <- Sys.time()
+  AzureStor::multiupload_adls_file(container, paste0(src, "/*"), dest,
+                                   recursive = TRUE)
+  cli::cli_process_done()
+  # `units` must be named; the 3rd positional argument of difftime() is `tz`
+  elapsed <- round(difftime(Sys.time(), start, units = "mins"), 2)
+  cli::cli_alert_success(paste0("Uploaded in: ", elapsed, " mins"))
+}
+
 # Private functions ----
 
 #' Gets the column used to partition a column
@@ -208,10 +252,10 @@ build_parquet_raw_data_edav <- function(path = NULL, container = NULL, ...) {
 
   raw_data <- NULL
   # Download files locally in the temp directory first
-  dest <- "C:/Users/XRG9/Desktop/test"
+  dest <- tempdir()
   local_pq <- file.path(dest, basename(path))
   AzureStor::multidownload_adls_file(container,
-                                     src = "GID/PEB/SIR/Sandbox/parquet_sandbox/*",
+                                     src = paste0(path, "/*"),
                                      dest = local_pq,
                                      recursive = TRUE,
                                      overwrite = TRUE
@@ -219,7 +263,7 @@ build_parquet_raw_data_edav <- function(path = NULL, container = NULL, ...) {
 
   raw_data <- build_parquet_raw_data_local(local_pq)
   cli::cli_process_done()
-  cli::cli_process_start(paste0("Built in ", difftime(start, Sys.time(), "mins"), " mins."))
+  cli::cli_process_start(paste0("Built in ", difftime(Sys.time(), start, "mins"), " mins."))
 
   return(raw_data)
 
diff --git a/man/upload_parquet_to_edav.Rd b/man/upload_parquet_to_edav.Rd
new file mode 100644
index 00000000..41064538
--- /dev/null
+++ b/man/upload_parquet_to_edav.Rd
@@ -0,0 +1,28 @@
+% Generated by roxygen2: do not edit by hand
+% Please edit documentation in R/dal.parquet.R
+\name{upload_parquet_to_edav}
+\alias{upload_parquet_to_edav}
+\title{Uploads a local parquet folder to EDAV}
+\usage{
+upload_parquet_to_edav(src, dest, container = NULL)
+}
+\arguments{
+\item{src}{\code{str} Local path to the parquet folder.}
+
+\item{dest}{\code{str} EDAV endpoint.}
+
+\item{container}{\code{azcontainer} An instance of an Azure container.}
+}
+\value{
+None.
+}
+\description{
+Uploads a folder containing parquet files to EDAV
+}
+\examples{
+\dontrun{
+local_dir <- "C:/Users/ABC1/Desktop/parquet_folder"
+edav_dir <- "ABC/parquet_folder"
+upload_parquet_to_edav(local_dir, edav_dir)
+}
+}

From 9b8185d75b0cfd995c6c5362e21ae654072b3ef9 Mon Sep 17 00:00:00 2001
From: mcuadera <xrg9@cdc.gov>
Date: Wed, 19 Mar 2025 08:41:52 -0400
Subject: [PATCH 04/28] using storage multidownload instead of
 multidownload_adls_file

for building the parquet raw data from EDAV
---
 R/dal.parquet.R | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/R/dal.parquet.R b/R/dal.parquet.R
index bfc272d3..e5f30cdc 100644
--- a/R/dal.parquet.R
+++ b/R/dal.parquet.R
@@ -254,7 +254,7 @@ build_parquet_raw_data_edav <- function(path = NULL, container = NULL, ...) {
   # Download files locally in the temp directory first
   dest <- tempdir()
   local_pq <- file.path(dest, basename(path))
-  AzureStor::multidownload_adls_file(container,
+  AzureStor::storage_multidownload(container,
                                      src = paste0(path, "/*"),
                                      dest = local_pq,
                                      recursive = TRUE,

From 58632f776465f12021fc36e54794bf1e2b69e67b Mon Sep 17 00:00:00 2001
From: Mervin Keith Cuadera <40894971+mcuadera@users.noreply.github.com>
Date: Mon, 30 Mar 2026 13:20:15 -0400
Subject: [PATCH 05/28] shard using ctry name

ctry + year creates too much sharding
---
 R/dal.parquet.R | 40 ++++++++++++++++++++--------------------
 1 file changed, 20 insertions(+), 20 deletions(-)

diff --git a/R/dal.parquet.R b/R/dal.parquet.R
index e5f30cdc..648f6e39 100644
--- a/R/dal.parquet.R
+++ b/R/dal.parquet.R
@@ -166,26 +166,26 @@ upload_parquet_to_edav <- function(src, dest, container = NULL) {
 #' }
 get_partition_cols <- function(name) {
   switch(name,
-         "afp" = c("place.admin.0", "yronset"),
-         "afp.dupe" = c("place.admin.0", "yronset"),
-         "afp.epi" = c("place.admin.0", "yronset"),
-         "para.case" = c("place.admin.0", "yronset"),
-         "es" = c("ADM0_NAME", "collect.yr"),
-         "es.dupe" = c("ADM0_NAME", "collect.yr"),
-         "sia" = c("place.admin.0", "yr.sia"),
-         "sia.dupe" = c("place.admin.0", "yr.sia"),
-         "pos" = c("place.admin.0", "yronset"),
-         "pos.dupe" = c("place.admin.0", "yronset"),
-         "other" = c("place.admin.0", "yronset"),
-         "other.dupe" = c("place.admin.0", "yronset"),
-         "dist.pop" = c("ADM0_NAME"),
-         "prov.pop" = c("ADM0_NAME"),
-         "ctry.pop" = c("ADM0_NAME"),
-         "global.ctry" = c("ADM0_NAME"),
-         "global.prov" = c("ADM0_NAME"),
-         "global.dist" = c("ADM0_NAME"),
-         "roads" = c("continent"),
-         "cities" = c("CNTRY_NAME"),
+         "afp" = "place.admin.0",
+         "afp.dupe" = "place.admin.0",
+         "afp.epi" = "place.admin.0",
+         "para.case" = "place.admin.0",
+         "es" = "ADM0_NAME",
+         "es.dupe" = "ADM0_NAME",
+         "sia" = "place.admin.0",
+         "sia.dupe" = "place.admin.0",
+         "pos" = "place.admin.0",
+         "pos.dupe" = "place.admin.0",
+         "other" = "place.admin.0",
+         "other.dupe" = "place.admin.0",
+         "dist.pop" = "ctry",
+         "prov.pop" = "ctry",
+         "ctry.pop" = "ctry",
+         "global.ctry" = "ADM0_NAME",
+         "global.prov" = "ADM0_NAME",
+         "global.dist" = "ADM0_NAME",
+         "roads" = "continent",
+         "cities" = "CNTRY_NAME",
          "metadata" = "download_time"
          )
 }

From 580c777ccf326ae36c56bb4b5fa3b19d4c25f751 Mon Sep 17 00:00:00 2001
From: Mervin Keith Cuadera <40894971+mcuadera@users.noreply.github.com>
Date: Mon, 30 Mar 2026 13:23:09 -0400
Subject: [PATCH 06/28] Add coverage datasets

---
 R/dal.parquet.R | 3 +++
 1 file changed, 3 insertions(+)

diff --git a/R/dal.parquet.R b/R/dal.parquet.R
index 648f6e39..b0580d8d 100644
--- a/R/dal.parquet.R
+++ b/R/dal.parquet.R
@@ -184,6 +184,9 @@ get_partition_cols <- function(name) {
          "global.ctry" = "ADM0_NAME",
          "global.prov" = "ADM0_NAME",
          "global.dist" = "ADM0_NAME",
+         "ctry.coverage" = "year",
+         "prov.coverage" = "year",
+         "dist.coverage" = "year",
          "roads" = "continent",
          "cities" = "CNTRY_NAME",
          "metadata" = "download_time"

From 69b9980d27c70e956f6eced8d639f7134a547e82 Mon Sep 17 00:00:00 2001
From: Mervin Keith Cuadera <40894971+mcuadera@users.noreply.github.com>
Date: Mon, 30 Mar 2026 14:05:46 -0400
Subject: [PATCH 07/28] change partition column

also include new helper function
---
 R/dal.parquet.R | 101 ++++++++++++++++++++++++++++++------------------
 1 file changed, 64 insertions(+), 37 deletions(-)

diff --git a/R/dal.parquet.R b/R/dal.parquet.R
index b0580d8d..e2602f3f 100644
--- a/R/dal.parquet.R
+++ b/R/dal.parquet.R
@@ -17,6 +17,9 @@ create_raw_data_parquet <- function(raw_data, path){
   start <- Sys.time()
   df_names <- names(raw_data)
 
+  options(arrow.use_threads = TRUE)
+  on.exit(options(arrow.use_threads = old_threads), add = TRUE)
+
   if (!dir.exists(path)) {
     cli::cli_abort("Directory path does not exist.")
   }
@@ -26,17 +29,12 @@ create_raw_data_parquet <- function(raw_data, path){
   for (i in df_names) {
     cli::cli_alert_info(paste0("Now processing: ", i))
 
-    if (i %in% c("global.prov", "global.dist")) {
-      raw_data[[i]] |>
-        dplyr::mutate(SHAPE = sf::st_as_text(SHAPE)) |>
-        arrow::write_dataset(path = file.path(path, i),
-                             partitioning = get_partition_cols(i))
-
-    } else if (i == "global.ctry") {
+    if (i %in% c("global.ctry", "global.prov", "global.dist")) {
       raw_data[[i]] |>
         dplyr::mutate(Shape = sf::st_as_text(Shape)) |>
         arrow::write_dataset(path = file.path(path, i),
                              partitioning = get_partition_cols(i))
+
     } else if (i %in% c("cities", "roads")) {
       raw_data[[i]] |>
         dplyr::mutate(geometry = sf::st_as_text(geometry)) |>
@@ -166,24 +164,24 @@ upload_parquet_to_edav <- function(src, dest, container = NULL) {
 #' }
 get_partition_cols <- function(name) {
   switch(name,
-         "afp" = "place.admin.0",
-         "afp.dupe" = "place.admin.0",
-         "afp.epi" = "place.admin.0",
-         "para.case" = "place.admin.0",
-         "es" = "ADM0_NAME",
-         "es.dupe" = "ADM0_NAME",
-         "sia" = "place.admin.0",
-         "sia.dupe" = "place.admin.0",
-         "pos" = "place.admin.0",
-         "pos.dupe" = "place.admin.0",
-         "other" = "place.admin.0",
-         "other.dupe" = "place.admin.0",
-         "dist.pop" = "ctry",
-         "prov.pop" = "ctry",
-         "ctry.pop" = "ctry",
-         "global.ctry" = "ADM0_NAME",
-         "global.prov" = "ADM0_NAME",
-         "global.dist" = "ADM0_NAME",
+         "afp" = "yronset",
+         "afp.dupe" = "yronset",
+         "afp.epi" = "yronset",
+         "para.case" = "yronset",
+         "es" = "collect.yr",
+         "es.dupe" = "collect.yr",
+         "sia" = "yr.sia",
+         "sia.dupe" = "yr.sia",
+         "pos" = "yronset",
+         "pos.dupe" = "yronset",
+         "other" = "yronset",
+         "other.dupe" = "yronset",
+         "dist.pop" = "year",
+         "prov.pop" = "year",
+         "ctry.pop" = "year",
+         "global.ctry" = "WHO_REGION",
+         "global.prov" = "WHO_REGION",
+         "global.dist" = "WHO_REGION",
          "ctry.coverage" = "year",
          "prov.coverage" = "year",
          "dist.coverage" = "year",
@@ -210,7 +208,9 @@ build_parquet_raw_data_local <- function(path = NULL) {
   valid_values <- c("afp", "afp.dupe", "afp.epi", "para.case", "es", "es.dupe",
                     "sia", "sia.dupe", "pos", "pos.dupe", "other", "other.dupe",
                     "dist.pop", "prov.pop", "ctry.pop", "global.ctry",
-                    "global.prov", "global.dist", "roads" , "cities", "metadata"
+                    "global.prov", "global.dist", 
+                    "ctry.coverage", "prov.coverage", "dist.coverage",
+                    "roads" , "cities", "metadata"
                     )
   data <- list.files(path)
   data <- intersect(data, valid_values)
@@ -255,19 +255,46 @@ build_parquet_raw_data_edav <- function(path = NULL, container = NULL, ...) {
 
   raw_data <- NULL
   # Download files locally in the temp directory first
-  dest <- tempdir()
-  local_pq <- file.path(dest, basename(path))
-  AzureStor::storage_multidownload(container,
-                                     src = paste0(path, "/*"),
-                                     dest = local_pq,
-                                     recursive = TRUE,
-                                     overwrite = TRUE
-                                     )
-
-  raw_data <- build_parquet_raw_data_local(local_pq)
-  cli::cli_process_done()
+  withr::with_tempdir({
+    local_pq <- file.path(getwd(), basename(path))
+    AzureStor::storage_multidownload(container,
+                                      src = paste0(path, "/*"),
+                                      dest = local_pq,
+                                      recursive = TRUE,
+                                      overwrite = TRUE
+                                      )
+
+    raw_data <- build_parquet_raw_data_local(local_pq)
+    cli::cli_process_done()
+  })
+  
   cli::cli_process_start(paste0("Built in ", difftime(Sys.time(), start, "mins"), " mins."))
 
   return(raw_data)
 
 }
+
+#' Drop Shape column and convert to binary
+#'
+#' @param x `sf` or `data.frame` Geodata.
+#' @param geom_col `str` Name of the geometry column.
+#'
+#' @returns `tibble` Data without any Shape column.
+#'
+#' @keywords internal
+#' 
+to_wkb_drop_sf <- function(x, geom_col) {
+  # Works whether x is sf or a plain data.frame with an sfc column
+  geom <- if (inherits(x, "sf")) {
+    sf::st_geometry(x)
+  } else {
+    x[[geom_col]]
+  } 
+
+  x[[paste0(geom_col, "_wkb")]] <- sf::st_as_binary(geom)
+  x[[geom_col]] <- NULL
+  if (inherits(x, "sf")) {
+     x <- sf::st_drop_geometry(x)
+  }
+  return(x)
+}
\ No newline at end of file

From 918a58636969c7cf63776a8f568a4f9a54721a52 Mon Sep 17 00:00:00 2001
From: Mervin Keith Cuadera <40894971+mcuadera@users.noreply.github.com>
Date: Mon, 30 Mar 2026 14:13:08 -0400
Subject: [PATCH 08/28] use threading to create partitions

---
 R/dal.parquet.R | 11 ++++++-----
 1 file changed, 6 insertions(+), 5 deletions(-)

diff --git a/R/dal.parquet.R b/R/dal.parquet.R
index e2602f3f..755d94f9 100644
--- a/R/dal.parquet.R
+++ b/R/dal.parquet.R
@@ -17,6 +17,7 @@ create_raw_data_parquet <- function(raw_data, path){
   start <- Sys.time()
   df_names <- names(raw_data)
 
+  old_threads <- getOption("arrow.use_threads")
   options(arrow.use_threads = TRUE)
   on.exit(options(arrow.use_threads = old_threads), add = TRUE)
 
@@ -29,15 +30,15 @@ create_raw_data_parquet <- function(raw_data, path){
   for (i in df_names) {
     cli::cli_alert_info(paste0("Now processing: ", i))
 
+    data <- 
+
     if (i %in% c("global.ctry", "global.prov", "global.dist")) {
-      raw_data[[i]] |>
-        dplyr::mutate(Shape = sf::st_as_text(Shape)) |>
+      to_wkb_drop_sf(raw_data[[i]], "Shape") |>
         arrow::write_dataset(path = file.path(path, i),
                              partitioning = get_partition_cols(i))
 
     } else if (i %in% c("cities", "roads")) {
-      raw_data[[i]] |>
-        dplyr::mutate(geometry = sf::st_as_text(geometry)) |>
+      to_wkb_drop_sf(raw_data[[i]], "geometry") |>
         arrow::write_dataset(path = file.path(path, i),
                              partitioning = get_partition_cols(i))
 
@@ -186,7 +187,7 @@ get_partition_cols <- function(name) {
          "prov.coverage" = "year",
          "dist.coverage" = "year",
          "roads" = "continent",
-         "cities" = "CNTRY_NAME",
+         "cities" = "POP_CLASS",
          "metadata" = "download_time"
          )
 }

From 105b3b7cffca6c919dc683025167c11c02f7e063 Mon Sep 17 00:00:00 2001
From: Mervin Keith Cuadera <40894971+mcuadera@users.noreply.github.com>
Date: Mon, 30 Mar 2026 14:28:05 -0400
Subject: [PATCH 09/28] remove execution time code and fix to_wkb_drop_sf

---
 R/dal.parquet.R | 12 ++++++------
 1 file changed, 6 insertions(+), 6 deletions(-)

diff --git a/R/dal.parquet.R b/R/dal.parquet.R
index 755d94f9..49304cad 100644
--- a/R/dal.parquet.R
+++ b/R/dal.parquet.R
@@ -14,7 +14,7 @@
 #' create_raw_data_parquet(raw_data, "C:/Users/ABC1/Desktop/raw_data_parquet")
 #' }
 create_raw_data_parquet <- function(raw_data, path){
-  start <- Sys.time()
+
   df_names <- names(raw_data)
 
   old_threads <- getOption("arrow.use_threads")
@@ -57,10 +57,6 @@ create_raw_data_parquet <- function(raw_data, path){
   }
 
   cli::cli_process_done()
-  cli::cli_alert_success("raw_data parquet folder created!")
-  cli::cli_alert_info(paste0("Data processed in: ",
-                             round(difftime(Sys.time(), start, "mins"), 2),
-                             " mins."))
 }
 
 #' Recreate raw data from local parquet folder
@@ -292,7 +288,11 @@ to_wkb_drop_sf <- function(x, geom_col) {
     x[[geom_col]]
   } 
 
-  x[[paste0(geom_col, "_wkb")]] <- sf::st_as_binary(geom)
+  # Convert to WKB (list of raw vectors), then drop the "WKB" class
+  wkb <- sf::st_as_binary(geom)
+  wkb <- unclass(wkb)   # <- key line: makes it a plain list Arrow can infer
+
+  x[[paste0(geom_col, "_wkb")]] <- wkb
   x[[geom_col]] <- NULL
   if (inherits(x, "sf")) {
      x <- sf::st_drop_geometry(x)

From 4638444c987b6ae44a4b5e3c1f90007a02c945e7 Mon Sep 17 00:00:00 2001
From: Mervin Keith Cuadera <40894971+mcuadera@users.noreply.github.com>
Date: Mon, 30 Mar 2026 15:36:53 -0400
Subject: [PATCH 10/28] add function to convert wkb to sf

---
 R/dal.parquet.R | 66 ++++++++++++++++++++++++++++++++++++++++---------
 1 file changed, 54 insertions(+), 12 deletions(-)

diff --git a/R/dal.parquet.R b/R/dal.parquet.R
index 49304cad..7b27dc43 100644
--- a/R/dal.parquet.R
+++ b/R/dal.parquet.R
@@ -61,6 +61,7 @@ create_raw_data_parquet <- function(raw_data, path){
 
 #' Recreate raw data from local parquet folder
 #'
+#' @description
 #' Recreates an output of [get_all_polio_data()] from a folder housing
 #' data in parquet format.
 #'
@@ -69,6 +70,9 @@ create_raw_data_parquet <- function(raw_data, path){
 #' @param container `azcontainer` An instance of an Azure container to connect
 #' to. Pass [get_azure_storage_connection()] using defaults if not accessing
 #' using a service principal.
+#' 
+#' @details
+#' For tibbles with Shapes, pass to [from_wkb_to_sf()] first before creating maps.
 #'
 #' @returns `list` A list containing connections to the folders associated with
 #' individual datasets in the original output of [get_all_polio_data()].
@@ -274,28 +278,66 @@ build_parquet_raw_data_edav <- function(path = NULL, container = NULL, ...) {
 #' Drop Shape column and convert to binary
 #'
 #' @param x `sf` or `data.frame` Geodata.
-#' @param geom_col `str` Name of the geometry column.
-#'
-#' @returns `tibble` Data without any Shape column.
+#' 
+#' @details
+#' This function was written using the CDC EDAV Chatbot using the model GPT-5.2.
+#' @returns `tibble` dData without any Shape column.
 #'
 #' @keywords internal
 #' 
-to_wkb_drop_sf <- function(x, geom_col) {
+to_wkb_drop_sf <- function(sf_data) {
+
+  if ("Shape" %in% names(sf_data)) {
+    geom_col <- "Shape"
+  } else if ("geometry" %in% names(sf_data)) {
+    geom_col <- "geometry"
+  } else {
+    cli::cli_abort("Not an sf dataset.")
+  }
+
   # Works whether x is sf or a plain data.frame with an sfc column
-  geom <- if (inherits(x, "sf")) {
-    sf::st_geometry(x)
+  geom <- if (inherits(sf_data, "sf")) {
+    sf::st_geometry(sf_data)
   } else {
-    x[[geom_col]]
+    sf_data[[geom_col]]
   } 
 
   # Convert to WKB (list of raw vectors), then drop the "WKB" class
   wkb <- sf::st_as_binary(geom)
   wkb <- unclass(wkb)   # <- key line: makes it a plain list Arrow can infer
 
-  x[[paste0(geom_col, "_wkb")]] <- wkb
-  x[[geom_col]] <- NULL
-  if (inherits(x, "sf")) {
-     x <- sf::st_drop_geometry(x)
+  sf_data[[geom_col]] <- wkb
+  if (inherits(sf_data, "sf")) {
+     sf_data <- sf::st_drop_geometry(sf_data)
+  }
+  return(sf_data)
+}
+
+#' Convert WKB back to sf column
+#'
+#' @param sf_data `arrow connection` Geodata arrow connection.
+#'
+#' @returns `tibble` Geodata with `sf`.
+#'
+#' @export
+from_wkb_to_sf <- function(sf_data) {
+
+
+  # Ensure that global shapefiles have Shape and city/roads as geometry. 
+  # Otherwise, need to modify this function.
+  if ("Shape" %in% names(sf_data)) {
+    wkb_col <- "Shape"
+  } else if ("geometry" %in% names(sf_data)) {
+    wkb_col <- "geometry"
+  } else {
+    cli::cli_abort("Not an sf dataset.")
   }
-  return(x)
+
+  sf_data |>
+    dplyr::mutate(dplyr::across(dplyr::any_of(wkb_col), \(x) {
+      sf::st_as_sf(x, EWKB = TRUE, crs = 4326)
+    }))
+  
+  return(sf_data)
+
 }
\ No newline at end of file

From 9e7c95bca449a8247b389158f60b95395d6ef8d5 Mon Sep 17 00:00:00 2001
From: Mervin Keith Cuadera <40894971+mcuadera@users.noreply.github.com>
Date: Mon, 30 Mar 2026 16:00:35 -0400
Subject: [PATCH 11/28] fix raw data compile from EDAV

---
 R/dal.parquet.R | 104 +++++++++++++++++++++++-------------------------
 1 file changed, 49 insertions(+), 55 deletions(-)

diff --git a/R/dal.parquet.R b/R/dal.parquet.R
index 7b27dc43..34c03ac3 100644
--- a/R/dal.parquet.R
+++ b/R/dal.parquet.R
@@ -33,12 +33,14 @@ create_raw_data_parquet <- function(raw_data, path){
     data <- 
 
     if (i %in% c("global.ctry", "global.prov", "global.dist")) {
-      to_wkb_drop_sf(raw_data[[i]], "Shape") |>
+      raw_data[[i]] |>
+        to_wkb_drop_sf() |>
         arrow::write_dataset(path = file.path(path, i),
                              partitioning = get_partition_cols(i))
 
     } else if (i %in% c("cities", "roads")) {
-      to_wkb_drop_sf(raw_data[[i]], "geometry") |>
+      raw_data[[i]] |>
+        to_wkb_drop_sf() |>
         arrow::write_dataset(path = file.path(path, i),
                              partitioning = get_partition_cols(i))
 
@@ -91,10 +93,10 @@ build_parquet_raw_data <- function(path = NULL, from_edav = F, container = NULL)
 
   if (from_edav) {
     # Default values
-    if (!is.null(path)) {
+    if (is.null(path)) {
       path <- "GID/PEB/SIR/Sandbox/parquet_sandbox"
     }
-    if (!is.null(container)) {
+    if (is.null(container)) {
       container <- get_azure_storage_connection()
     }
 
@@ -141,13 +143,38 @@ upload_parquet_to_edav <- function(src, dest, container = NULL) {
   }
 
   cli::cli_process_start("Uploading parquet folder to EDAV")
-  start <- Sys.time()
-  AzureStor::multiupload_adls_file(container, paste0(src, "/*"), dest,
+  AzureStor::multiupload_adls_file(container, paste0(src, "/*"), file.path(dest, basename(src)),
                                    recursive = TRUE)
   cli::cli_process_done()
-  cli::cli_alert_success(c("Uploaded in: ",
-                           round(difftime(Sys.time(), start, "mins"), 2),
-                           " mins"))
+}
+
+#' Convert WKB back to sf column
+#'
+#' @param sf_data `arrow connection` Geodata arrow connection.
+#'
+#' @returns `tibble` Geodata with `sf`.
+#'
+#' @export
+from_wkb_to_sf <- function(sf_data) {
+
+
+  # Ensure that global shapefiles have Shape and city/roads as geometry. 
+  # Otherwise, need to modify this function.
+  if ("Shape" %in% names(sf_data)) {
+    wkb_col <- "Shape"
+  } else if ("geometry" %in% names(sf_data)) {
+    wkb_col <- "geometry"
+  } else {
+    cli::cli_abort("Not an sf dataset.")
+  }
+
+  sf_data |>
+    dplyr::mutate(dplyr::across(dplyr::any_of(wkb_col), \(x) {
+      sf::st_as_sf(x, EWKB = TRUE, crs = 4326)
+    }))
+  
+  return(sf_data)
+
 }
 
 # Private functions ----
@@ -243,8 +270,9 @@ build_parquet_raw_data_edav <- function(path = NULL, container = NULL, ...) {
     container <- get_azure_storage_connection()
   }
 
-  exist <- edav_io("exists.dir", default_dir = "",
+  exist <- edav_io("exists.dir", NULL,
                    file_loc = path, azcontainer = container)
+  
   if (!exist) {
     cli::cli_abort("The directory does not exist on EDAV.")
   } else {
@@ -252,24 +280,19 @@ build_parquet_raw_data_edav <- function(path = NULL, container = NULL, ...) {
   }
 
   cli::cli_process_start("Building raw_data from EDAV parquet files")
-  start <- Sys.time()
 
   raw_data <- NULL
-  # Download files locally in the temp directory first
-  withr::with_tempdir({
-    local_pq <- file.path(getwd(), basename(path))
-    AzureStor::storage_multidownload(container,
-                                      src = paste0(path, "/*"),
-                                      dest = local_pq,
-                                      recursive = TRUE,
-                                      overwrite = TRUE
-                                      )
-
-    raw_data <- build_parquet_raw_data_local(local_pq)
-    cli::cli_process_done()
-  })
-  
-  cli::cli_process_start(paste0("Built in ", difftime(Sys.time(), start, "mins"), " mins."))
+
+  local_pq <- file.path(rappdirs::user_data_dir("sirfunctions"), basename(path))
+  AzureStor::storage_multidownload(container,
+                                    src = paste0(path, "/*"),
+                                    dest = local_pq,
+                                    recursive = TRUE,
+                                    overwrite = TRUE
+                                    )
+
+  raw_data <- build_parquet_raw_data_local(local_pq)
+  cli::cli_process_done()
 
   return(raw_data)
 
@@ -311,33 +334,4 @@ to_wkb_drop_sf <- function(sf_data) {
      sf_data <- sf::st_drop_geometry(sf_data)
   }
   return(sf_data)
-}
-
-#' Convert WKB back to sf column
-#'
-#' @param sf_data `arrow connection` Geodata arrow connection.
-#'
-#' @returns `tibble` Geodata with `sf`.
-#'
-#' @export
-from_wkb_to_sf <- function(sf_data) {
-
-
-  # Ensure that global shapefiles have Shape and city/roads as geometry. 
-  # Otherwise, need to modify this function.
-  if ("Shape" %in% names(sf_data)) {
-    wkb_col <- "Shape"
-  } else if ("geometry" %in% names(sf_data)) {
-    wkb_col <- "geometry"
-  } else {
-    cli::cli_abort("Not an sf dataset.")
-  }
-
-  sf_data |>
-    dplyr::mutate(dplyr::across(dplyr::any_of(wkb_col), \(x) {
-      sf::st_as_sf(x, EWKB = TRUE, crs = 4326)
-    }))
-  
-  return(sf_data)
-
 }
\ No newline at end of file

From ad1714a05a09df6734202cf0b1c783990a917a2c Mon Sep 17 00:00:00 2001
From: Mervin Keith Cuadera <40894971+mcuadera@users.noreply.github.com>
Date: Mon, 30 Mar 2026 16:02:29 -0400
Subject: [PATCH 12/28] add docs for helper functions

---
 NAMESPACE                     |  3 ++-
 man/build_parquet_raw_data.Rd |  3 +++
 man/from_wkb_to_sf.Rd         | 17 +++++++++++++++++
 man/to_wkb_drop_sf.Rd         | 21 +++++++++++++++++++++
 4 files changed, 43 insertions(+), 1 deletion(-)
 create mode 100644 man/from_wkb_to_sf.Rd
 create mode 100644 man/to_wkb_drop_sf.Rd

diff --git a/NAMESPACE b/NAMESPACE
index 4b589f9b..7368b8d1 100644
--- a/NAMESPACE
+++ b/NAMESPACE
@@ -1,8 +1,8 @@
 # Generated by roxygen2: do not edit by hand
 
 export(add_rolling_years)
-export(check_afp_geographies)
 export(build_parquet_raw_data)
+export(check_afp_geographies)
 export(check_afp_guid_ctry_data)
 export(check_cache)
 export(check_missing_rows)
@@ -38,6 +38,7 @@ export(f.stool.ad.01)
 export(f.timely.detection.01)
 export(fix_ctry_data_missing_guids)
 export(force_load_polio_data_cache)
+export(from_wkb_to_sf)
 export(generate_60_day_tab)
 export(generate_60_day_table_data)
 export(generate_ad_final_col)
diff --git a/man/build_parquet_raw_data.Rd b/man/build_parquet_raw_data.Rd
index 1eef21a7..3390b92a 100644
--- a/man/build_parquet_raw_data.Rd
+++ b/man/build_parquet_raw_data.Rd
@@ -23,6 +23,9 @@ individual datasets in the original output of \code{\link[=get_all_polio_data]{g
 Recreates an output of \code{\link[=get_all_polio_data]{get_all_polio_data()}} from a folder housing
 data in parquet format.
 }
+\details{
+For tibbles with Shapes, pass to \code{\link[=from_wkb_to_sf]{from_wkb_to_sf()}} first before creating maps.
+}
 \examples{
 \dontrun{
 # Building raw_data locally
diff --git a/man/from_wkb_to_sf.Rd b/man/from_wkb_to_sf.Rd
new file mode 100644
index 00000000..33920411
--- /dev/null
+++ b/man/from_wkb_to_sf.Rd
@@ -0,0 +1,17 @@
+% Generated by roxygen2: do not edit by hand
+% Please edit documentation in R/dal.parquet.R
+\name{from_wkb_to_sf}
+\alias{from_wkb_to_sf}
+\title{Convert WKB back to sf column}
+\usage{
+from_wkb_to_sf(sf_data)
+}
+\arguments{
+\item{sf_data}{\verb{arrow connection} Geodata arrow connection.}
+}
+\value{
+\code{tibble} Geodata with \code{sf}.
+}
+\description{
+Convert WKB back to sf column
+}
diff --git a/man/to_wkb_drop_sf.Rd b/man/to_wkb_drop_sf.Rd
new file mode 100644
index 00000000..d1f0f560
--- /dev/null
+++ b/man/to_wkb_drop_sf.Rd
@@ -0,0 +1,21 @@
+% Generated by roxygen2: do not edit by hand
+% Please edit documentation in R/dal.parquet.R
+\name{to_wkb_drop_sf}
+\alias{to_wkb_drop_sf}
+\title{Drop Shape column and convert to binary}
+\usage{
+to_wkb_drop_sf(sf_data)
+}
+\arguments{
+\item{x}{\code{sf} or \code{data.frame} Geodata.}
+}
+\value{
+\code{tibble} dData without any Shape column.
+}
+\description{
+Drop Shape column and convert to binary
+}
+\details{
+This function was written using the CDC EDAV Chatbot using the model GPT-5.2.
+}
+\keyword{internal}

From 355844a4f614da16690a0132039be9f72e77a39f Mon Sep 17 00:00:00 2001
From: Mervin Keith Cuadera <40894971+mcuadera@users.noreply.github.com>
Date: Mon, 30 Mar 2026 16:07:08 -0400
Subject: [PATCH 13/28] format R code

---
 R/dal.parquet.R | 211 ++++++++++++++++++++++++++----------------------
 1 file changed, 114 insertions(+), 97 deletions(-)

diff --git a/R/dal.parquet.R b/R/dal.parquet.R
index 34c03ac3..30ce57e5 100644
--- a/R/dal.parquet.R
+++ b/R/dal.parquet.R
@@ -13,8 +13,7 @@
 #' raw_data <- get_all_polio_data()
 #' create_raw_data_parquet(raw_data, "C:/Users/ABC1/Desktop/raw_data_parquet")
 #' }
-create_raw_data_parquet <- function(raw_data, path){
-
+create_raw_data_parquet <- function(raw_data, path) {
   df_names <- names(raw_data)
 
   old_threads <- getOption("arrow.use_threads")
@@ -30,29 +29,35 @@ create_raw_data_parquet <- function(raw_data, path){
   for (i in df_names) {
     cli::cli_alert_info(paste0("Now processing: ", i))
 
-    data <- 
-
-    if (i %in% c("global.ctry", "global.prov", "global.dist")) {
-      raw_data[[i]] |>
-        to_wkb_drop_sf() |>
-        arrow::write_dataset(path = file.path(path, i),
-                             partitioning = get_partition_cols(i))
-
-    } else if (i %in% c("cities", "roads")) {
-      raw_data[[i]] |>
-        to_wkb_drop_sf() |>
-        arrow::write_dataset(path = file.path(path, i),
-                             partitioning = get_partition_cols(i))
-
-    } else if (i == "metadata") {
-      raw_data[[i]] |>
-        dplyr::as_tibble() |>
-        arrow::write_dataset(path = file.path(path, i),
-                             partitioning = get_partition_cols(i))
-    } else {
-      raw_data[[i]] |> arrow::write_dataset(path = file.path(path, i),
-                                            partitioning = get_partition_cols(i))
-    }
+    data <-
+      if (i %in% c("global.ctry", "global.prov", "global.dist")) {
+        raw_data[[i]] |>
+          to_wkb_drop_sf() |>
+          arrow::write_dataset(
+            path = file.path(path, i),
+            partitioning = get_partition_cols(i)
+          )
+      } else if (i %in% c("cities", "roads")) {
+        raw_data[[i]] |>
+          to_wkb_drop_sf() |>
+          arrow::write_dataset(
+            path = file.path(path, i),
+            partitioning = get_partition_cols(i)
+          )
+      } else if (i == "metadata") {
+        raw_data[[i]] |>
+          dplyr::as_tibble() |>
+          arrow::write_dataset(
+            path = file.path(path, i),
+            partitioning = get_partition_cols(i)
+          )
+      } else {
+        raw_data[[i]] |>
+          arrow::write_dataset(
+            path = file.path(path, i),
+            partitioning = get_partition_cols(i)
+          )
+      }
 
     cli::cli_alert_info(paste0(iter, "/", length(df_names), " processed."))
     iter <- iter + 1
@@ -72,7 +77,7 @@ create_raw_data_parquet <- function(raw_data, path){
 #' @param container `azcontainer` An instance of an Azure container to connect
 #' to. Pass [get_azure_storage_connection()] using defaults if not accessing
 #' using a service principal.
-#' 
+#'
 #' @details
 #' For tibbles with Shapes, pass to [from_wkb_to_sf()] first before creating maps.
 #'
@@ -89,8 +94,11 @@ create_raw_data_parquet <- function(raw_data, path){
 #' # Build raw_data from EDAV
 #' raw_data <- build_parquet_raw_data()
 #' }
-build_parquet_raw_data <- function(path = NULL, from_edav = F, container = NULL) {
-
+build_parquet_raw_data <- function(
+  path = NULL,
+  from_edav = F,
+  container = NULL
+) {
   if (from_edav) {
     # Default values
     if (is.null(path)) {
@@ -130,21 +138,18 @@ upload_parquet_to_edav <- function(src, dest, container = NULL) {
     container <- get_azure_storage_connection()
   }
 
-  while (TRUE) {
-    cli::cli_alert_info(paste0("Confirm upload to: ", dest, "/", basename(src), " (y/n)"))
-    response <- stringr::str_to_lower(stringr::str_trim(readline("> ")))
-    if (!response %in% c("y", "n")) {
-      cli::cli_alert_warning("Invalid response. Try again.")
-    } else if (response == "n") {
-      cli::cli_alert("Upload cancelled.")
-    } else if (response == "y") {
-      break
-    }
+  dir_exists <- edav_io("exists.dir", NULL, dest)
+  if (!dir_exists) {
+    cli::cli_abort("Folder doesn't exist on EDAV. Unable to upload")
   }
 
   cli::cli_process_start("Uploading parquet folder to EDAV")
-  AzureStor::multiupload_adls_file(container, paste0(src, "/*"), file.path(dest, basename(src)),
-                                   recursive = TRUE)
+  AzureStor::multiupload_adls_file(
+    container,
+    paste0(src, "/*"),
+    file.path(dest, basename(src)),
+    recursive = TRUE
+  )
   cli::cli_process_done()
 }
 
@@ -156,9 +161,7 @@ upload_parquet_to_edav <- function(src, dest, container = NULL) {
 #'
 #' @export
 from_wkb_to_sf <- function(sf_data) {
-
-
-  # Ensure that global shapefiles have Shape and city/roads as geometry. 
+  # Ensure that global shapefiles have Shape and city/roads as geometry.
   # Otherwise, need to modify this function.
   if ("Shape" %in% names(sf_data)) {
     wkb_col <- "Shape"
@@ -172,9 +175,8 @@ from_wkb_to_sf <- function(sf_data) {
     dplyr::mutate(dplyr::across(dplyr::any_of(wkb_col), \(x) {
       sf::st_as_sf(x, EWKB = TRUE, crs = 4326)
     }))
-  
-  return(sf_data)
 
+  return(sf_data)
 }
 
 # Private functions ----
@@ -191,32 +193,33 @@ from_wkb_to_sf <- function(sf_data) {
 #' get_partition_cols("afp")
 #' }
 get_partition_cols <- function(name) {
-  switch(name,
-         "afp" = "yronset",
-         "afp.dupe" = "yronset",
-         "afp.epi" = "yronset",
-         "para.case" = "yronset",
-         "es" = "collect.yr",
-         "es.dupe" = "collect.yr",
-         "sia" = "yr.sia",
-         "sia.dupe" = "yr.sia",
-         "pos" = "yronset",
-         "pos.dupe" = "yronset",
-         "other" = "yronset",
-         "other.dupe" = "yronset",
-         "dist.pop" = "year",
-         "prov.pop" = "year",
-         "ctry.pop" = "year",
-         "global.ctry" = "WHO_REGION",
-         "global.prov" = "WHO_REGION",
-         "global.dist" = "WHO_REGION",
-         "ctry.coverage" = "year",
-         "prov.coverage" = "year",
-         "dist.coverage" = "year",
-         "roads" = "continent",
-         "cities" = "POP_CLASS",
-         "metadata" = "download_time"
-         )
+  switch(
+    name,
+    "afp" = "yronset",
+    "afp.dupe" = "yronset",
+    "afp.epi" = "yronset",
+    "para.case" = "yronset",
+    "es" = "collect.yr",
+    "es.dupe" = "collect.yr",
+    "sia" = "yr.sia",
+    "sia.dupe" = "yr.sia",
+    "pos" = "yronset",
+    "pos.dupe" = "yronset",
+    "other" = "yronset",
+    "other.dupe" = "yronset",
+    "dist.pop" = "year",
+    "prov.pop" = "year",
+    "ctry.pop" = "year",
+    "global.ctry" = "WHO_REGION",
+    "global.prov" = "WHO_REGION",
+    "global.dist" = "WHO_REGION",
+    "ctry.coverage" = "year",
+    "prov.coverage" = "year",
+    "dist.coverage" = "year",
+    "roads" = "continent",
+    "cities" = "POP_CLASS",
+    "metadata" = "download_time"
+  )
 }
 
 #' Build raw_data using local parquet files
@@ -228,18 +231,36 @@ get_partition_cols <- function(name) {
 #' @keywords internal
 #'
 build_parquet_raw_data_local <- function(path = NULL) {
-
   if (!dir.exists(path)) {
     cli::cli_abort("Not a valid directory.")
   }
 
-  valid_values <- c("afp", "afp.dupe", "afp.epi", "para.case", "es", "es.dupe",
-                    "sia", "sia.dupe", "pos", "pos.dupe", "other", "other.dupe",
-                    "dist.pop", "prov.pop", "ctry.pop", "global.ctry",
-                    "global.prov", "global.dist", 
-                    "ctry.coverage", "prov.coverage", "dist.coverage",
-                    "roads" , "cities", "metadata"
-                    )
+  valid_values <- c(
+    "afp",
+    "afp.dupe",
+    "afp.epi",
+    "para.case",
+    "es",
+    "es.dupe",
+    "sia",
+    "sia.dupe",
+    "pos",
+    "pos.dupe",
+    "other",
+    "other.dupe",
+    "dist.pop",
+    "prov.pop",
+    "ctry.pop",
+    "global.ctry",
+    "global.prov",
+    "global.dist",
+    "ctry.coverage",
+    "prov.coverage",
+    "dist.coverage",
+    "roads",
+    "cities",
+    "metadata"
+  )
   data <- list.files(path)
   data <- intersect(data, valid_values)
 
@@ -249,7 +270,6 @@ build_parquet_raw_data_local <- function(path = NULL) {
   }
 
   return(raw_data)
-
 }
 
 #' Build raw_data using EDAV files
@@ -265,14 +285,12 @@ build_parquet_raw_data_local <- function(path = NULL) {
 #' @keywords internal
 #'
 build_parquet_raw_data_edav <- function(path = NULL, container = NULL, ...) {
-
   if (is.null(container)) {
     container <- get_azure_storage_connection()
   }
 
-  exist <- edav_io("exists.dir", NULL,
-                   file_loc = path, azcontainer = container)
-  
+  exist <- edav_io("exists.dir", NULL, file_loc = path, azcontainer = container)
+
   if (!exist) {
     cli::cli_abort("The directory does not exist on EDAV.")
   } else {
@@ -284,32 +302,31 @@ build_parquet_raw_data_edav <- function(path = NULL, container = NULL, ...) {
   raw_data <- NULL
 
   local_pq <- file.path(rappdirs::user_data_dir("sirfunctions"), basename(path))
-  AzureStor::storage_multidownload(container,
-                                    src = paste0(path, "/*"),
-                                    dest = local_pq,
-                                    recursive = TRUE,
-                                    overwrite = TRUE
-                                    )
+  AzureStor::storage_multidownload(
+    container,
+    src = paste0(path, "/*"),
+    dest = local_pq,
+    recursive = TRUE,
+    overwrite = TRUE
+  )
 
   raw_data <- build_parquet_raw_data_local(local_pq)
   cli::cli_process_done()
 
   return(raw_data)
-
 }
 
 #' Drop Shape column and convert to binary
 #'
 #' @param x `sf` or `data.frame` Geodata.
-#' 
+#'
 #' @details
 #' This function was written using the CDC EDAV Chatbot using the model GPT-5.2.
 #' @returns `tibble` dData without any Shape column.
 #'
 #' @keywords internal
-#' 
+#'
 to_wkb_drop_sf <- function(sf_data) {
-
   if ("Shape" %in% names(sf_data)) {
     geom_col <- "Shape"
   } else if ("geometry" %in% names(sf_data)) {
@@ -323,15 +340,15 @@ to_wkb_drop_sf <- function(sf_data) {
     sf::st_geometry(sf_data)
   } else {
     sf_data[[geom_col]]
-  } 
+  }
 
   # Convert to WKB (list of raw vectors), then drop the "WKB" class
   wkb <- sf::st_as_binary(geom)
-  wkb <- unclass(wkb)   # <- key line: makes it a plain list Arrow can infer
+  wkb <- unclass(wkb) # <- key line: makes it a plain list Arrow can infer
 
   sf_data[[geom_col]] <- wkb
   if (inherits(sf_data, "sf")) {
-     sf_data <- sf::st_drop_geometry(sf_data)
+    sf_data <- sf::st_drop_geometry(sf_data)
   }
   return(sf_data)
 }
\ No newline at end of file

From a8eed2220b23c8847210ffa5620f4f58de46f846 Mon Sep 17 00:00:00 2001
From: Mervin Keith Cuadera <40894971+mcuadera@users.noreply.github.com>
Date: Mon, 30 Mar 2026 16:08:44 -0400
Subject: [PATCH 14/28] don't default to a file path

---
 R/dal.parquet.R | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/R/dal.parquet.R b/R/dal.parquet.R
index 30ce57e5..a12a068f 100644
--- a/R/dal.parquet.R
+++ b/R/dal.parquet.R
@@ -102,7 +102,7 @@ build_parquet_raw_data <- function(
   if (from_edav) {
     # Default values
     if (is.null(path)) {
-      path <- "GID/PEB/SIR/Sandbox/parquet_sandbox"
+      cli::cli_abort("Please pass a file path to the parquet folder")
     }
     if (is.null(container)) {
       container <- get_azure_storage_connection()

From 79e669e78883d3b670f937be36fdfb77c4246de5 Mon Sep 17 00:00:00 2001
From: Mervin Keith Cuadera <40894971+mcuadera@users.noreply.github.com>
Date: Mon, 30 Mar 2026 16:30:53 -0400
Subject: [PATCH 15/28] provide ability to extract specific datasets from raw
 data

---
 R/dal.parquet.R                     | 34 ++++++++++++++++++++++-------
 man/build_parquet_raw_data.Rd       |  7 +++++-
 man/build_parquet_raw_data_edav.Rd  |  2 +-
 man/build_parquet_raw_data_local.Rd |  2 +-
 4 files changed, 34 insertions(+), 11 deletions(-)

diff --git a/R/dal.parquet.R b/R/dal.parquet.R
index a12a068f..d00bd89e 100644
--- a/R/dal.parquet.R
+++ b/R/dal.parquet.R
@@ -97,6 +97,7 @@ create_raw_data_parquet <- function(raw_data, path) {
 build_parquet_raw_data <- function(
   path = NULL,
   from_edav = F,
+  dataset = "all",
   container = NULL
 ) {
   if (from_edav) {
@@ -108,9 +109,9 @@ build_parquet_raw_data <- function(
       container <- get_azure_storage_connection()
     }
 
-    raw_data <- build_parquet_raw_data_edav(path, container)
+    raw_data <- build_parquet_raw_data_edav(path, dataset, container)
   } else {
-    raw_data <- build_parquet_raw_data_local(path)
+    raw_data <- build_parquet_raw_data_local(path, dataset)
   }
 
   return(raw_data)
@@ -225,12 +226,13 @@ get_partition_cols <- function(name) {
 #' Build raw_data using local parquet files
 #'
 #' @param path `str` A path to the parquet directory
+#' @param dataset `str` A specific dataset. Defaults to `"all"`. Otherwise, can specify any dataset in the list returned by [get_all_polio_data()]. 
 #'
 #' @returns `list` A list containing connections to the folders associated with
 #' individual datasets in the original output of [get_all_polio_data()].
 #' @keywords internal
 #'
-build_parquet_raw_data_local <- function(path = NULL) {
+build_parquet_raw_data_local <- function(path = NULL, dataset = "all") {
   if (!dir.exists(path)) {
     cli::cli_abort("Not a valid directory.")
   }
@@ -262,7 +264,11 @@ build_parquet_raw_data_local <- function(path = NULL) {
     "metadata"
   )
   data <- list.files(path)
-  data <- intersect(data, valid_values)
+  if (dataset == "all") {
+    data <- intersect(data, valid_values)
+  } else {
+    data <- intersect(data, dataset)
+  }
 
   raw_data <- list()
   for (i in data) {
@@ -276,6 +282,7 @@ build_parquet_raw_data_local <- function(path = NULL) {
 #'
 #' @param path `str` Path to EDAV folder containing parquet files. This must
 #' be the absolute file path from the Blob endpoint of the container.
+#' @param dataset `str` A specific dataset. Defaults to `"all"`. Otherwise, can specify any dataset in the list returned by [get_all_polio_data()]. 
 #' @param container `azcontainer` An instance of an Azure container to connect
 #' to. Pass [get_azure_storage_connection()] using defaults if not accessing
 #' using a service principal.
@@ -284,7 +291,7 @@ build_parquet_raw_data_local <- function(path = NULL) {
 #' individual datasets in the original output of [get_all_polio_data()].
 #' @keywords internal
 #'
-build_parquet_raw_data_edav <- function(path = NULL, container = NULL, ...) {
+build_parquet_raw_data_edav <- function(path = NULL, dataset = "all", container = NULL) {
   if (is.null(container)) {
     container <- get_azure_storage_connection()
   }
@@ -301,19 +308,30 @@ build_parquet_raw_data_edav <- function(path = NULL, container = NULL, ...) {
 
   raw_data <- NULL
 
+  if (dataset == "all") {
+    source_path <- paste0(path, "raw_data_parquet/*")
+  } else {
+    source_path <- paste0(file.path(path, "raw_data_parquet", dataset), "/*")
+  }
+
   local_pq <- file.path(rappdirs::user_data_dir("sirfunctions"), basename(path))
   AzureStor::storage_multidownload(
     container,
-    src = paste0(path, "/*"),
+    src = source_path,
     dest = local_pq,
     recursive = TRUE,
     overwrite = TRUE
   )
 
-  raw_data <- build_parquet_raw_data_local(local_pq)
+  raw_data <- build_parquet_raw_data_local(local_pq, dataset)
   cli::cli_process_done()
+  
+  if (length(raw_data) == 1) {
+    return(raw_data[[1]])
+  } else {
+    return(raw_data)
+  }
 
-  return(raw_data)
 }
 
 #' Drop Shape column and convert to binary
diff --git a/man/build_parquet_raw_data.Rd b/man/build_parquet_raw_data.Rd
index 3390b92a..6f5de4cb 100644
--- a/man/build_parquet_raw_data.Rd
+++ b/man/build_parquet_raw_data.Rd
@@ -4,7 +4,12 @@
 \alias{build_parquet_raw_data}
 \title{Recreate raw data from local parquet folder}
 \usage{
-build_parquet_raw_data(path = NULL, from_edav = F, container = NULL)
+build_parquet_raw_data(
+  path = NULL,
+  from_edav = F,
+  dataset = "all",
+  container = NULL
+)
 }
 \arguments{
 \item{path}{\code{str} Local path to the parquet folder}
diff --git a/man/build_parquet_raw_data_edav.Rd b/man/build_parquet_raw_data_edav.Rd
index a8af4023..a61f0a83 100644
--- a/man/build_parquet_raw_data_edav.Rd
+++ b/man/build_parquet_raw_data_edav.Rd
@@ -4,7 +4,7 @@
 \alias{build_parquet_raw_data_edav}
 \title{Build raw_data using EDAV files}
 \usage{
-build_parquet_raw_data_edav(path = NULL, container = NULL, ...)
+build_parquet_raw_data_edav(path = NULL, container = NULL, dataset = "all")
 }
 \arguments{
 \item{path}{\code{str} Path to EDAV folder containing parquet files. This must
diff --git a/man/build_parquet_raw_data_local.Rd b/man/build_parquet_raw_data_local.Rd
index d6b7aba5..fd6e95df 100644
--- a/man/build_parquet_raw_data_local.Rd
+++ b/man/build_parquet_raw_data_local.Rd
@@ -4,7 +4,7 @@
 \alias{build_parquet_raw_data_local}
 \title{Build raw_data using local parquet files}
 \usage{
-build_parquet_raw_data_local(path = NULL)
+build_parquet_raw_data_local(path = NULL, dataset = "all")
 }
 \arguments{
 \item{path}{\code{str} A path to the parquet directory}

From 1a330be2ec6de6774b25cc721d2b83e673917ec6 Mon Sep 17 00:00:00 2001
From: Mervin Keith Cuadera <40894971+mcuadera@users.noreply.github.com>
Date: Tue, 31 Mar 2026 10:28:54 -0400
Subject: [PATCH 16/28] add logic for loading specific datasets

---
 R/dal.parquet.R                     | 150 ++++++++++++++++++++++++----
 man/build_parquet_raw_data_edav.Rd  |   4 +-
 man/build_parquet_raw_data_local.Rd |   2 +
 3 files changed, 134 insertions(+), 22 deletions(-)

diff --git a/R/dal.parquet.R b/R/dal.parquet.R
index d00bd89e..72d9f6d6 100644
--- a/R/dal.parquet.R
+++ b/R/dal.parquet.R
@@ -226,7 +226,7 @@ get_partition_cols <- function(name) {
 #' Build raw_data using local parquet files
 #'
 #' @param path `str` A path to the parquet directory
-#' @param dataset `str` A specific dataset. Defaults to `"all"`. Otherwise, can specify any dataset in the list returned by [get_all_polio_data()]. 
+#' @param dataset `str` A specific dataset. Defaults to `"all"`. Otherwise, can specify any dataset in the list returned by [get_all_polio_data()].
 #'
 #' @returns `list` A list containing connections to the folders associated with
 #' individual datasets in the original output of [get_all_polio_data()].
@@ -264,15 +264,52 @@ build_parquet_raw_data_local <- function(path = NULL, dataset = "all") {
     "metadata"
   )
   data <- list.files(path)
-  if (dataset == "all") {
-    data <- intersect(data, valid_values)
-  } else {
-    data <- intersect(data, dataset)
-  }
 
-  raw_data <- list()
-  for (i in data) {
-    raw_data[[i]] <- arrow::open_dataset(file.path(path, i))
+  if (length(dataset) == 1 && dataset == "all") {
+    raw_data <- list()
+
+    for (i in valid_values) {
+      
+      tryCatch({
+        raw_data[[i]] <- arrow::open_dataset(file.path(path, i))
+      }, error = \(e) {
+        cli::cli_alert_info(paste0("Dataset not found and won't be added: ", i))
+        raw_data[[i]] <- NULL
+      })
+      
+    }
+  } else if (length(dataset) > 1) {
+    invalid <- setdiff(dataset, valid_values)
+
+    if (length(invalid) > 0) {
+      cli::cli_alert_info("The following type passed are invalid and won't be loaded: ")
+      cli::cli_li(invalid)
+    }
+
+    valid <- dataset[!dataset %in% invalid]
+
+    if (length(valid) == 0) {
+      cli::cli_abort("All the dataset passed are invalid.")
+    }
+
+    has_all <- sum(stringr::str_detect(valid, "all"))
+
+    if (has_all >= 1) {
+      cli::cli_abort("Please pass only 'all'.")
+    }
+
+    raw_data <- list()
+    
+    for (i in valid) {
+      tryCatch({
+        raw_data[[i]] <- arrow::open_dataset(file.path(path, i))
+      }, error = \(e) {
+        cli::cli_alert_info(paste0("Dataset not found and won't be added: ", i))
+        raw_data[[i]] <- NULL
+      })
+    }
+  } else if (length(dataset) == 1 && dataset %in% valid_values) {
+    raw_data <- arrow::open_dataset(file.path(path, dataset))
   }
 
   return(raw_data)
@@ -296,6 +333,33 @@ build_parquet_raw_data_edav <- function(path = NULL, dataset = "all", container
     container <- get_azure_storage_connection()
   }
 
+  valid_values <- c(
+    "afp",
+    "afp.dupe",
+    "afp.epi",
+    "para.case",
+    "es",
+    "es.dupe",
+    "sia",
+    "sia.dupe",
+    "pos",
+    "pos.dupe",
+    "other",
+    "other.dupe",
+    "dist.pop",
+    "prov.pop",
+    "ctry.pop",
+    "global.ctry",
+    "global.prov",
+    "global.dist",
+    "ctry.coverage",
+    "prov.coverage",
+    "dist.coverage",
+    "roads",
+    "cities",
+    "metadata"
+  )
+
   exist <- edav_io("exists.dir", NULL, file_loc = path, azcontainer = container)
 
   if (!exist) {
@@ -308,29 +372,73 @@ build_parquet_raw_data_edav <- function(path = NULL, dataset = "all", container
 
   raw_data <- NULL
 
-  if (dataset == "all") {
-    source_path <- paste0(path, "raw_data_parquet/*")
-  } else {
+  if (length(dataset) == 1 && dataset == "all") {
+    source_path <- file.path(path, "raw_data_parquet/*")
+    local_pq <- file.path(rappdirs::user_data_dir("sirfunctions"), "raw_data_parquet")
+  } else if (length(dataset) > 1) {
+
+    invalid <- setdiff(dataset, valid_values)
+
+    if (length(invalid) > 0) {
+      cli::cli_alert_info(
+        "The following type passed are invalid and won't be loaded: "
+      )
+      cli::cli_li(invalid)
+    }
+
+    valid <- dataset[!dataset %in% invalid]
+
+    if (length(valid) == 0) {
+      cli::cli_abort("All the dataset passed are invalid.")
+    }
+
+    has_all <- sum(stringr::str_detect(valid, "all"))
+
+    if (has_all >= 1) {
+      cli::cli_abort("Please pass only 'all'.")
+    }
+
+    source_path <- paste0(file.path(path, "raw_data_parquet"), "/", valid, "/*")
+    local_pq <- paste0(file.path(rappdirs::user_data_dir("sirfunctions"), "raw_data_parquet"), "/", valid)
+  } else if (length(dataset) == 1 && dataset %in% valid_values) {
     source_path <- paste0(file.path(path, "raw_data_parquet", dataset), "/*")
+    local_pq <- paste0(file.path(rappdirs::user_data_dir("sirfunctions"), "raw_data_parquet"), "/", dataset)
   }
 
-  local_pq <- file.path(rappdirs::user_data_dir("sirfunctions"), basename(path))
-  AzureStor::storage_multidownload(
+  for (i in local_pq) {
+
+
+      unlink(i, recursive = TRUE, force = TRUE)
+      dir.create(i, recursive = TRUE)
+  
+    
+  }
+
+  if (length(source_path) > 1) {
+    for (i in seq_along(source_path)) {
+      # One download per dataset: source_path[i] pairs with local_pq[i].
+      AzureStor::storage_multidownload(
+        container,
+        src = source_path[i],
+        dest = local_pq[i],
+        recursive = TRUE,
+        overwrite = TRUE
+      )
+    }
+  } else {
+    AzureStor::storage_multidownload(
     container,
     src = source_path,
     dest = local_pq,
     recursive = TRUE,
     overwrite = TRUE
   )
+  }
 
-  raw_data <- build_parquet_raw_data_local(local_pq, dataset)
+  raw_data <- build_parquet_raw_data_local(file.path(rappdirs::user_data_dir("sirfunctions"), "raw_data_parquet"), dataset)
   cli::cli_process_done()
-  
-  if (length(raw_data) == 1) {
-    return(raw_data[[1]])
-  } else {
-    return(raw_data)
-  }
+
+  return(raw_data)
 
 }
 
diff --git a/man/build_parquet_raw_data_edav.Rd b/man/build_parquet_raw_data_edav.Rd
index a61f0a83..849f2ea6 100644
--- a/man/build_parquet_raw_data_edav.Rd
+++ b/man/build_parquet_raw_data_edav.Rd
@@ -4,12 +4,14 @@
 \alias{build_parquet_raw_data_edav}
 \title{Build raw_data using EDAV files}
 \usage{
-build_parquet_raw_data_edav(path = NULL, container = NULL, dataset = "all")
+build_parquet_raw_data_edav(path = NULL, dataset = "all", container = NULL)
 }
 \arguments{
 \item{path}{\code{str} Path to EDAV folder containing parquet files. This must
 be the absolute file path from the Blob endpoint of the container.}
 
+\item{dataset}{\code{str} A specific dataset. Defaults to \code{"all"}. Otherwise, can specify any dataset in the list returned by \code{\link[=get_all_polio_data]{get_all_polio_data()}}.}
+
 \item{container}{\code{azcontainer} An instance of an Azure container to connect
 to. Pass \code{\link[=get_azure_storage_connection]{get_azure_storage_connection()}} using defaults if not accessing
 using a service principal.}
diff --git a/man/build_parquet_raw_data_local.Rd b/man/build_parquet_raw_data_local.Rd
index fd6e95df..f4480ca1 100644
--- a/man/build_parquet_raw_data_local.Rd
+++ b/man/build_parquet_raw_data_local.Rd
@@ -8,6 +8,8 @@ build_parquet_raw_data_local(path = NULL, dataset = "all")
 }
 \arguments{
 \item{path}{\code{str} A path to the parquet directory}
+
+\item{dataset}{\code{str} A specific dataset. Defaults to \code{"all"}. Otherwise, can specify any dataset in the list returned by \code{\link[=get_all_polio_data]{get_all_polio_data()}}.}
 }
 \value{
 \code{list} A list containing connections to the folders associated with

From ee3e18b1ab02325c10ae39da895543544e71a5c6 Mon Sep 17 00:00:00 2001
From: Mervin Keith Cuadera <40894971+mcuadera@users.noreply.github.com>
Date: Tue, 31 Mar 2026 11:16:28 -0400
Subject: [PATCH 17/28] fix issue with saving sf columns

---
 R/dal.parquet.R | 22 ++++++++++++----------
 1 file changed, 12 insertions(+), 10 deletions(-)

diff --git a/R/dal.parquet.R b/R/dal.parquet.R
index 72d9f6d6..5304473d 100644
--- a/R/dal.parquet.R
+++ b/R/dal.parquet.R
@@ -164,19 +164,23 @@ upload_parquet_to_edav <- function(src, dest, container = NULL) {
 from_wkb_to_sf <- function(sf_data) {
   # Ensure that global shapefiles have Shape and city/roads as geometry.
   # Otherwise, need to modify this function.
+
+  if (inherits(sf_data, "ArrowObject")) {
+    cli::cli_abort("Please run dplyr::collect() first prior to passing to the function.")
+  }
+
   if ("Shape" %in% names(sf_data)) {
-    wkb_col <- "Shape"
+    sf_data <- sf_data |>
+      dplyr::mutate(Shape = sf::st_as_sfc(Shape, EWKB = TRUE, crs = 4326)) |>
+      sf::st_as_sf()
   } else if ("geometry" %in% names(sf_data)) {
-    wkb_col <- "geometry"
+    sf_data <- sf_data |>
+      dplyr::mutate(geometry = sf::st_as_sfc(geometry, EWKB = TRUE, crs = 4326)) |>
+      sf::st_as_sf()
   } else {
     cli::cli_abort("Not an sf dataset.")
   }
 
-  sf_data |>
-    dplyr::mutate(dplyr::across(dplyr::any_of(wkb_col), \(x) {
-      sf::st_as_sf(x, EWKB = TRUE, crs = 4326)
-    }))
-
   return(sf_data)
 }
 
@@ -473,8 +477,6 @@ to_wkb_drop_sf <- function(sf_data) {
   wkb <- unclass(wkb) # <- key line: makes it a plain list Arrow can infer
 
   sf_data[[geom_col]] <- wkb
-  if (inherits(sf_data, "sf")) {
-    sf_data <- sf::st_drop_geometry(sf_data)
-  }
+
   return(sf_data)
 }
\ No newline at end of file

From ade9e88339de4c513b3717c81af3178c27b92046 Mon Sep 17 00:00:00 2001
From: Mervin Keith Cuadera <40894971+mcuadera@users.noreply.github.com>
Date: Tue, 31 Mar 2026 13:36:19 -0400
Subject: [PATCH 18/28] add example of use from_wkb_to_sf

---
 R/dal.parquet.R       | 8 ++++++++
 man/from_wkb_to_sf.Rd | 9 +++++++++
 2 files changed, 17 insertions(+)

diff --git a/R/dal.parquet.R b/R/dal.parquet.R
index 5304473d..7f04aeb2 100644
--- a/R/dal.parquet.R
+++ b/R/dal.parquet.R
@@ -161,6 +161,14 @@ upload_parquet_to_edav <- function(src, dest, container = NULL) {
 #' @returns `tibble` Geodata with `sf`.
 #'
 #' @export
+#' @examples
+#' \dontrun {
+#' raw_data <- build_parquet_raw_data()
+#' kenya_ctry_sf <- raw_data$global.ctry |> 
+#'     dplyr::filter(ctry == "KENYA") |> 
+#'     dplyr::collect() |> 
+#'     from_wkb_to_sf()
+#' }
 from_wkb_to_sf <- function(sf_data) {
   # Ensure that global shapefiles have Shape and city/roads as geometry.
   # Otherwise, need to modify this function.
diff --git a/man/from_wkb_to_sf.Rd b/man/from_wkb_to_sf.Rd
index 33920411..e172bfdd 100644
--- a/man/from_wkb_to_sf.Rd
+++ b/man/from_wkb_to_sf.Rd
@@ -15,3 +15,12 @@ from_wkb_to_sf(sf_data)
 \description{
 Convert WKB back to sf column
 }
+\examples{
+\dontrun {
+raw_data <- build_parquet_raw_data()
+kenya_ctry_sf <- raw_data$global.ctry |> 
+    dplyr::filter(ctry == "KENYA") |> 
+    dplyr::collect() |> 
+    from_wkb_to_sf()
+}
+}

From 38a131f713db62011a984c5911d2c8fdf4668b50 Mon Sep 17 00:00:00 2001
From: Mervin Keith Cuadera <40894971+mcuadera@users.noreply.github.com>
Date: Tue, 31 Mar 2026 15:48:53 -0400
Subject: [PATCH 19/28] add docs

---
 R/dal.parquet.R               | 22 ++++++++--------------
 man/build_parquet_raw_data.Rd | 12 ++++++------
 man/from_wkb_to_sf.Rd         |  2 +-
 3 files changed, 15 insertions(+), 21 deletions(-)

diff --git a/R/dal.parquet.R b/R/dal.parquet.R
index 7f04aeb2..403e2828 100644
--- a/R/dal.parquet.R
+++ b/R/dal.parquet.R
@@ -72,9 +72,9 @@ create_raw_data_parquet <- function(raw_data, path) {
 #' Recreates an output of [get_all_polio_data()] from a folder housing
 #' data in parquet format.
 #'
-#' @param path `str` Local path to the parquet folder
-#' @param from_edav `bool` Build using local files or files in EDAV?
-#' @param container `azcontainer` An instance of an Azure container to connect
+#' @param path `str` Absolute path to the parquet folder.
+#' @param from_edav `bool` Build using local files or files in EDAV? Defaults to TRUE.
+#' @param container `azcontainer` An instance of an Azure container to connect
 #' to. Pass [get_azure_storage_connection()] using defaults if not accessing
 #' using a service principal.
 #'
@@ -95,19 +95,16 @@ create_raw_data_parquet <- function(raw_data, path) {
 #' raw_data <- build_parquet_raw_data()
 #' }
 build_parquet_raw_data <- function(
-  path = NULL,
-  from_edav = F,
+  path = "GID/PEB/SIR/Data/analytic",
+  from_edav = TRUE,
   dataset = "all",
-  container = NULL
+  container = get_azure_storage_connection()
 ) {
   if (from_edav) {
     # Default values
     if (is.null(path)) {
       cli::cli_abort("Please pass a file path to the parquet folder")
     }
-    if (is.null(container)) {
-      container <- get_azure_storage_connection()
-    }
 
     raw_data <- build_parquet_raw_data_edav(path, dataset, container)
   } else {
@@ -162,7 +159,7 @@ upload_parquet_to_edav <- function(src, dest, container = NULL) {
 #'
 #' @export
 #' @examples
-#' \dontrun {
+#' \dontrun{
 #' raw_data <- build_parquet_raw_data()
 #' kenya_ctry_sf <- raw_data$global.ctry |> 
 #'     dplyr::filter(ctry == "KENYA") |> 
@@ -340,10 +337,7 @@ build_parquet_raw_data_local <- function(path = NULL, dataset = "all") {
 #' individual datasets in the original output of [get_all_polio_data()].
 #' @keywords internal
 #'
-build_parquet_raw_data_edav <- function(path = NULL, dataset = "all", container = NULL) {
-  if (is.null(container)) {
-    container <- get_azure_storage_connection()
-  }
+build_parquet_raw_data_edav <- function(path = NULL, dataset = "all", container = get_azure_storage_connection()) {
 
   valid_values <- c(
     "afp",
diff --git a/man/build_parquet_raw_data.Rd b/man/build_parquet_raw_data.Rd
index 6f5de4cb..96c6240e 100644
--- a/man/build_parquet_raw_data.Rd
+++ b/man/build_parquet_raw_data.Rd
@@ -5,18 +5,18 @@
 \title{Recreate raw data from local parquet folder}
 \usage{
 build_parquet_raw_data(
-  path = NULL,
-  from_edav = F,
+  path = "GID/PEB/SIR/Data/analytic",
+  from_edav = TRUE,
   dataset = "all",
-  container = NULL
+  container = get_azure_storage_connection()
 )
 }
 \arguments{
-\item{path}{\code{str} Local path to the parquet folder}
+\item{path}{\code{str} Absolute path to the parquet folder.}
 
-\item{from_edav}{\code{bool} Build using local files or files in EDAV?}
+\item{from_edav}{\code{bool} Build using local files or files in EDAV? Defaults to TRUE.}
 
-\item{container}{\code{azcontainer} An instance of an Azure container to connect
+\item{container}{\code{azcontainer} An instance of an Azure container to connect
 to. Pass \code{\link[=get_azure_storage_connection]{get_azure_storage_connection()}} using defaults if not accessing
 using a service principal.}
 }
diff --git a/man/from_wkb_to_sf.Rd b/man/from_wkb_to_sf.Rd
index e172bfdd..e1623f3b 100644
--- a/man/from_wkb_to_sf.Rd
+++ b/man/from_wkb_to_sf.Rd
@@ -16,7 +16,7 @@ from_wkb_to_sf(sf_data)
 Convert WKB back to sf column
 }
 \examples{
-\dontrun {
+\dontrun{
 raw_data <- build_parquet_raw_data()
 kenya_ctry_sf <- raw_data$global.ctry |> 
     dplyr::filter(ctry == "KENYA") |> 

From 5ef7953de11786f33ed391bfd801bf73a0a73e46 Mon Sep 17 00:00:00 2001
From: Mervin Keith Cuadera <40894971+mcuadera@users.noreply.github.com>
Date: Tue, 31 Mar 2026 15:58:25 -0400
Subject: [PATCH 20/28] simplify parameters and functions

---
 R/dal.parquet.R                    | 7 ++-----
 man/build_parquet_raw_data_edav.Rd | 6 +++++-
 2 files changed, 7 insertions(+), 6 deletions(-)

diff --git a/R/dal.parquet.R b/R/dal.parquet.R
index 403e2828..f9078808 100644
--- a/R/dal.parquet.R
+++ b/R/dal.parquet.R
@@ -96,8 +96,8 @@ create_raw_data_parquet <- function(raw_data, path) {
 #' }
 build_parquet_raw_data <- function(
   path = "GID/PEB/SIR/Data/analytic",
-  from_edav = TRUE,
   dataset = "all",
+  from_edav = TRUE,
   container = get_azure_storage_connection()
 ) {
   if (from_edav) {
@@ -131,10 +131,7 @@ build_parquet_raw_data <- function(
 #' edav_dir <- "ABC/parquet_folder"
 #' upload_parquet_to_edav(local_dir, edav_dir)
 #' }
-upload_parquet_to_edav <- function(src, dest, container = NULL) {
-  if (is.null(container)) {
-    container <- get_azure_storage_connection()
-  }
+upload_parquet_to_edav <- function(src, dest, container = get_azure_storage_connection()) {
 
   dir_exists <- edav_io("exists.dir", NULL, dest)
   if (!dir_exists) {
diff --git a/man/build_parquet_raw_data_edav.Rd b/man/build_parquet_raw_data_edav.Rd
index 849f2ea6..3ed377ff 100644
--- a/man/build_parquet_raw_data_edav.Rd
+++ b/man/build_parquet_raw_data_edav.Rd
@@ -4,7 +4,11 @@
 \alias{build_parquet_raw_data_edav}
 \title{Build raw_data using EDAV files}
 \usage{
-build_parquet_raw_data_edav(path = NULL, dataset = "all", container = NULL)
+build_parquet_raw_data_edav(
+  path = NULL,
+  dataset = "all",
+  container = get_azure_storage_connection()
+)
 }
 \arguments{
 \item{path}{\code{str} Path to EDAV folder containing parquet files. This must

From 8b9b3511826d3a140595e00c7b3fbfca6cd66949 Mon Sep 17 00:00:00 2001
From: Mervin Keith Cuadera <40894971+mcuadera@users.noreply.github.com>
Date: Tue, 31 Mar 2026 16:05:13 -0400
Subject: [PATCH 21/28] move get_all_polio_data on its own R script

---
 R/dal.R                | 949 -----------------------------------------
 R/get_all_polio_data.R | 947 ++++++++++++++++++++++++++++++++++++++++
 2 files changed, 947 insertions(+), 949 deletions(-)
 create mode 100644 R/get_all_polio_data.R

diff --git a/R/dal.R b/R/dal.R
index 2ec10532..daa7aa25 100644
--- a/R/dal.R
+++ b/R/dal.R
@@ -897,955 +897,6 @@ normalize_format <- function(fmt) {
 
 #### 2) Key data pull functions ####
 
-
-#' Retrieve all pre-processed polio data
-#'
-#' @description Download POLIS data from the CDC pre-processed endpoint. By default
-#' this function will return a "small" or recent dataset. This is primarily for data
-#' that is from the past six years. You can specify a "medium" sized dataset for data
-#' that is from 2016 onwards. Finally the "large" sized dataset will provide information
-#' from 2000 onwards. Regular pulls form the data will recreate the "small" dataset
-#' when new information is available and the Data Management Team can force the
-#' creation of the "medium" and "large" static datasets as necessary.
-#'
-#' @param size `str` Size of data to download. Defaults to `"small"`.
-#' - `"small"`: Data from the last six years.
-#' - `"medium"`: Data from 2016-present.
-#' - `"large"`: Data from 2000-present.
-#' @param data_folder `str` Location of the data folder containing pre-processed POLIS data,
-#' spatial files, coverage data, and population data. Defaults to `"GID/PEB/SIR/Data"`.
-#' @param polis_folder `str` Location of the POLIS folder. Defaults to `"GID/PEB/SIR/POLIS"`.
-#' @param core_ready_folder `str` Which core ready folder to use. Defaults to `"Core_Ready_Files"`.
-#' @param force.new.run `logical` Default `FALSE`, if `TRUE` will run recent data and cache.
-#' @param recreate.static.files `logical` Default `FALSE`, if `TRUE` will run all data and cache.
-#' @param attach.spatial.data `logical` Default `TRUE`, adds spatial data to downloaded object.
-#' @param use_edav `logical` Build raw data list using EDAV files. Defaults to `TRUE`.
-#' @param archive Logical. Whether to archive previous output directories
-#'    before overwriting. Default is `TRUE`.
-#' @param keep_n_archives Numeric. Number of archive folders to retain.
-#'   Defaults to `Inf`, which keeps all archives. Set to a finite number
-#'   (e.g., 3) to automatically delete older archives beyond the N most recent.
-#' @param output_format str: output_format to save files as.
-#' Available formats include 'rds' and 'qs2'. Defaults is 'rds'.
-#' @param local_caching `logical` Enable local caching so data is stored locally and
-#' only downloaded when there is updated data from EDAV.
-#' @param use_archived_data `logical` Allows the ability to recreate the raw data file using previous
-#' preprocessed data. If
-#' @returns Named `list` containing polio data that is relevant to CDC.
-#' @examples
-#' \dontrun{
-#' raw.data <- get_all_polio_data() # downloads data for last 6 years, including spatial files
-#' raw.data <- get_all_polio_data(size = "small", attach.spatial.data = FALSE) # exclude spatial data
-#' }
-#'
-#' @export
-get_all_polio_data <- function(
-    size = "small",
-    data_folder = "GID/PEB/SIR/Data",
-    polis_folder = "GID/PEB/SIR/POLIS",
-    core_ready_folder = "Core_Ready_Files",
-    force.new.run = FALSE,
-    recreate.static.files = FALSE,
-    attach.spatial.data = TRUE,
-    use_edav = TRUE,
-    use_archived_data = FALSE,
-    archive = TRUE,
-    keep_n_archives = Inf,
-    output_format = "rds",
-    local_caching = TRUE) {
-
-  # check to see that size parameter is appropriate
-  if (!size %in% c("small", "medium", "large")) {
-    stop("The parameter 'size' must be either 'small', 'medium', or 'large'")
-  }
-
-  # Check output format
-  if (!output_format %in% c("rds", "qs2")) {
-    stop("Only rds and qs2 is supported at this time.")
-  }
-
-# normalize and validate both output formats
-output_format <- normalize_format(output_format)
-
-# Fail safe in instances where EDAV connection fails
-if (use_edav) {
-  verify_edav <- tryCatch(
-    {
-      invisible(capture.output(test_EDAV_connection()))
-      cli::cli_alert_success("Connect to EDAV successful.")
-      TRUE
-    },
-    error = \(e) {
-      cli::cli_alert_info("Connection to EDAV unsuccessful.")
-      FALSE
-    }
-  )
-
-  if (!verify_edav) {
-    cli::cli_alert_info("Unable to obtain data from EDAV. Loading from local cache instead.")
-    cli::cli_alert_info("NOTE: Data may be stale. Please review the global polio dataset metadata for information on when the data was last processed.")
-    raw.data <- force_load_polio_data_cache(attach.spatial.data, output_format)
-    return(raw.data)
-  }
-}
-
-# Constant variables
-# Each file comes out of these folders
-analytic_folder <- file.path(data_folder, "analytic")
-polis_data_folder <- file.path(data_folder, "polis")
-spatial_folder <- file.path(data_folder, "spatial")
-coverage_folder <- file.path(data_folder, "coverage")
-pop_folder <- file.path(data_folder, "pop")
-
-# Year cutoffs for the different datasets
-current_year <- lubridate::year(Sys.Date())
-small_year <- current_year - 5
-med_year <- 2016 #hardcode to 2016 because it's an important point in time
-
-# Required files
-raw_data_recent_name <- paste0("raw.data.recent", output_format)
-raw_data_medium_name <- paste0("raw.data.", med_year, ".", small_year - 1, output_format)
-raw_data_2000_name <- paste0("raw.data.2000.", med_year - 1, output_format)
-spatial_data_name <- paste0("spatial.data", output_format)
-global_ctry_sf_name <- "global.ctry.rds"
-global_prov_sf_name <- "global.prov.rds"
-global_dist_sf_name <- "global.dist.rds"
-
-# Perform check to build using the archived polis folder
-if (use_archived_data) {
-  cli::cli_alert_info("Using archived data")
-  cli::cli_alert_info("NOTE: the metadata will be for the most recent pull")
-  polis_data_folder <- get_archived_polis_data(
-    data_folder,
-    use_edav,
-    keep_n_archives
-  )
-  recreate.static.files <- TRUE
-}
-
-# look to see if the recent raw data rds is in the analytic folder
-prev_table <- sirfunctions_io("list", NULL, analytic_folder,
-  edav = use_edav
-)
-
-if (nrow(prev_table) > 0) {
-  prev_table <- prev_table |>
-    dplyr::filter(grepl(raw_data_recent_name, name)) |>
-    dplyr::select("file" = "name", "size", "ctime" = "lastModified")
-} else {
-  # if empty, make sure to recreate tibble to the right format
-  prev_table <- tibble(
-    "file" = NA,
-    "size" = NA,
-    "ctime" = NA
-  ) |>
-    dplyr::mutate(file = as.character(file),
-                  size = as.double(size),
-                  ctime = as_datetime(ctime)) |>
-    dplyr::filter(!is.na(file))
-}
-
-if (recreate.static.files | force.new.run) {
-  force.new.run <- T
-  create.cache <- T
-}
-
-
-if (!force.new.run) {
-
-  # Check if using the local cache is sufficient
-  if (use_edav & size == "small" & local_caching) {
-    if (!recache_raw_data(analytic_folder, use_edav, output_format)) {
-
-      raw.data <- sirfunctions_io("read", NULL, file.path(rappdirs::user_data_dir("sirfunctions"),
-                                                          paste0("raw_data", output_format)),
-                                  edav = FALSE)
-
-      cli::cli_process_start("Checking for duplicates in datasets.")
-      raw.data <- duplicate_check(raw.data)
-      cli::cli_process_done()
-      if (attach.spatial.data) {
-        if (!recache_spatial_data(analytic_folder, spatial_folder,
-                                  use_edav, output_format)) {
-          spatial.data <- sirfunctions_io("read", NULL, file.path(rappdirs::user_data_dir("sirfunctions"),
-                                                                  paste0("spatial_data", output_format)),
-                                          edav = FALSE)
-          raw.data$global.ctry <- spatial.data$global.ctry
-          raw.data$global.prov <- spatial.data$global.prov
-          raw.data$global.dist <- spatial.data$global.dist
-          raw.data$roads <- spatial.data$roads
-          raw.data$cities <- spatial.data$cities
-
-          return(raw.data)
-        } else {
-          spatial.data <- sirfunctions_io("read", NULL, file.path(analytic_folder, spatial_data_name),
-                                          edav = use_edav)
-          sirfunctions_io("write", NULL, file.path(rappdirs::user_data_dir("sirfunctions"),
-                                                                  paste0("spatial_data", output_format)),
-                                          obj = spatial.data,
-                                          edav = FALSE)
-          edav_spatial_timestamp <- sirfunctions_io(
-            "read",
-            NULL,
-            file.path(analytic_folder, paste0("spatial_timestamp", output_format)),
-            edav = use_edav
-          )
-          sirfunctions_io("write", NULL, file.path(rappdirs::user_data_dir("sirfunctions"),
-                                                   paste0("spatial_timestamp", output_format)),
-                          obj = edav_spatial_timestamp,
-                          edav = FALSE)
-
-          raw.data$global.ctry <- spatial.data$global.ctry
-          raw.data$global.prov <- spatial.data$global.prov
-          raw.data$global.dist <- spatial.data$global.dist
-          raw.data$roads <- spatial.data$roads
-          raw.data$cities <- spatial.data$cities
-
-          return(raw.data)
-        }
-      } else {
-        return(raw.data)
-      }
-    }
-  }
-
-  if (use_edav) {
-    cli::cli_alert_info(paste0("Downloading most recent active polio data from ", small_year," onwards"))
-  } else {
-    cli::cli_alert_info(paste0("Loading most recent active polio data from ", small_year," onwards"))
-  }
-
-  raw.data.small.pull <- sirfunctions_io("read", NULL, prev_table$file, edav = use_edav)
-
-  if (size == "small") {
-    raw.data <- raw.data.small.pull
-  }
-
-  if (size == "medium") {
-    prev_table <- sirfunctions_io("list", NULL, analytic_folder, edav = use_edav) |>
-      dplyr::filter(grepl(raw_data_medium_name, name)) |>
-      dplyr::select("file" = "name", "size", "ctime" = "lastModified")
-
-    if (use_edav) {
-      cli::cli_alert_info(paste0("Downloading static polio data from ", med_year, "-", small_year))
-    } else {
-      cli::cli_alert_info(paste0("Loading static polio data from ", med_year, "-", small_year))
-    }
-
-    raw.data.medium.pull <- sirfunctions_io("read", NULL, prev_table$file, edav = use_edav)
-
-    raw.data <- split_concat_raw_data(
-      action = "concat",
-      raw.data.small.pull = raw.data.small.pull,
-      raw.data.medium.pull = raw.data.medium.pull
-    )
-  }
-
-  if (size == "large") {
-    prev_table <- sirfunctions_io("list", NULL, analytic_folder,
-                                  edav = use_edav, full_names = TRUE
-    ) |>
-      dplyr::filter(grepl(raw_data_medium_name, name)) |>
-      dplyr::select("file" = "name", "size", "ctime" = "lastModified")
-
-    if (use_edav) {
-      cli::cli_alert_info(paste0("Downloading static polio data from ", med_year, "-", small_year))
-    } else {
-      cli::cli_alert_info(paste0("Loading static polio data from ", med_year, "-", small_year))
-    }
-
-    raw.data.medium.pull <- sirfunctions_io("read", NULL, prev_table$file, edav = use_edav)
-
-    prev_table <- sirfunctions_io("list", NULL, analytic_folder, edav = use_edav) |>
-      dplyr::filter(grepl(raw_data_2000_name, name)) |>
-      dplyr::select("file" = "name", "size", "ctime" = "lastModified")
-
-    if (use_edav) {
-      cli::cli_alert_info(paste0("Downloading static polio data from 2001-", med_year))
-    } else {
-      cli::cli_alert_info(paste0("Loading static polio data from 2001-", med_year))
-    }
-
-    raw.data.large.pull <- sirfunctions_io("read", NULL, prev_table$file, edav = use_edav)
-
-    raw.data <- split_concat_raw_data(
-      action = "concat",
-      raw.data.small.pull = raw.data.small.pull,
-      raw.data.medium.pull = raw.data.medium.pull,
-      raw.data.large.pull = raw.data.large.pull
-    )
-  }
-
-  # Only cache the small dataset, which we use in 90% of the case
-  if (use_edav & local_caching) {
-    raw_data_timestamp_exists <- invisible(sirfunctions_io(
-      "exists.file",
-      NULL,
-      file.path(analytic_folder, paste0("raw_data_timestamp", output_format)),
-      edav = use_edav
-    ))
-
-  } else {
-    raw_data_timestamp_exists <- FALSE
-  }
-  if (size == "small" & raw_data_timestamp_exists & local_caching) {
-    cli::cli_process_start("Caching global polio data locally")
-
-    if (!dir.exists(rappdirs::user_data_dir("sirfunctions"))) {
-      dir.create(rappdirs::user_data_dir("sirfunctions"), recursive = TRUE)
-    }
-
-    sirfunctions_io("write", NULL,
-                    file.path(rappdirs::user_data_dir("sirfunctions"), paste0("raw_data", output_format)),
-                    obj = raw.data,
-                    edav = FALSE)
-    # Add edav tag file to local cache dir
-    edav_raw_data_timestamp <- sirfunctions_io(
-      "read",
-      NULL,
-      file.path(analytic_folder, paste0("raw_data_timestamp", output_format)),
-      edav = use_edav
-    )
-
-    sirfunctions_io("write", NULL,
-                    file.path(rappdirs::user_data_dir("sirfunctions"), paste0("raw_data_timestamp", output_format)),
-                    obj = edav_raw_data_timestamp,
-                    edav = FALSE)
-
-    cli::cli_process_done()
-  }
-
-  cli::cli_process_done()
-
-  cli::cli_process_start("Checking for duplicates in datasets.")
-  raw.data <- duplicate_check(raw.data)
-  cli::cli_process_done()
-
-  if (attach.spatial.data) {
-
-    # Don't recache spatial if up to date
-    if (!recache_spatial_data(analytic_folder, spatial_folder,
-                              use_edav, output_format) & local_caching) {
-      spatial.data <- sirfunctions_io("read", NULL, file.path(rappdirs::user_data_dir("sirfunctions"),
-                                                              paste0("spatial_data", output_format)),
-                                      edav = FALSE)
-      raw.data$global.ctry <- spatial.data$global.ctry
-      raw.data$global.prov <- spatial.data$global.prov
-      raw.data$global.dist <- spatial.data$global.dist
-      raw.data$roads <- spatial.data$roads
-      raw.data$cities <- spatial.data$cities
-
-      return(raw.data)
-    }
-
-    if (use_edav) {
-      cli::cli_process_start("Downloading and attaching spatial data")
-    } else {
-      cli::cli_process_start("Loading and attaching spatial data")
-    }
-
-    spatial.data <- sirfunctions_io("read", NULL,
-                                      file.path(analytic_folder, spatial_data_name),
-                                      edav = use_edav
-      )
-
-    raw.data$global.ctry <- spatial.data$global.ctry
-    raw.data$global.prov <- spatial.data$global.prov
-    raw.data$global.dist <- spatial.data$global.dist
-    raw.data$roads <- spatial.data$roads
-    raw.data$cities <- spatial.data$cities
-
-    cli::cli_process_done()
-
-    if (use_edav & local_caching) {
-      spatial_timestamp_exists <- sirfunctions_io(
-        "exists.file",
-        NULL,
-        file.path(analytic_folder, paste0("spatial_timestamp", output_format)),
-        edav = use_edav
-      )
-    } else {
-      spatial_timestamp_exists <- FALSE
-    }
-
-    if (recache_spatial_data(analytic_folder, spatial_folder,
-                             use_edav, output_format) & spatial_timestamp_exists & local_caching) {
-      sirfunctions_io("write",
-                      NULL,
-                      file.path(rappdirs::user_data_dir("sirfunctions"),
-                                paste0("spatial_data",
-                                output_format)),
-                      obj = spatial.data,
-                      edav = FALSE)
-
-      spatial_processed_tag <- sirfunctions_io("read",
-                                               NULL,
-                                               file.path(analytic_folder,
-                                                         paste0("spatial_timestamp", output_format)),
-                                               edav = use_edav)
-      sirfunctions_io("write",
-                      NULL,
-                      file.path(rappdirs::user_data_dir("sirfunctions"),
-                                paste0("spatial_timestamp", output_format)),
-                      obj = spatial_processed_tag,
-                      edav = FALSE)
-    }
-  }
-
-  return(raw.data)
-
-} else {
-
-  # Check that the required folders have data
-  for (folder in c(analytic_folder, polis_data_folder, spatial_folder,
-                   coverage_folder, pop_folder)) {
-
-    # get_all_polio_data will recreate the analytic folder if it's missing
-    switch(basename(folder),
-           "analytic" = {
-             if (!sirfunctions_io("exists.dir", NULL, folder, edav = use_edav)) {
-               cli::cli_alert_info("No analytics folder found. Will create a new one.")
-               sirfunctions_io("create.dir", NULL, folder, edav = use_edav)
-             }
-           },
-           "polis" = {
-             if (!sirfunctions_io("exists.dir", NULL, folder, edav = use_edav)) {
-               cli::cli_alert_info("Creating polis folder in the data folder")
-               sirfunctions_io("create.dir", NULL, folder, edav = use_edav)
-             } else {
-               cli::cli_alert_info("Moving updated polis data to the data folder")
-             }
-
-
-             create_polis_data_folder(
-              data_folder,
-              polis_folder,
-              core_ready_folder,
-              use_edav,
-              archive,
-              keep_n_archives
-            )
-
-           },
-           "spatial" = {
-             if (!sirfunctions_io("exists.dir", NULL, folder, edav = use_edav)) {
-               cli::cli_abort(paste0("No spatial data found in the data folder.",
-                                     " Ensure that the output folder when running ",
-                                     " tidypolis::process_spatial() is ",
-                                     spatial_folder),
-               )
-             }
-           },
-           "coverage" = {
-             if (!sirfunctions_io("exists.dir", NULL, folder, edav = use_edav)) {
-               cli::cli_abort(paste0("Coverage data not found.",
-                                     "Please create and add coverage data in: ",
-                                     folder))
-             }
-           },
-           "pop" = {
-             if (!sirfunctions_io("exists.dir", NULL, folder, edav = use_edav)) {
-               cli::cli_abort(paste0("Population data not found. ",
-                                     "Preprocessing of population files may be required. ",
-                                     "Please create a pop data folder and add data in: ",
-                                     folder))
-             }
-           }
-    )
-  }
-
-  if (use_edav) {
-    cli::cli_h1("Testing download times")
-    download_metrics <- test_EDAV_connection(return_list = T)
-  }
-
-  # use the truncated AFP file
-  afp.trunc <- T
-
-  if (recreate.static.files) {
-    afp.trunc <- F
-  }
-
-  dl_table <- dplyr::bind_rows(
-    sirfunctions_io("list", NULL, polis_data_folder, edav = use_edav),
-    sirfunctions_io("list", NULL, spatial_folder, edav = use_edav),
-    sirfunctions_io("list", NULL, coverage_folder, edav = use_edav),
-    sirfunctions_io("list", NULL, pop_folder, edav = use_edav),
-    sirfunctions_io("list", NULL, polis_folder, edav = use_edav) |>
-      dplyr::filter(grepl("cache", name))
-  ) |>
-    dplyr::filter(!is.na(size)) |>
-    dplyr::select("file" = "name", "size")
-
-  if (use_edav) {
-    dl_table <- dl_table |>
-      dplyr::mutate(
-        "dl_time_sec" = size / download_metrics$size * download_metrics$d
-      )
-  }
-
-  if (afp.trunc) {
-    dl_table <- dl_table |>
-      dplyr::filter(!grepl("afp_linelist_2001", file))
-  } else {
-    dl_table <- dl_table |>
-      dplyr::filter(!grepl("afp_linelist_2019", file))
-  }
-
-  file_size <- dl_table$size |> sum()
-
-  if (use_edav) {
-    download_time <- dl_table$dl_time_sec |> sum()
-  }
-
-  if (use_edav) {
-    cli::cli_h1("Downloading POLIS Data")
-  } else {
-    cli::cli_h1("Loading POLIS Data")
-  }
-
-  raw.data <- list()
-  spatial.data <- list()
-
-  # Check if spatial data needs to be redownloaded from the analytics folder
-  spatial_timestamp_exists <- sirfunctions_io(
-    "exists.file",
-    NULL,
-    file.path(analytic_folder, paste0("spatial_timestamp", output_format)),
-    edav = use_edav
-  )
-
-  if (spatial_timestamp_exists) {
-    # Check if it's recent or needs updating
-    edav_spatial_timestamp <- sirfunctions_io(
-      "read",
-      NULL,
-      file.path(analytic_folder, paste0("spatial_timestamp", output_format)),
-      edav = use_edav
-    ) |>
-      dplyr::select(name, lastModifiedEDAV = lastModified)
-
-    edav_spatial_folder_info <- sirfunctions_io(
-      "list",
-      NULL,
-      file.path(spatial_folder),
-      edav = use_edav
-    ) |>
-      dplyr::select(name, lastModified)
-
-    spatial_timestamp_comparison <- dplyr::left_join(edav_spatial_timestamp,
-                                                     edav_spatial_folder_info) |>
-      dplyr::mutate(updated = ifelse(lastModifiedEDAV == lastModified, TRUE, FALSE)) |>
-      dplyr::pull(updated) |> sum(na.rm = TRUE)
-  } else {
-
-    spatial_timestamp_comparison <- 0
-
-  }
-
-  if (spatial_timestamp_comparison == 3) {
-    cli::cli_alert_success("Spatial data in the analytic folder is up to date. Loading from cache...")
-    spatial.data <- sirfunctions_io(
-      "read",
-      NULL,
-      file.path(analytic_folder, spatial_data_name),
-      edav = use_edav
-    )
-  } else {
-    if (spatial_timestamp_exists) {
-      cli::cli_alert_warning("Spatial data in the analytic folder is outdated. Recreating from the spatial folder")
-    } else {
-      cli::cli_alert_warning("No spatial timestamp exists. Recreating from the spatial folder")
-    }
-
-    cli::cli_process_start("1) Loading country shape files")
-    spatial.data$global.ctry <- load_clean_ctry_sp(
-      fp = file.path(spatial_folder, global_ctry_sf_name),
-      edav = use_edav
-    )
-    cli::cli_process_done()
-
-    cli::cli_process_start("2) Loading province shape files")
-    spatial.data$global.prov <- load_clean_prov_sp(
-      fp = file.path(spatial_folder, global_prov_sf_name),
-      edav = use_edav
-    )
-    cli::cli_process_done()
-
-    cli::cli_process_start("3) Loading district shape files")
-    spatial.data$global.dist <- load_clean_dist_sp(
-      fp = file.path(spatial_folder, global_dist_sf_name),
-      edav = use_edav
-    )
-    cli::cli_process_done()
-  }
-
-  cli::cli_process_start("4) Loading AFP line list data (This file is almost 3GB and can take a while)")
-  raw.data$afp <-
-    sirfunctions_io("read", NULL, file_loc = dplyr::filter(
-      dl_table,
-      grepl("afp", file)
-    ) |>
-      dplyr::pull(file), edav = use_edav) |>
-    dplyr::filter(surveillancetypename == "AFP") |>
-    dplyr::mutate(
-      cdc.classification.all2 = dplyr::case_when(
-        final.cell.culture.result == "Not received in lab" &
-          cdc.classification.all == "PENDING" ~ "LAB PENDING",
-        TRUE ~ cdc.classification.all
-      ),
-      hot.case = ifelse(
-        paralysis.asymmetric == "Yes" &
-          paralysis.onset.fever == "Yes" &
-          paralysis.rapid.progress == "Yes",
-        1,
-        0
-      ),
-      hot.case = ifelse(is.na(hot.case), 99, hot.case)
-    )
-
-  cli::cli_process_done()
-
-  cli::cli_process_start("Processing AFP data for analysis")
-
-  raw.data$afp.epi <- raw.data$afp |>
-    dplyr::mutate(epi.week = lubridate::epiweek(dateonset)) |>
-    dplyr::group_by(place.admin.0, epi.week, yronset, cdc.classification.all2) |>
-    dplyr::summarize(afp.cases = dplyr::n(),
-                     .groups = "drop") |>
-    dplyr::mutate(epiweek.year = paste(yronset, epi.week, sep = "-")) |>
-    # manual fix of epi week
-    dplyr::mutate(epi.week = ifelse(epi.week == 52 &
-      yronset == 2022, 1, epi.week))
-
-  # factoring cdc classification to have an order we like in stacked bar chart
-  raw.data$afp.epi$cdc.classification.all2 <-
-    factor(
-      raw.data$afp.epi$cdc.classification.all2,
-      levels = c(
-        "WILD 1",
-        "cVDPV 2",
-        "VDPV 2",
-        "cVDPV 1",
-        "VDPV 1",
-        "COMPATIBLE",
-        "PENDING",
-        "LAB PENDING",
-        "NPAFP",
-        "NOT-AFP",
-        "UNKNOWN",
-        "aVDPV 1",
-        "aVDPV 3",
-        "cVDPV1andcVDPV2",
-        "CombinationWild1-cVDPV 2",
-        "aVDPV 2",
-        "VDPV 3",
-        "iVDPV 2",
-        "VDPV1andcVDPV2",
-        "VAPP",
-        "cVDPV 3",
-        "iVDPV 3",
-        "WILD 3",
-        "WILD1andWILD3",
-        "iVDPV 1",
-        "cVDPV2andcVDPV3"
-      ),
-      labels = c(
-        "WILD 1",
-        "cVDPV 2",
-        "VDPV 2",
-        "cVDPV 1",
-        "VDPV 1",
-        "COMPATIBLE",
-        "PENDING",
-        "LAB PENDING",
-        "NPAFP",
-        "NOT-AFP",
-        "UNKNOWN",
-        "aVDPV 1",
-        "aVDPV 3",
-        "cVDPV1andcVDPV2",
-        "CombinationWild1-cVDPV 2",
-        "aVDPV 2",
-        "VDPV 3",
-        "iVDPV 2",
-        "VDPV1andcVDPV2",
-        "VAPP",
-        "cVDPV 3",
-        "iVDPV 3",
-        "WILD 3",
-        "WILD1andWILD3",
-        "iVDPV 1",
-        "cVDPV2andcVDPV3"
-      )
-    )
-
-  raw.data$para.case <- raw.data$afp |>
-    dplyr::filter(
-      stringr::str_detect(cdc.classification.all2, "VDPV|WILD|COMPATIBLE")
-    ) |>
-    dplyr::mutate(yronset = ifelse(is.na(yronset) == T, 2022, yronset)) # this fix was for the manually added MOZ case
-  cli::cli_process_done()
-
-
-  cli::cli_process_start("5) Loading population data")
-  raw.data$dist.pop <-
-    sirfunctions_io("read", NULL,
-      dplyr::filter(dl_table, grepl("dist.pop", file)) |>
-        dplyr::pull(file),
-      edav = use_edav
-    ) |>
-    dplyr::ungroup()
-
-  raw.data$prov.pop <-
-    sirfunctions_io("read", NULL,
-      file_loc = dplyr::filter(dl_table, grepl("prov.pop", file)) |>
-        dplyr::pull(file), edav = use_edav
-    ) |>
-    dplyr::ungroup()
-
-  raw.data$ctry.pop <-
-    sirfunctions_io("read", NULL,
-      dplyr::filter(dl_table, grepl("ctry.pop", file)) |>
-        dplyr::pull(file),
-      edav = use_edav
-    ) |>
-    dplyr::ungroup()
-  cli::cli_process_done()
-
-
-  cli::cli_process_start("6) Loading coverage data")
-  raw.data$ctry.coverage <- sirfunctions_io("read", NULL,
-                                            file_loc = dplyr::filter(dl_table, grepl("ctry_cov", file)) |>
-                                              dplyr::pull(file), edav = use_edav
-  )
-
-  raw.data$prov.coverage <- sirfunctions_io("read", NULL,
-                                            file_loc = dplyr::filter(dl_table, grepl("prov_cov", file)) |>
-                                              dplyr::pull(file), edav = use_edav
-  )
-
-  raw.data$dist.coverage <- sirfunctions_io("read", NULL,
-                                            file_loc = dplyr::filter(dl_table, grepl("dist_cov", file)) |>
-                                              dplyr::pull(file), edav = use_edav
-  )
-
-  cli::cli_process_done()
-
-  cli::cli_process_start("7) Loading ES data")
-
-  raw.data$es <-
-    sirfunctions_io("read", NULL,
-      file_loc = dplyr::filter(dl_table, grepl("/es_2001", file)) |>
-        dplyr::pull(file), edav = use_edav
-    )
-  cli::cli_process_done()
-
-  cli::cli_process_start("8) Loading SIA data")
-  raw.data$sia <-
-    sirfunctions_io("read", NULL,
-      file_loc = dplyr::filter(dl_table, grepl("sia", file)) |>
-        dplyr::pull(file), edav = use_edav
-    )
-
-  cli::cli_process_done()
-
-  cli::cli_process_start("9) Loading all positives")
-  raw.data$pos <-
-    sirfunctions_io("read", NULL,
-      file_loc = dplyr::filter(dl_table, grepl("/pos", file)) |>
-        dplyr::pull(file), edav = use_edav
-    )
-
-  cli::cli_process_done()
-
-  cli::cli_process_start("10) Loading other surveillance linelist")
-  raw.data$other <-
-    sirfunctions_io("read", NULL,
-      file_loc = dplyr::filter(dl_table, grepl("/other", file)) |>
-        dplyr::pull(file), edav = use_edav
-    )
-
-  cli::cli_process_done()
-
-  cli::cli_process_start("11) Loading road network data")
-  spatial.data$roads <- sirfunctions_io("read", NULL,
-    file_loc = dplyr::filter(dl_table, grepl("roads.rds", file)) |>
-      dplyr::pull(file), edav = use_edav
-  )
-  cli::cli_process_done()
-
-  cli::cli_process_start("12) Loading city spatial data")
-  spatial.data$cities <- sirfunctions_io("read", NULL,
-    file_loc = dplyr::filter(dl_table, grepl("cities.rds", file)) |>
-      dplyr::pull(file), edav = use_edav
-  )
-  cli::cli_process_done()
-
-  cli::cli_process_start("13) Creating Metadata object")
-
-  polis.cache <- sirfunctions_io("read", NULL,
-    file_loc = dplyr::filter(dl_table, grepl("cache.rds", file)) |>
-      dplyr::pull(file), edav = use_edav
-  ) |>
-    dplyr::mutate(last_sync = as.Date(last_sync))
-
-  raw.data$metadata$download_time <- max(polis.cache$last_sync, na.rm = TRUE)
-
-  raw.data$metadata$processed_time <- sirfunctions_io("list", NULL,
-    file.path(polis_folder, "data", core_ready_folder),
-    edav = use_edav
-  ) |>
-    dplyr::filter(grepl("positives_2001-01-01", name)) |>
-    dplyr::select("ctime" = "lastModified") |>
-    dplyr::mutate(ctime = as.Date(ctime)) |>
-    dplyr::pull(ctime)
-
-  raw.data$metadata$user <- polis.cache |>
-    dplyr::filter(table == "virus") |>
-    dplyr::pull(last_user)
-
-  raw.data$metadata$most_recent_pos <- max(raw.data$pos$dateonset, na.rm = TRUE)
-  raw.data$metadata$most_recent_pos_loc <- raw.data$pos |>
-    dplyr::arrange(dplyr::desc(dateonset)) |>
-    dplyr::slice(1) |>
-    dplyr::pull(place.admin.0)
-
-
-  raw.data$metadata$most_recent_afp <- max(raw.data$afp$dateonset, na.rm = TRUE)
-  raw.data$metadata$most_recent_afp_loc <- raw.data$afp |>
-    dplyr::arrange(dplyr::desc(dateonset)) |>
-    dplyr::slice(1) |>
-    dplyr::pull(place.admin.0)
-
-
-  raw.data$metadata$most_recent_env <- max(raw.data$es$collect.date, na.rm = TRUE)
-  raw.data$metadata$most_recent_env_loc <- raw.data$es |>
-    dplyr::arrange(dplyr::desc(collect.date)) |>
-    dplyr::slice(1) |>
-    dplyr::pull(ADM0_NAME)
-
-
-  raw.data$metadata$most_recent_sia <- max(raw.data$sia$sub.activity.start.date)
-  raw.data$metadata$most_recent_sia_code <- raw.data$sia |>
-    dplyr::arrange(dplyr::desc(sub.activity.start.date)) |>
-    dplyr::slice(1) |>
-    dplyr::pull(sia.code)
-  raw.data$metadata$most_recent_sia_location <- raw.data$sia |>
-    dplyr::arrange(dplyr::desc(sub.activity.start.date)) |>
-    dplyr::slice(1) |>
-    dplyr::pull(place.admin.0)
-  raw.data$metadata$most_recent_sia_vax <- raw.data$sia |>
-    dplyr::arrange(dplyr::desc(sub.activity.start.date)) |>
-    dplyr::slice(1) |>
-    dplyr::pull(vaccine.type)
-
-  raw.data$metadata$most_recent_vdpv_class_change_date <- raw.data$pos$vdpvclassificationchangedate |>
-    lubridate::as_date() |>
-    max(na.rm = T)
-
-  rm(polis.cache)
-
-  cli::cli_process_done()
-
-  cli::cli_process_start("14) Clearing out unused memory")
-  gc()
-  cli::cli_process_done()
-}
-
-if (create.cache) {
-  cli::cli_process_start("15) Caching processed data")
-
-  out <- split_concat_raw_data(action = "split", split.years = c(2000, med_year, small_year), raw.data.all = raw.data)
-
-  out_files <- out$split.years |>
-    dplyr::mutate(
-      file_name = ifelse(grepl(current_year, tag), "recent", stringr::str_replace_all(tag, "-", ".")),
-      file_name = paste0("raw.data.", file_name, output_format)
-    )
-
-  if (!recreate.static.files) {
-    out_files <- out_files |> dplyr::filter(grepl("recent", file_name))
-  }
-
-  if (!use_archived_data) {
-    for (i in 1:nrow(out_files)) {
-      sirfunctions_io("write", NULL,
-                      file_loc = file.path(analytic_folder, dplyr::pull(out_files[i, ], file_name)),
-                      obj = out[[dplyr::pull(out_files[i, ], tag)]],
-                      edav = use_edav
-      )}
-    }
-
-# set up path for spatial df
-  sp_file_path <- file.path(analytic_folder, paste0("spatial.data", output_format))
-
-  sirfunctions_io("write", NULL,
-    file_loc = sp_file_path,
-    obj = spatial.data, edav = use_edav
-  )
-
-  # Create tags only if not using "archived" version
-  if (use_edav & !use_archived_data) {
-    # Create raw data file tag for future comparisons
-    sirfunctions_io("write", NULL,
-                    file_loc = file.path(analytic_folder, paste0("raw_data_timestamp", output_format)),
-                    obj = Sys.time())
-
-    # Create spatial data file tag for future comparisons
-    spatial_files <- sirfunctions_io("list",
-                                     NULL,
-                                     spatial_folder,
-                                     edav = use_edav,
-                                     full_names = TRUE)
-
-    edav_spatial_timestamp <- spatial_files |>
-      dplyr::filter(stringr::str_detect(name, "global."),
-                    stringr::str_ends(name, output_format)) |>
-      dplyr::select(name, lastModified)
-
-    sirfunctions_io(
-      "write",
-      NULL,
-      file.path(analytic_folder, paste0("spatial_timestamp", output_format)),
-      obj = edav_spatial_timestamp,
-      edav = use_edav
-    )
-  }
-
-  cli::cli_process_done()
-}
-
-raw_data_cut_size <- switch(size,
-                            "small" = small_year,
-                            "medium" = med_year,
-                            "large" = 2000)
-
-raw.data <- split_concat_raw_data(action = "split",
-                                  split.years = raw_data_cut_size,
-                                  raw.data.all = raw.data)[[1]]
-
-cli::cli_process_start("Checking for duplicates in datasets.")
-raw.data <- duplicate_check(raw.data)
-cli::cli_process_done()
-
-if (attach.spatial.data) {
-  raw.data$global.ctry <- spatial.data$global.ctry
-  raw.data$global.prov <- spatial.data$global.prov
-  raw.data$global.dist <- spatial.data$global.dist
-  raw.data$roads <- spatial.data$roads
-  raw.data$cities <- spatial.data$cities
-}
-
-if (use_archived_data) {
-  cli::cli_alert_success(paste0("Successfully recreated global polio data from ",
-                                basename(polis_data_folder)))
-}
-
-return(raw.data)
-
-}
-
 #' Assess duplicates in the get_all_polio_data() output
 #'
 #' @description
diff --git a/R/get_all_polio_data.R b/R/get_all_polio_data.R
new file mode 100644
index 00000000..9630661f
--- /dev/null
+++ b/R/get_all_polio_data.R
@@ -0,0 +1,947 @@
+#' Retrieve all pre-processed polio data
+#'
+#' @description Download POLIS data from the CDC pre-processed endpoint. By default
+#' this function will return a "small" or recent dataset. This is primarily for data
+#' that is from the past six years. You can specify a "medium" sized dataset for data
+#' that is from 2016 onwards. Finally the "large" sized dataset will provide information
+#' from 2000 onwards. Regular pulls from the data will recreate the "small" dataset
+#' when new information is available and the Data Management Team can force the
+#' creation of the "medium" and "large" static datasets as necessary.
+#'
+#' @param size `str` Size of data to download. Defaults to `"small"`.
+#' - `"small"`: Data from the last six years.
+#' - `"medium"`: Data from 2016-present.
+#' - `"large"`: Data from 2000-present.
+#' @param data_folder `str` Location of the data folder containing pre-processed POLIS data,
+#' spatial files, coverage data, and population data. Defaults to `"GID/PEB/SIR/Data"`.
+#' @param polis_folder `str` Location of the POLIS folder. Defaults to `"GID/PEB/SIR/POLIS"`.
+#' @param core_ready_folder `str` Which core ready folder to use. Defaults to `"Core_Ready_Files"`.
+#' @param force.new.run `logical` Default `FALSE`, if `TRUE` will run recent data and cache.
+#' @param recreate.static.files `logical` Default `FALSE`, if `TRUE` will run all data and cache.
+#' @param attach.spatial.data `logical` Default `TRUE`, adds spatial data to downloaded object.
+#' @param use_edav `logical` Build raw data list using EDAV files. Defaults to `TRUE`.
+#' @param archive Logical. Whether to archive previous output directories
+#'    before overwriting. Default is `TRUE`.
+#' @param keep_n_archives Numeric. Number of archive folders to retain.
+#'   Defaults to `Inf`, which keeps all archives. Set to a finite number
+#'   (e.g., 3) to automatically delete older archives beyond the N most recent.
+#' @param output_format str: output_format to save files as.
+#' Available formats include 'rds' and 'qs2'. Default is 'rds'.
+#' @param local_caching `logical` Enable local caching so data is stored locally and
+#' only downloaded when there is updated data from EDAV.
+#' @param use_archived_data `logical` Whether to recreate the raw data file using previously archived
+#' preprocessed data. If `TRUE`, `recreate.static.files` is also set to `TRUE`. Defaults to `FALSE`.
+#' @returns Named `list` containing polio data that is relevant to CDC.
+#' @examples
+#' \dontrun{
+#' raw.data <- get_all_polio_data() # downloads data for last 6 years, including spatial files
+#' raw.data <- get_all_polio_data(size = "small", attach.spatial.data = FALSE) # exclude spatial data
+#' }
+#'
+#' @export
+get_all_polio_data <- function(
+    size = "small",
+    data_folder = "GID/PEB/SIR/Data",
+    polis_folder = "GID/PEB/SIR/POLIS",
+    core_ready_folder = "Core_Ready_Files",
+    force.new.run = FALSE,
+    recreate.static.files = FALSE,
+    attach.spatial.data = TRUE,
+    use_edav = TRUE,
+    use_archived_data = FALSE,
+    archive = TRUE,
+    keep_n_archives = Inf,
+    output_format = "rds",
+    local_caching = TRUE) {
+
+  # check to see that size parameter is appropriate
+  if (!size %in% c("small", "medium", "large")) {
+    stop("The parameter 'size' must be either 'small', 'medium', or 'large'")
+  }
+
+  # Check output format
+  if (!output_format %in% c("rds", "qs2")) {
+    stop("Only rds and qs2 is supported at this time.")
+  }
+
+# normalize and validate both output formats
+output_format <- normalize_format(output_format)
+
+# Fail safe in instances where EDAV connection fails
+if (use_edav) {
+  verify_edav <- tryCatch(
+    {
+      invisible(capture.output(test_EDAV_connection()))
+      cli::cli_alert_success("Connect to EDAV successful.")
+      TRUE
+    },
+    error = \(e) {
+      cli::cli_alert_info("Connection to EDAV unsuccessful.")
+      FALSE
+    }
+  )
+
+  if (!verify_edav) {
+    cli::cli_alert_info("Unable to obtain data from EDAV. Loading from local cache instead.")
+    cli::cli_alert_info("NOTE: Data may be stale. Please review the global polio dataset metadata for information on when the data was last processed.")
+    raw.data <- force_load_polio_data_cache(attach.spatial.data, output_format)
+    return(raw.data)
+  }
+}
+
+# Constant variables
+# Each file comes out of these folders
+analytic_folder <- file.path(data_folder, "analytic")
+polis_data_folder <- file.path(data_folder, "polis")
+spatial_folder <- file.path(data_folder, "spatial")
+coverage_folder <- file.path(data_folder, "coverage")
+pop_folder <- file.path(data_folder, "pop")
+
+# Year cutoffs for the different datasets
+current_year <- lubridate::year(Sys.Date())
+small_year <- current_year - 5
+med_year <- 2016 #hardcode to 2016 because it's an important point in time
+
+# Required files
+raw_data_recent_name <- paste0("raw.data.recent", output_format)
+raw_data_medium_name <- paste0("raw.data.", med_year, ".", small_year - 1, output_format)
+raw_data_2000_name <- paste0("raw.data.2000.", med_year - 1, output_format)
+spatial_data_name <- paste0("spatial.data", output_format)
+global_ctry_sf_name <- "global.ctry.rds"
+global_prov_sf_name <- "global.prov.rds"
+global_dist_sf_name <- "global.dist.rds"
+
+# Perform check to build using the archived polis folder
+if (use_archived_data) {
+  cli::cli_alert_info("Using archived data")
+  cli::cli_alert_info("NOTE: the metadata will be for the most recent pull")
+  polis_data_folder <- get_archived_polis_data(
+    data_folder,
+    use_edav,
+    keep_n_archives
+  )
+  recreate.static.files <- TRUE
+}
+
+# look to see if the recent raw data rds is in the analytic folder
+prev_table <- sirfunctions_io("list", NULL, analytic_folder,
+  edav = use_edav
+)
+
+if (nrow(prev_table) > 0) {
+  prev_table <- prev_table |>
+    dplyr::filter(grepl(raw_data_recent_name, name)) |>
+    dplyr::select("file" = "name", "size", "ctime" = "lastModified")
+} else {
+  # if empty, make sure to recreate tibble to the right format
+  prev_table <- tibble(
+    "file" = NA,
+    "size" = NA,
+    "ctime" = NA
+  ) |>
+    dplyr::mutate(file = as.character(file),
+                  size = as.double(size),
+                  ctime = as_datetime(ctime)) |>
+    dplyr::filter(!is.na(file))
+}
+
+if (recreate.static.files | force.new.run) {
+  force.new.run <- T
+  create.cache <- T
+}
+
+
+if (!force.new.run) {
+
+  # Check if using the local cache is sufficient
+  if (use_edav & size == "small" & local_caching) {
+    if (!recache_raw_data(analytic_folder, use_edav, output_format)) {
+
+      raw.data <- sirfunctions_io("read", NULL, file.path(rappdirs::user_data_dir("sirfunctions"),
+                                                          paste0("raw_data", output_format)),
+                                  edav = FALSE)
+
+      cli::cli_process_start("Checking for duplicates in datasets.")
+      raw.data <- duplicate_check(raw.data)
+      cli::cli_process_done()
+      if (attach.spatial.data) {
+        if (!recache_spatial_data(analytic_folder, spatial_folder,
+                                  use_edav, output_format)) {
+          spatial.data <- sirfunctions_io("read", NULL, file.path(rappdirs::user_data_dir("sirfunctions"),
+                                                                  paste0("spatial_data", output_format)),
+                                          edav = FALSE)
+          raw.data$global.ctry <- spatial.data$global.ctry
+          raw.data$global.prov <- spatial.data$global.prov
+          raw.data$global.dist <- spatial.data$global.dist
+          raw.data$roads <- spatial.data$roads
+          raw.data$cities <- spatial.data$cities
+
+          return(raw.data)
+        } else {
+          spatial.data <- sirfunctions_io("read", NULL, file.path(analytic_folder, spatial_data_name),
+                                          edav = use_edav)
+          sirfunctions_io("write", NULL, file.path(rappdirs::user_data_dir("sirfunctions"),
+                                                                  paste0("spatial_data", output_format)),
+                                          obj = spatial.data,
+                                          edav = FALSE)
+          edav_spatial_timestamp <- sirfunctions_io(
+            "read",
+            NULL,
+            file.path(analytic_folder, paste0("spatial_timestamp", output_format)),
+            edav = use_edav
+          )
+          sirfunctions_io("write", NULL, file.path(rappdirs::user_data_dir("sirfunctions"),
+                                                   paste0("spatial_timestamp", output_format)),
+                          obj = edav_spatial_timestamp,
+                          edav = FALSE)
+
+          raw.data$global.ctry <- spatial.data$global.ctry
+          raw.data$global.prov <- spatial.data$global.prov
+          raw.data$global.dist <- spatial.data$global.dist
+          raw.data$roads <- spatial.data$roads
+          raw.data$cities <- spatial.data$cities
+
+          return(raw.data)
+        }
+      } else {
+        return(raw.data)
+      }
+    }
+  }
+
+  if (use_edav) {
+    cli::cli_alert_info(paste0("Downloading most recent active polio data from ", small_year," onwards"))
+  } else {
+    cli::cli_alert_info(paste0("Loading most recent active polio data from ", small_year," onwards"))
+  }
+
+  raw.data.small.pull <- sirfunctions_io("read", NULL, prev_table$file, edav = use_edav)
+
+  if (size == "small") {
+    raw.data <- raw.data.small.pull
+  }
+
+  if (size == "medium") {
+    prev_table <- sirfunctions_io("list", NULL, analytic_folder, edav = use_edav) |>
+      dplyr::filter(grepl(raw_data_medium_name, name)) |>
+      dplyr::select("file" = "name", "size", "ctime" = "lastModified")
+
+    if (use_edav) {
+      cli::cli_alert_info(paste0("Downloading static polio data from ", med_year, "-", small_year))
+    } else {
+      cli::cli_alert_info(paste0("Loading static polio data from ", med_year, "-", small_year))
+    }
+
+    raw.data.medium.pull <- sirfunctions_io("read", NULL, prev_table$file, edav = use_edav)
+
+    raw.data <- split_concat_raw_data(
+      action = "concat",
+      raw.data.small.pull = raw.data.small.pull,
+      raw.data.medium.pull = raw.data.medium.pull
+    )
+  }
+
+  if (size == "large") {
+    prev_table <- sirfunctions_io("list", NULL, analytic_folder,
+                                  edav = use_edav, full_names = TRUE
+    ) |>
+      dplyr::filter(grepl(raw_data_medium_name, name)) |>
+      dplyr::select("file" = "name", "size", "ctime" = "lastModified")
+
+    if (use_edav) {
+      cli::cli_alert_info(paste0("Downloading static polio data from ", med_year, "-", small_year))
+    } else {
+      cli::cli_alert_info(paste0("Loading static polio data from ", med_year, "-", small_year))
+    }
+
+    raw.data.medium.pull <- sirfunctions_io("read", NULL, prev_table$file, edav = use_edav)
+
+    prev_table <- sirfunctions_io("list", NULL, analytic_folder, edav = use_edav) |>
+      dplyr::filter(grepl(raw_data_2000_name, name)) |>
+      dplyr::select("file" = "name", "size", "ctime" = "lastModified")
+
+    if (use_edav) {
+      cli::cli_alert_info(paste0("Downloading static polio data from 2001-", med_year))
+    } else {
+      cli::cli_alert_info(paste0("Loading static polio data from 2001-", med_year))
+    }
+
+    raw.data.large.pull <- sirfunctions_io("read", NULL, prev_table$file, edav = use_edav)
+
+    raw.data <- split_concat_raw_data(
+      action = "concat",
+      raw.data.small.pull = raw.data.small.pull,
+      raw.data.medium.pull = raw.data.medium.pull,
+      raw.data.large.pull = raw.data.large.pull
+    )
+  }
+
+  # Only cache the small dataset, which we use in 90% of the case
+  if (use_edav & local_caching) {
+    raw_data_timestamp_exists <- invisible(sirfunctions_io(
+      "exists.file",
+      NULL,
+      file.path(analytic_folder, paste0("raw_data_timestamp", output_format)),
+      edav = use_edav
+    ))
+
+  } else {
+    raw_data_timestamp_exists <- FALSE
+  }
+  if (size == "small" & raw_data_timestamp_exists & local_caching) {
+    cli::cli_process_start("Caching global polio data locally")
+
+    if (!dir.exists(rappdirs::user_data_dir("sirfunctions"))) {
+      dir.create(rappdirs::user_data_dir("sirfunctions"), recursive = TRUE)
+    }
+
+    sirfunctions_io("write", NULL,
+                    file.path(rappdirs::user_data_dir("sirfunctions"), paste0("raw_data", output_format)),
+                    obj = raw.data,
+                    edav = FALSE)
+    # Add edav tag file to local cache dir
+    edav_raw_data_timestamp <- sirfunctions_io(
+      "read",
+      NULL,
+      file.path(analytic_folder, paste0("raw_data_timestamp", output_format)),
+      edav = use_edav
+    )
+
+    sirfunctions_io("write", NULL,
+                    file.path(rappdirs::user_data_dir("sirfunctions"), paste0("raw_data_timestamp", output_format)),
+                    obj = edav_raw_data_timestamp,
+                    edav = FALSE)
+
+    cli::cli_process_done()
+  }
+
+  cli::cli_process_done()
+
+  cli::cli_process_start("Checking for duplicates in datasets.")
+  raw.data <- duplicate_check(raw.data)
+  cli::cli_process_done()
+
+  if (attach.spatial.data) {
+
+    # Don't recache spatial if up to date
+    if (!recache_spatial_data(analytic_folder, spatial_folder,
+                              use_edav, output_format) & local_caching) {
+      spatial.data <- sirfunctions_io("read", NULL, file.path(rappdirs::user_data_dir("sirfunctions"),
+                                                              paste0("spatial_data", output_format)),
+                                      edav = FALSE)
+      raw.data$global.ctry <- spatial.data$global.ctry
+      raw.data$global.prov <- spatial.data$global.prov
+      raw.data$global.dist <- spatial.data$global.dist
+      raw.data$roads <- spatial.data$roads
+      raw.data$cities <- spatial.data$cities
+
+      return(raw.data)
+    }
+
+    if (use_edav) {
+      cli::cli_process_start("Downloading and attaching spatial data")
+    } else {
+      cli::cli_process_start("Loading and attaching spatial data")
+    }
+
+    spatial.data <- sirfunctions_io("read", NULL,
+                                      file.path(analytic_folder, spatial_data_name),
+                                      edav = use_edav
+      )
+
+    raw.data$global.ctry <- spatial.data$global.ctry
+    raw.data$global.prov <- spatial.data$global.prov
+    raw.data$global.dist <- spatial.data$global.dist
+    raw.data$roads <- spatial.data$roads
+    raw.data$cities <- spatial.data$cities
+
+    cli::cli_process_done()
+
+    if (use_edav & local_caching) {
+      spatial_timestamp_exists <- sirfunctions_io(
+        "exists.file",
+        NULL,
+        file.path(analytic_folder, paste0("spatial_timestamp", output_format)),
+        edav = use_edav
+      )
+    } else {
+      spatial_timestamp_exists <- FALSE
+    }
+
+    if (recache_spatial_data(analytic_folder, spatial_folder,
+                             use_edav, output_format) & spatial_timestamp_exists & local_caching) {
+      sirfunctions_io("write",
+                      NULL,
+                      file.path(rappdirs::user_data_dir("sirfunctions"),
+                                paste0("spatial_data",
+                                output_format)),
+                      obj = spatial.data,
+                      edav = FALSE)
+
+      spatial_processed_tag <- sirfunctions_io("read",
+                                               NULL,
+                                               file.path(analytic_folder,
+                                                         paste0("spatial_timestamp", output_format)),
+                                               edav = use_edav)
+      sirfunctions_io("write",
+                      NULL,
+                      file.path(rappdirs::user_data_dir("sirfunctions"),
+                                paste0("spatial_timestamp", output_format)),
+                      obj = spatial_processed_tag,
+                      edav = FALSE)
+    }
+  }
+
+  return(raw.data)
+
+} else {
+
+  # Check that the required folders have data
+  for (folder in c(analytic_folder, polis_data_folder, spatial_folder,
+                   coverage_folder, pop_folder)) {
+
+    # get_all_polio_data will recreate the analytic folder if it's missing
+    switch(basename(folder),
+           "analytic" = {
+             if (!sirfunctions_io("exists.dir", NULL, folder, edav = use_edav)) {
+               cli::cli_alert_info("No analytics folder found. Will create a new one.")
+               sirfunctions_io("create.dir", NULL, folder, edav = use_edav)
+             }
+           },
+           "polis" = {
+             if (!sirfunctions_io("exists.dir", NULL, folder, edav = use_edav)) {
+               cli::cli_alert_info("Creating polis folder in the data folder")
+               sirfunctions_io("create.dir", NULL, folder, edav = use_edav)
+             } else {
+               cli::cli_alert_info("Moving updated polis data to the data folder")
+             }
+
+
+             create_polis_data_folder(
+              data_folder,
+              polis_folder,
+              core_ready_folder,
+              use_edav,
+              archive,
+              keep_n_archives
+            )
+
+           },
+           "spatial" = {
+             if (!sirfunctions_io("exists.dir", NULL, folder, edav = use_edav)) {
+               cli::cli_abort(paste0("No spatial data found in the data folder.",
+                                     " Ensure that the output folder when running ",
+                                     " tidypolis::process_spatial() is ",
+                                     spatial_folder),
+               )
+             }
+           },
+           "coverage" = {
+             if (!sirfunctions_io("exists.dir", NULL, folder, edav = use_edav)) {
+               cli::cli_abort(paste0("Coverage data not found.",
+                                     "Please create and add coverage data in: ",
+                                     folder))
+             }
+           },
+           "pop" = {
+             if (!sirfunctions_io("exists.dir", NULL, folder, edav = use_edav)) {
+               cli::cli_abort(paste0("Population data not found. ",
+                                     "Preprocessing of population files may be required. ",
+                                     "Please create a pop data folder and add data in: ",
+                                     folder))
+             }
+           }
+    )
+  }
+
+  if (use_edav) {
+    cli::cli_h1("Testing download times")
+    download_metrics <- test_EDAV_connection(return_list = T)
+  }
+
+  # use the truncated AFP file
+  afp.trunc <- T
+
+  if (recreate.static.files) {
+    afp.trunc <- F
+  }
+
+  dl_table <- dplyr::bind_rows(
+    sirfunctions_io("list", NULL, polis_data_folder, edav = use_edav),
+    sirfunctions_io("list", NULL, spatial_folder, edav = use_edav),
+    sirfunctions_io("list", NULL, coverage_folder, edav = use_edav),
+    sirfunctions_io("list", NULL, pop_folder, edav = use_edav),
+    sirfunctions_io("list", NULL, polis_folder, edav = use_edav) |>
+      dplyr::filter(grepl("cache", name))
+  ) |>
+    dplyr::filter(!is.na(size)) |>
+    dplyr::select("file" = "name", "size")
+
+  if (use_edav) {
+    dl_table <- dl_table |>
+      dplyr::mutate(
+        "dl_time_sec" = size / download_metrics$size * download_metrics$d
+      )
+  }
+
+  if (afp.trunc) {
+    dl_table <- dl_table |>
+      dplyr::filter(!grepl("afp_linelist_2001", file))
+  } else {
+    dl_table <- dl_table |>
+      dplyr::filter(!grepl("afp_linelist_2019", file))
+  }
+
+  file_size <- dl_table$size |> sum()
+
+  if (use_edav) {
+    download_time <- dl_table$dl_time_sec |> sum()
+  }
+
+  if (use_edav) {
+    cli::cli_h1("Downloading POLIS Data")
+  } else {
+    cli::cli_h1("Loading POLIS Data")
+  }
+
+  raw.data <- list()
+  spatial.data <- list()
+
+  # Check if spatial data needs to be redownloaded from the analytics folder
+  spatial_timestamp_exists <- sirfunctions_io(
+    "exists.file",
+    NULL,
+    file.path(analytic_folder, paste0("spatial_timestamp", output_format)),
+    edav = use_edav
+  )
+
+  if (spatial_timestamp_exists) {
+    # Check if it's recent or needs updating
+    edav_spatial_timestamp <- sirfunctions_io(
+      "read",
+      NULL,
+      file.path(analytic_folder, paste0("spatial_timestamp", output_format)),
+      edav = use_edav
+    ) |>
+      dplyr::select(name, lastModifiedEDAV = lastModified)
+
+    edav_spatial_folder_info <- sirfunctions_io(
+      "list",
+      NULL,
+      file.path(spatial_folder),
+      edav = use_edav
+    ) |>
+      dplyr::select(name, lastModified)
+
+    spatial_timestamp_comparison <- dplyr::left_join(edav_spatial_timestamp,
+                                                     edav_spatial_folder_info) |>
+      dplyr::mutate(updated = ifelse(lastModifiedEDAV == lastModified, TRUE, FALSE)) |>
+      dplyr::pull(updated) |> sum(na.rm = TRUE)
+  } else {
+
+    spatial_timestamp_comparison <- 0
+
+  }
+
+  if (spatial_timestamp_comparison == 3) {
+    cli::cli_alert_success("Spatial data in the analytic folder is up to date. Loading from cache...")
+    spatial.data <- sirfunctions_io(
+      "read",
+      NULL,
+      file.path(analytic_folder, spatial_data_name),
+      edav = use_edav
+    )
+  } else {
+    if (spatial_timestamp_exists) {
+      cli::cli_alert_warning("Spatial data in the analytic folder is outdated. Recreating from the spatial folder")
+    } else {
+      cli::cli_alert_warning("No spatial timestamp exists. Recreating from the spatial folder")
+    }
+
+    cli::cli_process_start("1) Loading country shape files")
+    spatial.data$global.ctry <- load_clean_ctry_sp(
+      fp = file.path(spatial_folder, global_ctry_sf_name),
+      edav = use_edav
+    )
+    cli::cli_process_done()
+
+    cli::cli_process_start("2) Loading province shape files")
+    spatial.data$global.prov <- load_clean_prov_sp(
+      fp = file.path(spatial_folder, global_prov_sf_name),
+      edav = use_edav
+    )
+    cli::cli_process_done()
+
+    cli::cli_process_start("3) Loading district shape files")
+    spatial.data$global.dist <- load_clean_dist_sp(
+      fp = file.path(spatial_folder, global_dist_sf_name),
+      edav = use_edav
+    )
+    cli::cli_process_done()
+  }
+
+  cli::cli_process_start("4) Loading AFP line list data (This file is almost 3GB and can take a while)")
+  raw.data$afp <-
+    sirfunctions_io("read", NULL, file_loc = dplyr::filter(
+      dl_table,
+      grepl("afp", file)
+    ) |>
+      dplyr::pull(file), edav = use_edav) |>
+    dplyr::filter(surveillancetypename == "AFP") |>
+    dplyr::mutate(
+      cdc.classification.all2 = dplyr::case_when(
+        final.cell.culture.result == "Not received in lab" &
+          cdc.classification.all == "PENDING" ~ "LAB PENDING",
+        TRUE ~ cdc.classification.all
+      ),
+      hot.case = ifelse(
+        paralysis.asymmetric == "Yes" &
+          paralysis.onset.fever == "Yes" &
+          paralysis.rapid.progress == "Yes",
+        1,
+        0
+      ),
+      hot.case = ifelse(is.na(hot.case), 99, hot.case)
+    )
+
+  cli::cli_process_done()
+
+  cli::cli_process_start("Processing AFP data for analysis")
+
+  raw.data$afp.epi <- raw.data$afp |>
+    dplyr::mutate(epi.week = lubridate::epiweek(dateonset)) |>
+    dplyr::group_by(place.admin.0, epi.week, yronset, cdc.classification.all2) |>
+    dplyr::summarize(afp.cases = dplyr::n(),
+                     .groups = "drop") |>
+    dplyr::mutate(epiweek.year = paste(yronset, epi.week, sep = "-")) |>
+    # manual fix of epi week
+    dplyr::mutate(epi.week = ifelse(epi.week == 52 &
+      yronset == 2022, 1, epi.week))
+
+  # factoring cdc classification to have an order we like in stacked bar chart
+  raw.data$afp.epi$cdc.classification.all2 <-
+    factor(
+      raw.data$afp.epi$cdc.classification.all2,
+      levels = c(
+        "WILD 1",
+        "cVDPV 2",
+        "VDPV 2",
+        "cVDPV 1",
+        "VDPV 1",
+        "COMPATIBLE",
+        "PENDING",
+        "LAB PENDING",
+        "NPAFP",
+        "NOT-AFP",
+        "UNKNOWN",
+        "aVDPV 1",
+        "aVDPV 3",
+        "cVDPV1andcVDPV2",
+        "CombinationWild1-cVDPV 2",
+        "aVDPV 2",
+        "VDPV 3",
+        "iVDPV 2",
+        "VDPV1andcVDPV2",
+        "VAPP",
+        "cVDPV 3",
+        "iVDPV 3",
+        "WILD 3",
+        "WILD1andWILD3",
+        "iVDPV 1",
+        "cVDPV2andcVDPV3"
+      ),
+      labels = c(
+        "WILD 1",
+        "cVDPV 2",
+        "VDPV 2",
+        "cVDPV 1",
+        "VDPV 1",
+        "COMPATIBLE",
+        "PENDING",
+        "LAB PENDING",
+        "NPAFP",
+        "NOT-AFP",
+        "UNKNOWN",
+        "aVDPV 1",
+        "aVDPV 3",
+        "cVDPV1andcVDPV2",
+        "CombinationWild1-cVDPV 2",
+        "aVDPV 2",
+        "VDPV 3",
+        "iVDPV 2",
+        "VDPV1andcVDPV2",
+        "VAPP",
+        "cVDPV 3",
+        "iVDPV 3",
+        "WILD 3",
+        "WILD1andWILD3",
+        "iVDPV 1",
+        "cVDPV2andcVDPV3"
+      )
+    )
+
+  raw.data$para.case <- raw.data$afp |>
+    dplyr::filter(
+      stringr::str_detect(cdc.classification.all2, "VDPV|WILD|COMPATIBLE")
+    ) |>
+    dplyr::mutate(yronset = ifelse(is.na(yronset) == T, 2022, yronset)) # this fix was for the manually added MOZ case
+  cli::cli_process_done()
+
+
+  cli::cli_process_start("5) Loading population data")
+  raw.data$dist.pop <-
+    sirfunctions_io("read", NULL,
+      dplyr::filter(dl_table, grepl("dist.pop", file)) |>
+        dplyr::pull(file),
+      edav = use_edav
+    ) |>
+    dplyr::ungroup()
+
+  raw.data$prov.pop <-
+    sirfunctions_io("read", NULL,
+      file_loc = dplyr::filter(dl_table, grepl("prov.pop", file)) |>
+        dplyr::pull(file), edav = use_edav
+    ) |>
+    dplyr::ungroup()
+
+  raw.data$ctry.pop <-
+    sirfunctions_io("read", NULL,
+      dplyr::filter(dl_table, grepl("ctry.pop", file)) |>
+        dplyr::pull(file),
+      edav = use_edav
+    ) |>
+    dplyr::ungroup()
+  cli::cli_process_done()
+
+
+  cli::cli_process_start("6) Loading coverage data")
+  raw.data$ctry.coverage <- sirfunctions_io("read", NULL,
+                                            file_loc = dplyr::filter(dl_table, grepl("ctry_cov", file)) |>
+                                              dplyr::pull(file), edav = use_edav
+  )
+
+  raw.data$prov.coverage <- sirfunctions_io("read", NULL,
+                                            file_loc = dplyr::filter(dl_table, grepl("prov_cov", file)) |>
+                                              dplyr::pull(file), edav = use_edav
+  )
+
+  raw.data$dist.coverage <- sirfunctions_io("read", NULL,
+                                            file_loc = dplyr::filter(dl_table, grepl("dist_cov", file)) |>
+                                              dplyr::pull(file), edav = use_edav
+  )
+
+  cli::cli_process_done()
+
+  cli::cli_process_start("7) Loading ES data")
+
+  raw.data$es <-
+    sirfunctions_io("read", NULL,
+      file_loc = dplyr::filter(dl_table, grepl("/es_2001", file)) |>
+        dplyr::pull(file), edav = use_edav
+    )
+  cli::cli_process_done()
+
+  cli::cli_process_start("8) Loading SIA data")
+  raw.data$sia <-
+    sirfunctions_io("read", NULL,
+      file_loc = dplyr::filter(dl_table, grepl("sia", file)) |>
+        dplyr::pull(file), edav = use_edav
+    )
+
+  cli::cli_process_done()
+
+  cli::cli_process_start("9) Loading all positives")
+  raw.data$pos <-
+    sirfunctions_io("read", NULL,
+      file_loc = dplyr::filter(dl_table, grepl("/pos", file)) |>
+        dplyr::pull(file), edav = use_edav
+    )
+
+  cli::cli_process_done()
+
+  cli::cli_process_start("10) Loading other surveillance linelist")
+  raw.data$other <-
+    sirfunctions_io("read", NULL,
+      file_loc = dplyr::filter(dl_table, grepl("/other", file)) |>
+        dplyr::pull(file), edav = use_edav
+    )
+
+  cli::cli_process_done()
+
+  cli::cli_process_start("11) Loading road network data")
+  spatial.data$roads <- sirfunctions_io("read", NULL,
+    file_loc = dplyr::filter(dl_table, grepl("roads.rds", file)) |>
+      dplyr::pull(file), edav = use_edav
+  )
+  cli::cli_process_done()
+
+  cli::cli_process_start("12) Loading city spatial data")
+  spatial.data$cities <- sirfunctions_io("read", NULL,
+    file_loc = dplyr::filter(dl_table, grepl("cities.rds", file)) |>
+      dplyr::pull(file), edav = use_edav
+  )
+  cli::cli_process_done()
+
+  cli::cli_process_start("13) Creating Metadata object")
+
+  polis.cache <- sirfunctions_io("read", NULL,
+    file_loc = dplyr::filter(dl_table, grepl("cache.rds", file)) |>
+      dplyr::pull(file), edav = use_edav
+  ) |>
+    dplyr::mutate(last_sync = as.Date(last_sync))
+
+  raw.data$metadata$download_time <- max(polis.cache$last_sync, na.rm = TRUE)
+
+  raw.data$metadata$processed_time <- sirfunctions_io("list", NULL,
+    file.path(polis_folder, "data", core_ready_folder),
+    edav = use_edav
+  ) |>
+    dplyr::filter(grepl("positives_2001-01-01", name)) |>
+    dplyr::select("ctime" = "lastModified") |>
+    dplyr::mutate(ctime = as.Date(ctime)) |>
+    dplyr::pull(ctime)
+
+  raw.data$metadata$user <- polis.cache |>
+    dplyr::filter(table == "virus") |>
+    dplyr::pull(last_user)
+
+  raw.data$metadata$most_recent_pos <- max(raw.data$pos$dateonset, na.rm = TRUE)
+  raw.data$metadata$most_recent_pos_loc <- raw.data$pos |>
+    dplyr::arrange(dplyr::desc(dateonset)) |>
+    dplyr::slice(1) |>
+    dplyr::pull(place.admin.0)
+
+
+  raw.data$metadata$most_recent_afp <- max(raw.data$afp$dateonset, na.rm = TRUE)
+  raw.data$metadata$most_recent_afp_loc <- raw.data$afp |>
+    dplyr::arrange(dplyr::desc(dateonset)) |>
+    dplyr::slice(1) |>
+    dplyr::pull(place.admin.0)
+
+
+  raw.data$metadata$most_recent_env <- max(raw.data$es$collect.date, na.rm = TRUE)
+  raw.data$metadata$most_recent_env_loc <- raw.data$es |>
+    dplyr::arrange(dplyr::desc(collect.date)) |>
+    dplyr::slice(1) |>
+    dplyr::pull(ADM0_NAME)
+
+
+  raw.data$metadata$most_recent_sia <- max(raw.data$sia$sub.activity.start.date)
+  raw.data$metadata$most_recent_sia_code <- raw.data$sia |>
+    dplyr::arrange(dplyr::desc(sub.activity.start.date)) |>
+    dplyr::slice(1) |>
+    dplyr::pull(sia.code)
+  raw.data$metadata$most_recent_sia_location <- raw.data$sia |>
+    dplyr::arrange(dplyr::desc(sub.activity.start.date)) |>
+    dplyr::slice(1) |>
+    dplyr::pull(place.admin.0)
+  raw.data$metadata$most_recent_sia_vax <- raw.data$sia |>
+    dplyr::arrange(dplyr::desc(sub.activity.start.date)) |>
+    dplyr::slice(1) |>
+    dplyr::pull(vaccine.type)
+
+  raw.data$metadata$most_recent_vdpv_class_change_date <- raw.data$pos$vdpvclassificationchangedate |>
+    lubridate::as_date() |>
+    max(na.rm = T)
+
+  rm(polis.cache)
+
+  cli::cli_process_done()
+
+  cli::cli_process_start("14) Clearing out unused memory")
+  gc()
+  cli::cli_process_done()
+}
+
+if (create.cache) {
+  cli::cli_process_start("15) Caching processed data")
+
+  out <- split_concat_raw_data(action = "split", split.years = c(2000, med_year, small_year), raw.data.all = raw.data)
+
+  out_files <- out$split.years |>
+    dplyr::mutate(
+      file_name = ifelse(grepl(current_year, tag), "recent", stringr::str_replace_all(tag, "-", ".")),
+      file_name = paste0("raw.data.", file_name, output_format)
+    )
+
+  if (!recreate.static.files) {
+    out_files <- out_files |> dplyr::filter(grepl("recent", file_name))
+  }
+
+  if (!use_archived_data) {
+    for (i in 1:nrow(out_files)) {
+      sirfunctions_io("write", NULL,
+                      file_loc = file.path(analytic_folder, dplyr::pull(out_files[i, ], file_name)),
+                      obj = out[[dplyr::pull(out_files[i, ], tag)]],
+                      edav = use_edav
+      )}
+    }
+
+# set up path for spatial df
+  sp_file_path <- file.path(analytic_folder, paste0("spatial.data", output_format))
+
+  sirfunctions_io("write", NULL,
+    file_loc = sp_file_path,
+    obj = spatial.data, edav = use_edav
+  )
+
+  # Create tags only if not using "archived" version
+  if (use_edav & !use_archived_data) {
+    # Create raw data file tag for future comparisons
+    sirfunctions_io("write", NULL,
+                    file_loc = file.path(analytic_folder, paste0("raw_data_timestamp", output_format)),
+                    obj = Sys.time())
+
+    # Create spatial data file tag for future comparisons
+    spatial_files <- sirfunctions_io("list",
+                                     NULL,
+                                     spatial_folder,
+                                     edav = use_edav,
+                                     full_names = TRUE)
+
+    edav_spatial_timestamp <- spatial_files |>
+      dplyr::filter(stringr::str_detect(name, "global."),
+                    stringr::str_ends(name, output_format)) |>
+      dplyr::select(name, lastModified)
+
+    sirfunctions_io(
+      "write",
+      NULL,
+      file.path(analytic_folder, paste0("spatial_timestamp", output_format)),
+      obj = edav_spatial_timestamp,
+      edav = use_edav
+    )
+  }
+
+  cli::cli_process_done()
+}
+
+raw_data_cut_size <- switch(size,
+                            "small" = small_year,
+                            "medium" = med_year,
+                            "large" = 2000)
+
+raw.data <- split_concat_raw_data(action = "split",
+                                  split.years = raw_data_cut_size,
+                                  raw.data.all = raw.data)[[1]]
+
+cli::cli_process_start("Checking for duplicates in datasets.")
+raw.data <- duplicate_check(raw.data)
+cli::cli_process_done()
+
+if (attach.spatial.data) {
+  raw.data$global.ctry <- spatial.data$global.ctry
+  raw.data$global.prov <- spatial.data$global.prov
+  raw.data$global.dist <- spatial.data$global.dist
+  raw.data$roads <- spatial.data$roads
+  raw.data$cities <- spatial.data$cities
+}
+
+if (use_archived_data) {
+  cli::cli_alert_success(paste0("Successfully recreated global polio data from ",
+                                basename(polis_data_folder)))
+}
+
+return(raw.data)
+
+}
\ No newline at end of file

From c128e56795165429ff64e27e3f2e40ca8dab07f7 Mon Sep 17 00:00:00 2001
From: Mervin Keith Cuadera <40894971+mcuadera@users.noreply.github.com>
Date: Tue, 31 Mar 2026 16:06:02 -0400
Subject: [PATCH 22/28] update docs to reflect get_all_polio_data moving to its
 own script

---
 man/build_parquet_raw_data.Rd | 2 +-
 man/get_all_polio_data.Rd     | 2 +-
 man/upload_parquet_to_edav.Rd | 2 +-
 3 files changed, 3 insertions(+), 3 deletions(-)

diff --git a/man/build_parquet_raw_data.Rd b/man/build_parquet_raw_data.Rd
index 96c6240e..c0d0a7a6 100644
--- a/man/build_parquet_raw_data.Rd
+++ b/man/build_parquet_raw_data.Rd
@@ -6,8 +6,8 @@
 \usage{
 build_parquet_raw_data(
   path = "GID/PEB/SIR/Data/analytic",
-  from_edav = TRUE,
   dataset = "all",
+  from_edav = TRUE,
   container = get_azure_storage_connection()
 )
 }
diff --git a/man/get_all_polio_data.Rd b/man/get_all_polio_data.Rd
index 93b44408..a09aa17f 100644
--- a/man/get_all_polio_data.Rd
+++ b/man/get_all_polio_data.Rd
@@ -1,5 +1,5 @@
 % Generated by roxygen2: do not edit by hand
-% Please edit documentation in R/dal.R
+% Please edit documentation in R/get_all_polio_data.R
 \name{get_all_polio_data}
 \alias{get_all_polio_data}
 \title{Retrieve all pre-processed polio data}
diff --git a/man/upload_parquet_to_edav.Rd b/man/upload_parquet_to_edav.Rd
index 41064538..97c0a633 100644
--- a/man/upload_parquet_to_edav.Rd
+++ b/man/upload_parquet_to_edav.Rd
@@ -4,7 +4,7 @@
 \alias{upload_parquet_to_edav}
 \title{Uploads a local parquet folder to EDAV}
 \usage{
-upload_parquet_to_edav(src, dest, container = NULL)
+upload_parquet_to_edav(src, dest, container = get_azure_storage_connection())
 }
 \arguments{
 \item{src}{\code{str} Local path to the parquet folder.}

From 36c0ae787f8746a8a373206fea1fcd44a1fc9335 Mon Sep 17 00:00:00 2001
From: Mervin Keith Cuadera <40894971+mcuadera@users.noreply.github.com>
Date: Wed, 1 Apr 2026 11:51:45 -0400
Subject: [PATCH 23/28] minimal example of get_all_polio_data() using parquet

---
 NAMESPACE                                 |   1 +
 R/get_all_polio_data_2.R                  | 640 ++++++++++++++++++++++
 man/cache_raw_data.Rd                     |  22 +
 man/check_data_folder.Rd                  |  32 ++
 man/check_spatial_data_for_processing.Rd  |  20 +
 man/create_raw_data_tags.Rd               |  20 +
 man/get_all_polio_data_2.Rd               |  45 ++
 man/list_required_files_for_processing.Rd |  22 +
 man/process_afp_epi_raw_data.Rd           |  18 +
 man/process_afp_raw_data.Rd               |  20 +
 man/process_metadata_raw_data.Rd          |  32 ++
 man/process_paralytic_raw_data.Rd         |  18 +
 man/pull_data_from_dl_table.Rd            |  22 +
 man/reprocess_polio_data.Rd               |  32 ++
 14 files changed, 944 insertions(+)
 create mode 100644 R/get_all_polio_data_2.R
 create mode 100644 man/cache_raw_data.Rd
 create mode 100644 man/check_data_folder.Rd
 create mode 100644 man/check_spatial_data_for_processing.Rd
 create mode 100644 man/create_raw_data_tags.Rd
 create mode 100644 man/get_all_polio_data_2.Rd
 create mode 100644 man/list_required_files_for_processing.Rd
 create mode 100644 man/process_afp_epi_raw_data.Rd
 create mode 100644 man/process_afp_raw_data.Rd
 create mode 100644 man/process_metadata_raw_data.Rd
 create mode 100644 man/process_paralytic_raw_data.Rd
 create mode 100644 man/pull_data_from_dl_table.Rd
 create mode 100644 man/reprocess_polio_data.Rd

diff --git a/NAMESPACE b/NAMESPACE
index 7368b8d1..07935ca6 100644
--- a/NAMESPACE
+++ b/NAMESPACE
@@ -102,6 +102,7 @@ export(generate_timeliness_maps)
 export(generate_timely_det_violin)
 export(generate_timely_ship_violin)
 export(get_all_polio_data)
+export(get_all_polio_data_2)
 export(get_azure_storage_connection)
 export(get_cdc_childvaxview_data)
 export(get_constant)
diff --git a/R/get_all_polio_data_2.R b/R/get_all_polio_data_2.R
new file mode 100644
index 00000000..3a0b939a
--- /dev/null
+++ b/R/get_all_polio_data_2.R
@@ -0,0 +1,640 @@
+# Helper functions
+
+#' Checks for required subfolders in the data folder
+#'
+#' @param data_folder `str` Path to the data folder.
+#' @param polis_folder `str` POLIS folder with preprocessed data.
+#' @param core_ready_folder `str` Name of the core ready folder. Need to be specified if preprocessing specific regions, which have their own core ready folder.
+#' @param use_edav `logical` Whether to use EDAV or not.
+#' @param cache `logical` Whether to cache the preprocessed data to data/polis subfolder.
+#'
+#' @returns `list` List of paths to the specific subfolders.
+#'
+#' @keywords internal
+check_data_folder <- function(data_folder, polis_folder, core_ready_folder, use_edav, cache) {
+  # Sub-folders expected inside the data folder
+  analytic_folder <- file.path(data_folder, "analytic")
+  polis_data_folder <- file.path(data_folder, "polis")
+  spatial_folder <- file.path(data_folder, "spatial")
+  coverage_folder <- file.path(data_folder, "coverage")
+  pop_folder <- file.path(data_folder, "pop")
+
+  # Check each required folder, dispatching on the folder name
+  for (folder in c(analytic_folder, polis_data_folder, spatial_folder,
+                   coverage_folder, pop_folder)) {
+
+    # The analytic and polis folders are created when missing; the others abort
+    switch(basename(folder),
+           "analytic" = {
+             if (!sirfunctions_io("exists.dir", NULL, folder, edav = use_edav)) {
+               cli::cli_alert_info("No analytics folder found. Will create a new one.")
+               sirfunctions_io("create.dir", NULL, folder, edav = use_edav)
+             }
+           },
+           "polis" = {
+             if (!sirfunctions_io("exists.dir", NULL, folder, edav = use_edav)) {
+               cli::cli_alert_info("Creating polis folder in the data folder")
+               sirfunctions_io("create.dir", NULL, folder, edav = use_edav)
+             } else {
+               cli::cli_alert_info("Moving updated polis data to the data folder")
+             }
+             # Runs whether or not the polis folder already existed.
+             # NOTE(review): confirm what the trailing Inf argument controls.
+             create_polis_data_folder(
+               data_folder,
+               polis_folder,
+               core_ready_folder,
+               use_edav,
+               cache,
+               Inf
+             )
+
+           },
+           "spatial" = {
+             if (!sirfunctions_io("exists.dir", NULL, folder, edav = use_edav)) {
+               cli::cli_abort(paste0("No spatial data found in the data folder.",
+                                     " Ensure that the output folder when running",
+                                     " tidypolis::process_spatial() is ",
+                                     spatial_folder)
+               )
+             }
+           },
+           "coverage" = {
+             if (!sirfunctions_io("exists.dir", NULL, folder, edav = use_edav)) {
+               cli::cli_abort(paste0("Coverage data not found. ",
+                                     "Please create and add coverage data in: ",
+                                     folder))
+             }
+           },
+           "pop" = {
+             if (!sirfunctions_io("exists.dir", NULL, folder, edav = use_edav)) {
+               cli::cli_abort(paste0("Population data not found. ",
+                                     "Preprocessing of population files may be required. ",
+                                     "Please create a pop data folder and add data in: ",
+                                     folder))
+             }
+           }
+    )
+  }
+
+  return(list(analytic_folder = analytic_folder,
+              polis_data_folder = polis_data_folder,
+              spatial_folder = spatial_folder,
+              coverage_folder = coverage_folder,
+              pop_folder = pop_folder))
+
+}
+
+#' Creates the "download table", with paths to files required for recreating static files
+#'
+#' @param data_folders_paths `list` Output of [check_data_folder()].
+#' @param polis_folder `str` POLIS folder containing preprocessed data. NOT the subfolder under the data folder.
+#' @param use_edav `logical` Whether to use EDAV or not.
+#'
+#' @returns `tibble` Dataset containing paths to required files.
+#'
+#' @keywords internal
+list_required_files_for_processing <- function(data_folders_paths, polis_folder, use_edav) {
+  dl_table <- dplyr::bind_rows( # stack one listing per source folder
+    sirfunctions_io( # polis subfolder of the data folder
+      "list",
+      NULL,
+      data_folders_paths$polis_data_folder,
+      edav = use_edav
+    ),
+    sirfunctions_io( # spatial subfolder
+      "list",
+      NULL,
+      data_folders_paths$spatial_folder,
+      edav = use_edav
+    ),
+    sirfunctions_io( # coverage subfolder
+      "list",
+      NULL,
+      data_folders_paths$coverage_folder,
+      edav = use_edav
+    ),
+    sirfunctions_io( # pop subfolder
+      "list",
+      NULL,
+      data_folders_paths$pop_folder,
+      edav = use_edav
+    ),
+    sirfunctions_io( # top-level POLIS folder, restricted below to "cache" entries
+      "list",
+      NULL,
+      polis_folder,
+      edav = use_edav
+    ) |>
+      dplyr::filter(grepl("cache", name))
+  ) |>
+    dplyr::filter(!is.na(size), !grepl("afp_linelist_2019", name)) |> # drop entries without a size and the 2019 AFP linelist
+    dplyr::select("file" = "name", "size") # keep two columns, renaming name to file
+
+  return(dl_table)
+}
+
+#' Create the spatial data for processing
+#'
+#' @param data_folder `str` Path to the data folder.
+#' @param use_edav `logical` Use EDAV or not.
+#'
+#' @returns `list` Contains spatial datasets.
+#'
+#' @keywords internal
+check_spatial_data_for_processing <- function(data_folder, use_edav) {
+  spatial_folder <- file.path(data_folder, "spatial")
+  analytic_folder <- file.path(data_folder, "analytic")
+  global_ctry_sf_name <- "global.ctry.rds"
+  global_prov_sf_name <- "global.prov.rds"
+  global_dist_sf_name <- "global.dist.rds"
+  spatial_data <- list()
+
+  # A stored timestamp file decides whether the cached spatial data can be reused
+  spatial_timestamp_exists <- sirfunctions_io(
+    "exists.file",
+    NULL,
+    file.path(analytic_folder, "spatial_timestamp.parquet"),
+    edav = use_edav
+  )
+
+  if (spatial_timestamp_exists) {
+    # Check if it's recent or needs updating
+    edav_spatial_timestamp <- sirfunctions_io(
+      "read",
+      NULL,
+      file.path(analytic_folder, "spatial_timestamp.parquet"),
+      edav = use_edav
+    ) |>
+      dplyr::select(name, lastModifiedEDAV = lastModified)
+    # NOTE(review): confirm `name` has the same form here as in the stored timestamp file.
+    edav_spatial_folder_info <- sirfunctions_io(
+      "list",
+      NULL,
+      spatial_folder,
+      edav = use_edav
+    ) |>
+      dplyr::select(name, lastModified)
+    # Count the files whose stored timestamp still matches the spatial folder
+    spatial_timestamp_comparison <- dplyr::left_join(
+      edav_spatial_timestamp,
+      edav_spatial_folder_info,
+      by = "name"
+    ) |>
+      # An NA comparison (timestamp missing on either side) is not counted.
+      dplyr::mutate(updated = lastModifiedEDAV == lastModified) |>
+      dplyr::pull(updated) |>
+      sum(na.rm = TRUE)
+  } else {
+    spatial_timestamp_comparison <- 0
+  }
+  # NOTE(review): 3 is presumably one match each for the ctry/prov/dist files.
+  if (spatial_timestamp_comparison == 3) {
+    cli::cli_alert_success(
+      "Spatial data in the analytic folder is up to date. Loading from cache..."
+    )
+    spatial_data <- build_parquet_raw_data(
+      analytic_folder,
+      dataset = c("global.ctry", "global.prov", "global.dist"),
+      from_edav = use_edav
+    )
+  } else {
+    if (spatial_timestamp_exists) {
+      cli::cli_alert_warning(
+        "Spatial data in the analytic folder is outdated. Recreating from the spatial folder"
+      )
+    } else {
+      cli::cli_alert_warning(
+        "No spatial timestamp exists. Recreating from the spatial folder"
+      )
+    }
+
+    cli::cli_process_start("1) Loading country shape files")
+    spatial_data$global.ctry <- load_clean_ctry_sp(
+      fp = file.path(spatial_folder, global_ctry_sf_name),
+      edav = use_edav
+    )
+    cli::cli_process_done()
+
+    cli::cli_process_start("2) Loading province shape files")
+    spatial_data$global.prov <- load_clean_prov_sp(
+      fp = file.path(spatial_folder, global_prov_sf_name),
+      edav = use_edav
+    )
+    cli::cli_process_done()
+
+    cli::cli_process_start("3) Loading district shape files")
+    spatial_data$global.dist <- load_clean_dist_sp(
+      fp = file.path(spatial_folder, global_dist_sf_name),
+      edav = use_edav
+    )
+    cli::cli_process_done()
+  }
+
+  return(spatial_data)
+
+}
+
+#' Creates the AFP dataset of raw_data
+#'
+#' @param dl_table `tibble` Output of [list_required_files_for_processing()].
+#' @param use_edav `logical` Whether to use EDAV or not.
+#'
+#' @returns `tibble` AFP dataset.
+#'
+#' @keywords internal
+process_afp_raw_data <- function(dl_table, use_edav) {
+  # Path(s) in the download table whose name contains "afp".
+  afp_file <- dl_table |>
+    dplyr::filter(grepl("afp", file)) |>
+    dplyr::pull(file)
+  afp <- sirfunctions_io("read", NULL, file_loc = afp_file, edav = use_edav) |>
+    # Keep records of surveillance type "AFP" only.
+    dplyr::filter(surveillancetypename == "AFP") |>
+    dplyr::mutate(
+      # "PENDING" cases with culture result "Not received in lab" become "LAB PENDING".
+      cdc.classification.all2 = dplyr::case_when(
+        final.cell.culture.result == "Not received in lab" &
+          cdc.classification.all == "PENDING" ~ "LAB PENDING",
+        TRUE ~ cdc.classification.all
+      ),
+      # 1 when all three paralysis columns are "Yes", 0 otherwise; NA becomes 99 below.
+      hot.case = ifelse(
+        paralysis.asymmetric == "Yes" &
+          paralysis.onset.fever == "Yes" &
+          paralysis.rapid.progress == "Yes",
+        1, 0
+      ),
+      hot.case = ifelse(is.na(hot.case), 99, hot.case)
+    )
+
+  return(afp)
+}
+
+#' Creates afp.epi dataset
+#'
+#' @param afp `tibble` Output of [process_afp_raw_data()].
+#'
+#' @returns `tibble` Summary of AFP cases by year/epi-week per country.
+#'
+#' @keywords internal
+process_afp_epi_raw_data <- function(afp) {
+  # One row per country / epi week / onset year / classification, with the case count
+  afp.epi <- afp |>
+    dplyr::mutate(epi.week = lubridate::epiweek(dateonset)) |>
+    dplyr::group_by(place.admin.0, epi.week, yronset, cdc.classification.all2) |>
+    dplyr::summarize(afp.cases = dplyr::n(),
+                     .groups = "drop") |>
+    dplyr::mutate(epiweek.year = paste(yronset, epi.week, sep = "-")) |> # label of the form "<yronset>-<epi.week>"
+    # manual fix of epi week: week 52 of onset year 2022 is recoded to week 1 (epiweek.year above is left as computed)
+    dplyr::mutate(epi.week = ifelse(epi.week == 52 &
+      yronset == 2022, 1, epi.week))
+
+  # factoring cdc classification to have an order we like in stacked bar chart
+  afp.epi$cdc.classification.all2 <-
+    factor(
+      afp.epi$cdc.classification.all2,
+      levels = c( # the order of this vector is the factor level order
+        "WILD 1",
+        "cVDPV 2",
+        "VDPV 2",
+        "cVDPV 1",
+        "VDPV 1",
+        "COMPATIBLE",
+        "PENDING",
+        "LAB PENDING",
+        "NPAFP",
+        "NOT-AFP",
+        "UNKNOWN",
+        "aVDPV 1",
+        "aVDPV 3",
+        "cVDPV1andcVDPV2",
+        "CombinationWild1-cVDPV 2",
+        "aVDPV 2",
+        "VDPV 3",
+        "iVDPV 2",
+        "VDPV1andcVDPV2",
+        "VAPP",
+        "cVDPV 3",
+        "iVDPV 3",
+        "WILD 3",
+        "WILD1andWILD3",
+        "iVDPV 1",
+        "cVDPV2andcVDPV3"
+      ),
+      labels = c( # same values as levels, so the displayed text is unchanged
+        "WILD 1",
+        "cVDPV 2",
+        "VDPV 2",
+        "cVDPV 1",
+        "VDPV 1",
+        "COMPATIBLE",
+        "PENDING",
+        "LAB PENDING",
+        "NPAFP",
+        "NOT-AFP",
+        "UNKNOWN",
+        "aVDPV 1",
+        "aVDPV 3",
+        "cVDPV1andcVDPV2",
+        "CombinationWild1-cVDPV 2",
+        "aVDPV 2",
+        "VDPV 3",
+        "iVDPV 2",
+        "VDPV1andcVDPV2",
+        "VAPP",
+        "cVDPV 3",
+        "iVDPV 3",
+        "WILD 3",
+        "WILD1andWILD3",
+        "iVDPV 1",
+        "cVDPV2andcVDPV3"
+      )
+    )
+  # Classifications not listed in levels become NA in the factor.
+  return(afp.epi)
+}
+
+#' Creates paralytics cases dataset
+#'
+#' @inheritParams process_afp_epi_raw_data
+#'
+#' @returns `tibble` Dataset with paralytic cases only.
+#'
+#' @keywords internal
+process_paralytic_raw_data <- function(afp) {
+  para.case <- afp |>
+    dplyr::filter( # keep classifications containing VDPV, WILD or COMPATIBLE
+      stringr::str_detect(cdc.classification.all2, "VDPV|WILD|COMPATIBLE")
+    ) |>
+    dplyr::mutate(yronset = ifelse(is.na(yronset), 2022, yronset)) # missing onset year is set to 2022; this fix was for the manually added MOZ case
+
+  return(para.case)
+}
+
+#' Pull data listed in the download table
+#'
+#' @param dl_table `tibble` Output of [list_required_files_for_processing()].
+#' @param grepl_pattern `str` Pattern to use to filter the `dl_table`.
+#' @param use_edav `logical` Whether to use EDAV or not.
+#'
+#' @returns `tibble` One of the datasets listed in `dl_table`.
+#'
+#' @keywords internal
+pull_data_from_dl_table <- function(dl_table, grepl_pattern, use_edav) {
+  # Paths in the download table whose name matches the requested pattern.
+  matching_files <- dl_table |>
+    dplyr::filter(grepl(grepl_pattern, file)) |>
+    dplyr::pull(file)
+
+  # Read the match and drop any grouping the stored object carries.
+  sirfunctions_io(
+    "read", NULL,
+    file_loc = matching_files, edav = use_edav
+  ) |>
+    dplyr::ungroup()
+}
+
+#' Creates metadata tag
+#'
+#' @param dl_table `tibble` Output of [list_required_files_for_processing()].
+#' @param raw_data `list` Processed data combining all polio data.
+#' @param polis_folder `str` Path to POLIS folder.
+#' @param core_ready_folder `str` Name of the core ready folder.
+#' @param use_edav `logical` Whether to use EDAV or not.
+#'
+#' @returns `tibble` Metadata tibble.
+#'
+#' @keywords internal
+process_metadata_raw_data <- function(dl_table, raw_data, polis_folder, core_ready_folder, use_edav) {
+  metadata <- list() # filled in below as a named list
+  polis.cache <- sirfunctions_io("read", NULL,
+    file_loc = dplyr::filter(dl_table, grepl("cache.rds", file)) |>
+      dplyr::pull(file), edav = use_edav
+  ) |>
+    dplyr::mutate(last_sync = as.Date(last_sync))
+  # Most recent sync date recorded in the POLIS cache table
+  metadata$download_time <- max(polis.cache$last_sync, na.rm = TRUE)
+  # Modification date of the "positives_2001-01-01" entry in the core ready folder
+  metadata$processed_time <- sirfunctions_io("list", NULL,
+    file.path(polis_folder, "data", core_ready_folder),
+    edav = use_edav
+  ) |>
+    dplyr::filter(grepl("positives_2001-01-01", name)) |>
+    dplyr::select("ctime" = "lastModified") |>
+    dplyr::mutate(ctime = as.Date(ctime)) |>
+    dplyr::pull(ctime)
+  # User recorded against the "virus" table in the POLIS cache
+  metadata$user <- polis.cache |>
+    dplyr::filter(table == "virus") |>
+    dplyr::pull(last_user)
+  # Latest onset date among positives and the country of that record
+  metadata$most_recent_pos <- max(raw_data$pos$dateonset, na.rm = TRUE)
+  metadata$most_recent_pos_loc <- raw_data$pos |>
+    dplyr::arrange(dplyr::desc(dateonset)) |>
+    dplyr::slice(1) |>
+    dplyr::pull(place.admin.0)
+
+  # Latest onset date among AFP cases and the country of that record
+  metadata$most_recent_afp <- max(raw_data$afp$dateonset, na.rm = TRUE)
+  metadata$most_recent_afp_loc <- raw_data$afp |>
+    dplyr::arrange(dplyr::desc(dateonset)) |>
+    dplyr::slice(1) |>
+    dplyr::pull(place.admin.0)
+
+  # Latest environmental sample collection date and its country
+  metadata$most_recent_env <- max(raw_data$es$collect.date, na.rm = TRUE)
+  metadata$most_recent_env_loc <- raw_data$es |>
+    dplyr::arrange(dplyr::desc(collect.date)) |>
+    dplyr::slice(1) |>
+    dplyr::pull(ADM0_NAME)
+
+  # Latest SIA start date (missing dates ignored, like the other maxima) with its code, country and vaccine
+  metadata$most_recent_sia <- max(raw_data$sia$sub.activity.start.date, na.rm = TRUE)
+  metadata$most_recent_sia_code <- raw_data$sia |>
+    dplyr::arrange(dplyr::desc(sub.activity.start.date)) |>
+    dplyr::slice(1) |>
+    dplyr::pull(sia.code)
+  metadata$most_recent_sia_location <- raw_data$sia |>
+    dplyr::arrange(dplyr::desc(sub.activity.start.date)) |>
+    dplyr::slice(1) |>
+    dplyr::pull(place.admin.0)
+  metadata$most_recent_sia_vax <- raw_data$sia |>
+    dplyr::arrange(dplyr::desc(sub.activity.start.date)) |>
+    dplyr::slice(1) |>
+    dplyr::pull(vaccine.type)
+  # Latest VDPV classification change date among positives
+  metadata$most_recent_vdpv_class_change_date <- raw_data$pos$vdpvclassificationchangedate |>
+    lubridate::as_date() |>
+    max(na.rm = TRUE)
+
+  return(metadata)
+
+}
+
+#' Cache the raw data
+#'
+#' @param raw_data `list` Processed list of all polio data.
+#' @param analytic_folder_path `str` Path to analytic folder.
+#' @param use_edav `logical` Whether to use EDAV or not.
+#'
+#' @returns `NULL`, invisibly.
+#'
+#' @keywords internal
+cache_raw_data <- function(raw_data, analytic_folder_path, use_edav) {
+  if (!use_edav) {
+    # Local run: write the parquet folder straight into the analytic folder.
+    create_raw_data_parquet(raw_data, analytic_folder_path)
+    return(invisible())
+  }
+
+  # EDAV run: build the parquet folder in a temporary working directory, then pass it to the uploader.
+  withr::with_tempdir({
+    staging_dir <- getwd()
+    create_raw_data_parquet(raw_data, staging_dir)
+    upload_parquet_to_edav(staging_dir, analytic_folder_path, get_azure_storage_connection())
+  })
+  invisible()
+}
+
+#' Create timestamps for raw data and spatial data
+#'
+#' @param data_folders_paths `list` Output of [check_data_folder()].
+#' @param use_edav `logical` Whether to use EDAV or not.
+#'
+#' @returns `NULL`, invisibly.
+#'
+#' @keywords internal
+create_raw_data_tags <- function(data_folders_paths, use_edav) {
+
+  # Tags are only written when working against EDAV
+  if (use_edav) {
+    # Create raw data file tag for future comparisons
+    sirfunctions_io("write", NULL,
+                    file_loc = file.path(data_folders_paths$analytic_folder, "raw_data_timestamp.parquet"),
+                    obj = Sys.time(), edav = use_edav)
+    # NOTE(review): obj is a bare date-time, not a data frame; confirm the parquet writer accepts it.
+    # Create spatial data file tag for future comparisons
+    spatial_files <- sirfunctions_io("list",
+                                     NULL,
+                                     data_folders_paths$spatial_folder,
+                                     edav = use_edav,
+                                     full_names = TRUE)
+    # Keep the three shape files that check_spatial_data_for_processing() loads
+    edav_spatial_timestamp <- spatial_files |>
+      dplyr::filter(basename(name) %in%
+                      c("global.ctry.rds", "global.prov.rds", "global.dist.rds")) |>
+      dplyr::select(name, lastModified)
+
+    sirfunctions_io(
+      "write",
+      NULL,
+      file.path(data_folders_paths$analytic_folder, "spatial_timestamp.parquet"), # the file name check_spatial_data_for_processing() reads
+      obj = edav_spatial_timestamp,
+      edav = use_edav
+    )
+  }
+  invisible()
+}
+
+#' Reprocess the global polio dataset
+#'
+#' @param data_folder `str` Path to the data folder.
+#' @param polis_folder `str` Path to the POLIS folder.
+#' @param core_ready_folder `str` Name of the core ready folder.
+#' @param use_edav `logical` Whether to use EDAV or not.
+#' @param cache `logical` Whether to cache the preprocessed data to the data/polis subfolder.
+#'
+#' @returns `list` Processed raw data.
+#'
+#' @keywords internal
+reprocess_polio_data <- function(data_folder, polis_folder, core_ready_folder, use_edav, cache) {
+  raw_data <- list() # must exist before the raw_data$... assignments below
+  # NOTE: we will need to add mechanism for retrieving and loading archived parquet folders
+  data_folders_paths <- check_data_folder(data_folder, polis_folder, core_ready_folder, use_edav, cache)
+
+  # List files required for processing
+  dl_table <- list_required_files_for_processing(data_folders_paths, polis_folder, use_edav)
+
+  # Obtain spatial data information
+  spatial_data <- check_spatial_data_for_processing(data_folder, use_edav)
+
+  # Process raw.data: each pattern below is matched against the file paths in dl_table
+  raw_data$afp <- process_afp_raw_data(dl_table, use_edav)
+  raw_data$afp.epi <- process_afp_epi_raw_data(raw_data$afp)
+  raw_data$para.case <- process_paralytic_raw_data(raw_data$afp)
+  raw_data$ctry.pop <- pull_data_from_dl_table(dl_table, "ctry.pop", use_edav)
+  raw_data$prov.pop <- pull_data_from_dl_table(dl_table, "prov.pop", use_edav)
+  raw_data$dist.pop <- pull_data_from_dl_table(dl_table, "dist.pop", use_edav)
+  raw_data$ctry.coverage <- pull_data_from_dl_table(dl_table, "ctry_cov", use_edav)
+  raw_data$prov.coverage <- pull_data_from_dl_table(dl_table, "prov_cov", use_edav)
+  raw_data$dist.coverage <- pull_data_from_dl_table(dl_table, "dist_cov", use_edav)
+  raw_data$es <- pull_data_from_dl_table(dl_table, "/es_2001", use_edav)
+  raw_data$sia <- pull_data_from_dl_table(dl_table, "sia", use_edav)
+  raw_data$pos <- pull_data_from_dl_table(dl_table, "/pos", use_edav)
+  raw_data$other <- pull_data_from_dl_table(dl_table, "/other", use_edav)
+
+  # Add spatial data to raw_data
+  raw_data$global.ctry <- spatial_data$global.ctry
+  raw_data$global.prov <- spatial_data$global.prov
+  raw_data$global.dist <- spatial_data$global.dist
+  raw_data$roads <- pull_data_from_dl_table(dl_table, "roads.rds", use_edav)
+  raw_data$cities <- pull_data_from_dl_table(dl_table, "cities.rds", use_edav)
+
+  # Create metadata
+  raw_data$metadata <- process_metadata_raw_data(dl_table, raw_data, polis_folder, core_ready_folder, use_edav)
+
+  # Check for duplicates
+  raw_data <- duplicate_check(raw_data)
+
+  # Cache processed data only if we aren't using the archived version
+  cache_raw_data(raw_data,data_folders_paths$analytic_folder, use_edav)
+
+  browser()
+
+  # Create data tags only if we aren't using the archived version
+  create_raw_data_tags(data_folders_paths, use_edav)
+
+  return(raw_data)
+
+}
+
+# Main function
+
+#' Pull global polio dataset
+#'
+#' @param dataset `str` Name of the dataset. Defaults to 'all'.
+#' @param data_folder `str` Path to data folder.
+#' @param polis_folder `str` Path to the POLIS folder.
+#' @param core_ready_folder `str` Name of the core ready folder.
+#' @param recreate.static.files `logical` Whether to reprocess global polio data.
+#' @param use_edav `logical` Whether to use EDAV or not.
+#' @param azcontainer `azcontainer` Azure container object.
+#' @param cache `logical` Whether to cache the preprocessed datasets in the `data/polis` folder.
+#'
+#' @returns `list` Global polio datasets.
+#'
+#' @export
+#' @examples
+#' \dontrun{
+#' raw_data <- get_all_polio_data_2()
+#' }
+get_all_polio_data_2 <- function(dataset = "all",
+                                 data_folder = "GID/PEB/SIR/Data",
+                                 polis_folder = "GID/PEB/SIR/POLIS",
+                                 core_ready_folder = "Core_Ready_Files",
+                                 recreate.static.files = FALSE,
+                                 use_edav = TRUE,
+                                 azcontainer = get_azure_storage_connection(),
+                                 cache = TRUE) {
+  # Either rebuild everything from the source folders, or load the requested
+  # dataset(s) from the parquet files in the analytic folder.
+  raw_data <- if (recreate.static.files) {
+    reprocess_polio_data(data_folder, polis_folder, core_ready_folder, use_edav, cache)
+  } else {
+    build_parquet_raw_data(file.path(data_folder, "analytic"), dataset, use_edav, azcontainer)
+  }
+
+  raw_data
+}
+
diff --git a/man/cache_raw_data.Rd b/man/cache_raw_data.Rd
new file mode 100644
index 00000000..3bde69c9
--- /dev/null
+++ b/man/cache_raw_data.Rd
@@ -0,0 +1,22 @@
+% Generated by roxygen2: do not edit by hand
+% Please edit documentation in R/get_all_polio_data_2.R
+\name{cache_raw_data}
+\alias{cache_raw_data}
+\title{Cache the raw data}
+\usage{
+cache_raw_data(raw_data, analytic_folder_path, use_edav)
+}
+\arguments{
+\item{raw_data}{\code{list} Processed list of all polio data.}
+
+\item{analytic_folder_path}{\code{str} Path to analytic folder.}
+
+\item{use_edav}{\code{logical} Whether to use EDAV or not.}
+}
+\value{
+\code{NULL}, invisibly.
+}
+\description{
+Cache the raw data
+}
+\keyword{internal}
diff --git a/man/check_data_folder.Rd b/man/check_data_folder.Rd
new file mode 100644
index 00000000..6e50559e
--- /dev/null
+++ b/man/check_data_folder.Rd
@@ -0,0 +1,32 @@
+% Generated by roxygen2: do not edit by hand
+% Please edit documentation in R/get_all_polio_data_2.R
+\name{check_data_folder}
+\alias{check_data_folder}
+\title{Checks for required subfolders in the data folder}
+\usage{
+check_data_folder(
+  data_folder,
+  polis_folder,
+  core_ready_folder,
+  use_edav,
+  cache
+)
+}
+\arguments{
+\item{data_folder}{\code{str} Path to the data folder.}
+
+\item{polis_folder}{\code{str} POLIS folder with preprocessed data.}
+
+\item{core_ready_folder}{\code{str} Name of the core ready folder. Need to be specified if preprocessing specific regions, which have their own core ready folder.}
+
+\item{use_edav}{\code{logical} Whether to use EDAV or not.}
+
+\item{cache}{\code{logical} Whether to cache the preprocessed data to data/polis subfolder.}
+}
+\value{
+\code{list} List of paths to the specific subfolders.
+}
+\description{
+Checks for required subfolders in the data folder
+}
+\keyword{internal}
diff --git a/man/check_spatial_data_for_processing.Rd b/man/check_spatial_data_for_processing.Rd
new file mode 100644
index 00000000..b0bc6998
--- /dev/null
+++ b/man/check_spatial_data_for_processing.Rd
@@ -0,0 +1,20 @@
+% Generated by roxygen2: do not edit by hand
+% Please edit documentation in R/get_all_polio_data_2.R
+\name{check_spatial_data_for_processing}
+\alias{check_spatial_data_for_processing}
+\title{Create the spatial data for processing}
+\usage{
+check_spatial_data_for_processing(data_folder, use_edav)
+}
+\arguments{
+\item{data_folder}{\code{str} Path to the data folder.}
+
+\item{use_edav}{\code{logical} Use EDAV or not.}
+}
+\value{
+\code{list} Contains spatial datasets.
+}
+\description{
+Create the spatial data for processing
+}
+\keyword{internal}
diff --git a/man/create_raw_data_tags.Rd b/man/create_raw_data_tags.Rd
new file mode 100644
index 00000000..33f205f8
--- /dev/null
+++ b/man/create_raw_data_tags.Rd
@@ -0,0 +1,20 @@
+% Generated by roxygen2: do not edit by hand
+% Please edit documentation in R/get_all_polio_data_2.R
+\name{create_raw_data_tags}
+\alias{create_raw_data_tags}
+\title{Create timestamps for raw data and spatial data}
+\usage{
+create_raw_data_tags(data_folders_paths, use_edav)
+}
+\arguments{
+\item{data_folders_paths}{\code{list} Output of \code{\link[=check_data_folder]{check_data_folder()}}.}
+
+\item{use_edav}{\code{logical} Whether to use EDAV or not.}
+}
+\value{
+\code{NULL}, invisibly.
+}
+\description{
+Create timestamps for raw data and spatial data
+}
+\keyword{internal}
diff --git a/man/get_all_polio_data_2.Rd b/man/get_all_polio_data_2.Rd
new file mode 100644
index 00000000..0980df03
--- /dev/null
+++ b/man/get_all_polio_data_2.Rd
@@ -0,0 +1,45 @@
+% Generated by roxygen2: do not edit by hand
+% Please edit documentation in R/get_all_polio_data_2.R
+\name{get_all_polio_data_2}
+\alias{get_all_polio_data_2}
+\title{Pull global polio dataset}
+\usage{
+get_all_polio_data_2(
+  dataset = "all",
+  data_folder = "GID/PEB/SIR/Data",
+  polis_folder = "GID/PEB/SIR/POLIS",
+  core_ready_folder = "Core_Ready_Files",
+  recreate.static.files = FALSE,
+  use_edav = TRUE,
+  azcontainer = get_azure_storage_connection(),
+  cache = TRUE
+)
+}
+\arguments{
+\item{dataset}{\code{str} Name of the dataset. Defaults to 'all'.}
+
+\item{data_folder}{\code{str} Path to data folder.}
+
+\item{polis_folder}{\code{str} Path to the POLIS folder.}
+
+\item{core_ready_folder}{\code{str} Name of the core ready folder.}
+
+\item{recreate.static.files}{\code{logical} Whether to reprocess global polio data.}
+
+\item{use_edav}{\code{logical} Whether to use EDAV or not.}
+
+\item{azcontainer}{\code{azcontainer} Azure container object.}
+
+\item{cache}{\code{logical} Whether to cache the preprocessed datasets in the \code{data/polis} folder.}
+}
+\value{
+\code{list} Global polio datasets.
+}
+\description{
+Pull global polio dataset
+}
+\examples{
+\dontrun{
+raw_data <- get_all_polio_data_2()
+}
+}
diff --git a/man/list_required_files_for_processing.Rd b/man/list_required_files_for_processing.Rd
new file mode 100644
index 00000000..9a06e5e3
--- /dev/null
+++ b/man/list_required_files_for_processing.Rd
@@ -0,0 +1,22 @@
+% Generated by roxygen2: do not edit by hand
+% Please edit documentation in R/get_all_polio_data_2.R
+\name{list_required_files_for_processing}
+\alias{list_required_files_for_processing}
+\title{Creates the "download table", with paths to files required for recreating static files}
+\usage{
+list_required_files_for_processing(data_folders_paths, polis_folder, use_edav)
+}
+\arguments{
+\item{data_folders_paths}{\code{list} Output of \code{\link[=check_data_folder]{check_data_folder()}}.}
+
+\item{polis_folder}{\code{str} POLIS folder containing preprocessed data. NOT the subfolder under the data folder.}
+
+\item{use_edav}{\code{logical} Whether to use EDAV or not.}
+}
+\value{
+\code{tibble} Dataset containing paths to required files.
+}
+\description{
+Creates the "download table", with paths to files required for recreating static files
+}
+\keyword{internal}
diff --git a/man/process_afp_epi_raw_data.Rd b/man/process_afp_epi_raw_data.Rd
new file mode 100644
index 00000000..8bba2c7b
--- /dev/null
+++ b/man/process_afp_epi_raw_data.Rd
@@ -0,0 +1,18 @@
+% Generated by roxygen2: do not edit by hand
+% Please edit documentation in R/get_all_polio_data_2.R
+\name{process_afp_epi_raw_data}
+\alias{process_afp_epi_raw_data}
+\title{Creates afp.epi dataset}
+\usage{
+process_afp_epi_raw_data(afp)
+}
+\arguments{
+\item{afp}{\code{tibble} Output of \code{\link[=process_afp_raw_data]{process_afp_raw_data()}}.}
+}
+\value{
+\code{tibble} Summary of AFP cases by year/epi-week per country.
+}
+\description{
+Creates afp.epi dataset
+}
+\keyword{internal}
diff --git a/man/process_afp_raw_data.Rd b/man/process_afp_raw_data.Rd
new file mode 100644
index 00000000..502e8796
--- /dev/null
+++ b/man/process_afp_raw_data.Rd
@@ -0,0 +1,20 @@
+% Generated by roxygen2: do not edit by hand
+% Please edit documentation in R/get_all_polio_data_2.R
+\name{process_afp_raw_data}
+\alias{process_afp_raw_data}
+\title{Creates the AFP dataset of raw_data}
+\usage{
+process_afp_raw_data(dl_table, use_edav)
+}
+\arguments{
+\item{dl_table}{\code{tibble} Output of \code{\link[=list_required_files_for_processing]{list_required_files_for_processing()}}.}
+
+\item{use_edav}{\code{logical} Whether to use EDAV or not.}
+}
+\value{
+\code{tibble} AFP dataset.
+}
+\description{
+Creates the AFP dataset of raw_data
+}
+\keyword{internal}
diff --git a/man/process_metadata_raw_data.Rd b/man/process_metadata_raw_data.Rd
new file mode 100644
index 00000000..ebf919fa
--- /dev/null
+++ b/man/process_metadata_raw_data.Rd
@@ -0,0 +1,32 @@
+% Generated by roxygen2: do not edit by hand
+% Please edit documentation in R/get_all_polio_data_2.R
+\name{process_metadata_raw_data}
+\alias{process_metadata_raw_data}
+\title{Creates metadata tag}
+\usage{
+process_metadata_raw_data(
+  dl_table,
+  raw_data,
+  polis_folder,
+  core_ready_folder,
+  use_edav
+)
+}
+\arguments{
+\item{dl_table}{\code{tibble} Output of \code{\link[=list_required_files_for_processing]{list_required_files_for_processing()}}.}
+
+\item{raw_data}{\code{list} Processed data combining all polio data.}
+
+\item{polis_folder}{\code{str} Path to POLIS folder.}
+
+\item{core_ready_folder}{\code{str} Name of the core ready folder.}
+
+\item{use_edav}{\code{logical} Whether to use EDAV or not.}
+}
+\value{
+\code{tibble} Metadata tibble.
+}
+\description{
+Creates metadata tag
+}
+\keyword{internal}
diff --git a/man/process_paralytic_raw_data.Rd b/man/process_paralytic_raw_data.Rd
new file mode 100644
index 00000000..b9c29de6
--- /dev/null
+++ b/man/process_paralytic_raw_data.Rd
@@ -0,0 +1,18 @@
+% Generated by roxygen2: do not edit by hand
+% Please edit documentation in R/get_all_polio_data_2.R
+\name{process_paralytic_raw_data}
+\alias{process_paralytic_raw_data}
+\title{Creates paralytics cases dataset}
+\usage{
+process_paralytic_raw_data(afp)
+}
+\arguments{
+\item{afp}{\code{tibble} Output of \code{\link[=process_afp_raw_data]{process_afp_raw_data()}}.}
+}
+\value{
+\code{tibble} Dataset with paralytic cases only.
+}
+\description{
+Creates paralytics cases dataset
+}
+\keyword{internal}
diff --git a/man/pull_data_from_dl_table.Rd b/man/pull_data_from_dl_table.Rd
new file mode 100644
index 00000000..be0d0378
--- /dev/null
+++ b/man/pull_data_from_dl_table.Rd
@@ -0,0 +1,22 @@
+% Generated by roxygen2: do not edit by hand
+% Please edit documentation in R/get_all_polio_data_2.R
+\name{pull_data_from_dl_table}
+\alias{pull_data_from_dl_table}
+\title{Pull data listed in the download table}
+\usage{
+pull_data_from_dl_table(dl_table, grepl_pattern, use_edav)
+}
+\arguments{
+\item{dl_table}{\code{tibble} Output of \code{\link[=list_required_files_for_processing]{list_required_files_for_processing()}}.}
+
+\item{grepl_pattern}{\code{str} Pattern to use to filter the \code{dl_table}.}
+
+\item{use_edav}{\code{logical} Whether to use EDAV or not.}
+}
+\value{
+\code{tibble} One of the datasets listed in \code{dl_table}.
+}
+\description{
+Pull data listed in the download table
+}
+\keyword{internal}
diff --git a/man/reprocess_polio_data.Rd b/man/reprocess_polio_data.Rd
new file mode 100644
index 00000000..54564147
--- /dev/null
+++ b/man/reprocess_polio_data.Rd
@@ -0,0 +1,32 @@
+% Generated by roxygen2: do not edit by hand
+% Please edit documentation in R/get_all_polio_data_2.R
+\name{reprocess_polio_data}
+\alias{reprocess_polio_data}
+\title{Reprocess the global polio dataset}
+\usage{
+reprocess_polio_data(
+  data_folder,
+  polis_folder,
+  core_ready_folder,
+  use_edav,
+  cache
+)
+}
+\arguments{
+\item{data_folder}{\code{str} Path to the data folder.}
+
+\item{polis_folder}{\code{str} Path to the POLIS folder.}
+
+\item{core_ready_folder}{\code{str} Name of the core ready folder.}
+
+\item{use_edav}{\code{logical} Whether to use EDAV or not.}
+
+\item{cache}{\code{logical} Whether to cache the preprocessed data to the data/polis subfolder.}
+}
+\value{
+\code{list} Processed raw data.
+}
+\description{
+Reprocess the global polio dataset
+}
+\keyword{internal}

From 40d46e63cc2a1c9ae57145dba8a1e748f3404d19 Mon Sep 17 00:00:00 2001
From: Mervin Keith Cuadera <40894971+mcuadera@users.noreply.github.com>
Date: Wed, 1 Apr 2026 11:54:14 -0400
Subject: [PATCH 24/28] remove leftover browser() call from reprocess_polio_data

---
 R/get_all_polio_data_2.R | 2 --
 1 file changed, 2 deletions(-)

diff --git a/R/get_all_polio_data_2.R b/R/get_all_polio_data_2.R
index 3a0b939a..ff7f695e 100644
--- a/R/get_all_polio_data_2.R
+++ b/R/get_all_polio_data_2.R
@@ -590,8 +590,6 @@ reprocess_polio_data <- function(data_folder, polis_folder, core_ready_folder, u
   # Cache processed data only if we aren't using the archived version
   cache_raw_data(raw_data,data_folders_paths$analytic_folder, use_edav)
 
-  browser()
-
   # Create data tags only if we aren't using the archived version
   create_raw_data_tags(data_folders_paths, use_edav)
 

From 633b31e8be28b9561547a6fe7a41dfa9ac542c35 Mon Sep 17 00:00:00 2001
From: Mervin Keith Cuadera <40894971+mcuadera@users.noreply.github.com>
Date: Wed, 1 Apr 2026 11:57:31 -0400
Subject: [PATCH 25/28] initialize raw_data list in reprocess_polio_data

---
 R/get_all_polio_data_2.R | 1 +
 1 file changed, 1 insertion(+)

diff --git a/R/get_all_polio_data_2.R b/R/get_all_polio_data_2.R
index ff7f695e..4b54b0c9 100644
--- a/R/get_all_polio_data_2.R
+++ b/R/get_all_polio_data_2.R
@@ -550,6 +550,7 @@ create_raw_data_tags <- function(data_folders_paths, use_edav) {
 #' @keywords internal
 reprocess_polio_data <- function(data_folder, polis_folder, core_ready_folder, use_edav, cache) {
 
+  raw_data <- list()
   # NOTE: we will need to add mechanism for retrieving and loading archived parquet folders
   data_folders_paths <- check_data_folder(data_folder, polis_folder, core_ready_folder, use_edav, cache)
 

From ae3163f8642a9d3506f3c4ecd5c4473a4c2c31fc Mon Sep 17 00:00:00 2001
From: Mervin Keith Cuadera <40894971+mcuadera@users.noreply.github.com>
Date: Wed, 1 Apr 2026 12:10:10 -0400
Subject: [PATCH 26/28] fix raw data parquet folder name and save timestamp tags as .rds

---
 R/dal.parquet.R          | 2 +-
 R/get_all_polio_data_2.R | 4 ++--
 2 files changed, 3 insertions(+), 3 deletions(-)

diff --git a/R/dal.parquet.R b/R/dal.parquet.R
index f9078808..8709f434 100644
--- a/R/dal.parquet.R
+++ b/R/dal.parquet.R
@@ -142,7 +142,7 @@ upload_parquet_to_edav <- function(src, dest, container = get_azure_storage_conn
   AzureStor::multiupload_adls_file(
     container,
     paste0(src, "/*"),
-    file.path(dest, basename(src)),
+    file.path(dest, "raw_data_parquet"),
     recursive = TRUE
   )
   cli::cli_process_done()
diff --git a/R/get_all_polio_data_2.R b/R/get_all_polio_data_2.R
index 4b54b0c9..e767b181 100644
--- a/R/get_all_polio_data_2.R
+++ b/R/get_all_polio_data_2.R
@@ -511,7 +511,7 @@ create_raw_data_tags <- function(data_folders_paths, use_edav) {
   if (use_edav) {
     # Create raw data file tag for future comparisons
     sirfunctions_io("write", NULL,
-                    file_loc = file.path(data_folders_paths$analytic_folder, paste0("raw_data_timestamp.parquet")),
+                    file_loc = file.path(data_folders_paths$analytic_folder, paste0("raw_data_timestamp.rds")),
                     obj = Sys.time())
 
     # Create spatial data file tag for future comparisons
@@ -529,7 +529,7 @@ create_raw_data_tags <- function(data_folders_paths, use_edav) {
     sirfunctions_io(
       "write",
       NULL,
-      file.path(data_folder_paths$analytic_folder, paste0("spatial_timestamp", output_format)),
+      file.path(data_folder_paths$analytic_folder, paste0("spatial_timestamp.rds")),
       obj = edav_spatial_timestamp,
       edav = use_edav
     )

From c0cf0bff0611994565318fa8f622511146df1e55 Mon Sep 17 00:00:00 2001
From: Mervin Keith Cuadera <40894971+mcuadera@users.noreply.github.com>
Date: Wed, 1 Apr 2026 12:15:45 -0400
Subject: [PATCH 27/28] filter spatial files by "parquet" suffix instead of output_format

---
 R/get_all_polio_data_2.R | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/R/get_all_polio_data_2.R b/R/get_all_polio_data_2.R
index e767b181..646962d0 100644
--- a/R/get_all_polio_data_2.R
+++ b/R/get_all_polio_data_2.R
@@ -523,7 +523,7 @@ create_raw_data_tags <- function(data_folders_paths, use_edav) {
 
     edav_spatial_timestamp <- spatial_files |>
       dplyr::filter(stringr::str_detect(name, "global."),
-                    stringr::str_ends(name, output_format)) |>
+                    stringr::str_ends(name, "parquet")) |>
       dplyr::select(name, lastModified)
 
     sirfunctions_io(

From 7df83afcb2ac5777c5b48e4ad912c455baa5435f Mon Sep 17 00:00:00 2001
From: Mervin Keith Cuadera <40894971+mcuadera@users.noreply.github.com>
Date: Wed, 1 Apr 2026 12:20:49 -0400
Subject: [PATCH 28/28] fix data_folders_paths typo in create_raw_data_tags

---
 R/get_all_polio_data_2.R | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/R/get_all_polio_data_2.R b/R/get_all_polio_data_2.R
index 646962d0..6b27a8f3 100644
--- a/R/get_all_polio_data_2.R
+++ b/R/get_all_polio_data_2.R
@@ -529,7 +529,7 @@ create_raw_data_tags <- function(data_folders_paths, use_edav) {
     sirfunctions_io(
       "write",
       NULL,
-      file.path(data_folder_paths$analytic_folder, paste0("spatial_timestamp.rds")),
+      file.path(data_folders_paths$analytic_folder, paste0("spatial_timestamp.rds")),
       obj = edav_spatial_timestamp,
       edav = use_edav
     )