From 138441e1471aabac25f200a8b47e3adc82a225ad Mon Sep 17 00:00:00 2001 From: mcuadera Date: Thu, 13 Mar 2025 10:53:58 -0400 Subject: [PATCH 01/28] create create_raw_data_parquet fx --- NAMESPACE | 1 + R/dal.parquet.R | 62 ++++++++++++++++++++++++++++++++++ man/create_raw_data_parquet.Rd | 26 ++++++++++++++ man/get_partition_cols.Rd | 23 +++++++++++++ 4 files changed, 112 insertions(+) create mode 100644 R/dal.parquet.R create mode 100644 man/create_raw_data_parquet.Rd create mode 100644 man/get_partition_cols.Rd diff --git a/NAMESPACE b/NAMESPACE index 79a0de4e..58fb1a2e 100644 --- a/NAMESPACE +++ b/NAMESPACE @@ -15,6 +15,7 @@ export(create_emergence_group_gif) export(create_npafp_export) export(create_pop_check_export) export(create_pot_comp_clust_export) +export(create_raw_data_parquet) export(create_stool_adequacy_export) export(ctry_data_errors) export(duplicate_check) diff --git a/R/dal.parquet.R b/R/dal.parquet.R new file mode 100644 index 00000000..fdf89960 --- /dev/null +++ b/R/dal.parquet.R @@ -0,0 +1,62 @@ +#' Convert raw data into a parquet hierarchal folder +#' +#' The function takes a `raw_data` object (output of [get_all_polio_data()]) and +#' saves it into a parquet directory +#' @param raw_data `list` A `raw_data` object. +#' @param path `str` Path to export the parquet folder to. +#' +#' @returns None. +#' @export +#' +#' @examples +#' \dontrun{ +#' raw_data <- get_all_polio_data() +#' create_raw_data_parquet(raw_data, "C:/Users/ABC1/Desktop/raw_data_parquet") +#' } +create_raw_data_parquet <- function(raw_data, path){ + df_names <- names(raw_data) + + for (i in df_names) { + switch() + } + +} + +# Private functions ---- + +#' Gets the column used to partition a column +#' +#' @param name `str` Name of the column +#' +#' @return `chr` A character vector of columns to partition with. 
+#' @keywords internal +#' +#' @examples +#' \dontrun{ +#' get_partition_cols("afp") +#' } +get_partition_cols <- function(name) { + switch(name, + "afp" = c("place.admin.0", "yronset"), + "afp.dupe" = c("place.admin.0", "yronset"), + "afp.epi" = c("place.admin.0", "yronset"), + "para.case" = c("place.admin.0", "yronset"), + "es" = c("ADM0_NAME", "collect.yr"), + "es.dupe" = c("ADM0_NAME", "collect.yr"), + "sia" = c("place.admin.0", "yr.sia"), + "sia.dupe" = c("place.admin.0", "yr.sia"), + "pos" = c("place.admin.0", "yronset"), + "pos.dupe" = c("place.admin.0", "yronset"), + "other" = c("place.admin.0", "yronset"), + "other.dupe" = c("place.admin.0", "yronset"), + "dist.pop" = c("ADM0_NAME", "year"), + "prov.pop" = c("ADM0_NAME", "year"), + "ctry.pop" = c("ADM0_NAME", "year"), + "global.ctry" = c("ADM0_NAME"), + "global.prov" = c("ADM0_NAME"), + "global.dist" = c("ADM0_NAME"), + "roads" = c("continent"), + "cities" = c("CTRY_NAME"), + "metadata" = "download_time" + ) +} diff --git a/man/create_raw_data_parquet.Rd b/man/create_raw_data_parquet.Rd new file mode 100644 index 00000000..e65e2f63 --- /dev/null +++ b/man/create_raw_data_parquet.Rd @@ -0,0 +1,26 @@ +% Generated by roxygen2: do not edit by hand +% Please edit documentation in R/dal.parquet.R +\name{create_raw_data_parquet} +\alias{create_raw_data_parquet} +\title{Convert raw data into a parquet hierarchal folder} +\usage{ +create_raw_data_parquet(raw_data, path) +} +\arguments{ +\item{raw_data}{\code{list} A \code{raw_data} object.} + +\item{path}{\code{str} Path to export the parquet folder to.} +} +\value{ +None. 
+} +\description{ +The function takes a \code{raw_data} object (output of \code{\link[=get_all_polio_data]{get_all_polio_data()}}) and +saves it into a parquet directory +} +\examples{ +\dontrun{ +raw_data <- get_all_polio_data() +create_raw_data_parquet(raw_data, "C:/Users/ABC1/Desktop/raw_data_parquet") +} +} diff --git a/man/get_partition_cols.Rd b/man/get_partition_cols.Rd new file mode 100644 index 00000000..cad472b2 --- /dev/null +++ b/man/get_partition_cols.Rd @@ -0,0 +1,23 @@ +% Generated by roxygen2: do not edit by hand +% Please edit documentation in R/dal.parquet.R +\name{get_partition_cols} +\alias{get_partition_cols} +\title{Gets the column used to partition a column} +\usage{ +get_partition_cols(name) +} +\arguments{ +\item{name}{\code{str} Name of the column} +} +\value{ +\code{chr} A character vector of columns to partition with. +} +\description{ +Gets the column used to partition a column +} +\examples{ +\dontrun{ +get_partition_cols("afp") +} +} +\keyword{internal} From 6ca024be0a00f4881b335080825299e231d913df Mon Sep 17 00:00:00 2001 From: mcuadera Date: Fri, 14 Mar 2025 12:44:25 -0400 Subject: [PATCH 02/28] build local parquet --- NAMESPACE | 1 + R/dal.parquet.R | 176 +++++++++++++++++++++++++++- man/build_parquet_raw_data.Rd | 35 ++++++ man/build_parquet_raw_data_edav.Rd | 24 ++++ man/build_parquet_raw_data_local.Rd | 19 +++ sirfunctions.Rproj | 1 - 6 files changed, 249 insertions(+), 7 deletions(-) create mode 100644 man/build_parquet_raw_data.Rd create mode 100644 man/build_parquet_raw_data_edav.Rd create mode 100644 man/build_parquet_raw_data_local.Rd diff --git a/NAMESPACE b/NAMESPACE index 58fb1a2e..87202a8e 100644 --- a/NAMESPACE +++ b/NAMESPACE @@ -1,5 +1,6 @@ # Generated by roxygen2: do not edit by hand +export(build_parquet_raw_data) export(check_afp_guid_ctry_data) export(check_cache) export(check_missing_rows) diff --git a/R/dal.parquet.R b/R/dal.parquet.R index fdf89960..53036d5b 100644 --- a/R/dal.parquet.R +++ b/R/dal.parquet.R 
@@ -14,12 +14,97 @@ #' create_raw_data_parquet(raw_data, "C:/Users/ABC1/Desktop/raw_data_parquet") #' } create_raw_data_parquet <- function(raw_data, path){ + start <- Sys.time() df_names <- names(raw_data) + if (!dir.exists(path)) { + cli::cli_abort("Directory path does not exist.") + } + + cli::cli_process_start("Creating raw_data parquet folder") + iter <- 1 for (i in df_names) { - switch() + cli::cli_alert_info(paste0("Now processing: ", i)) + + if (i %in% c("global.prov", "global.dist")) { + raw_data[[i]] |> + dplyr::mutate(SHAPE = sf::st_as_text(SHAPE)) |> + arrow::write_dataset(path = file.path(path, i), + partitioning = get_partition_cols(i)) + + } else if (i == "global.ctry") { + raw_data[[i]] |> + dplyr::mutate(Shape = sf::st_as_text(Shape)) |> + arrow::write_dataset(path = file.path(path, i), + partitioning = get_partition_cols(i)) + } else if (i %in% c("cities", "roads")) { + raw_data[[i]] |> + dplyr::mutate(geometry = sf::st_as_text(geometry)) |> + arrow::write_dataset(path = file.path(path, i), + partitioning = get_partition_cols(i)) + + } else if (i == "metadata") { + raw_data[[i]] |> + dplyr::as_tibble() |> + arrow::write_dataset(path = file.path(path, i), + partitioning = get_partition_cols(i)) + } else { + raw_data[[i]] |> arrow::write_dataset(path = file.path(path, i), + partitioning = get_partition_cols(i)) + } + + cli::cli_alert_info(paste0(iter, "/", length(df_names), " processed.")) + iter <- iter + 1 } + cli::cli_process_done() + cli::cli_alert_success("raw_data parquet folder created!") + cli::cli_alert_info(paste0("Data processed in: ", + round(difftime(Sys.time(), start, "mins"), 2), + " mins.")) +} + +#' Recreate raw data from local parquet folder +#' +#' Recreates an output of [get_all_polio_data()] from a folder housing +#' data in parquet format. +#' +#' @param path `str` Local path to the parquet folder +#' @param from_edav `bool` Build using local files or files in EDAV? 
+#' @param container `azcontainer` An instance of an Azure container to connect +#' to. Pass [get_azure_storage_connection()] using defaults if not accessing +#' using a service principal. +#' +#' @returns `list` A list containing connections to the folders associated with +#' individual datasets in the original output of [get_all_polio_data()]. +#' @export +#' +#' @examples +#' \dontrun{ +#' # Building raw_data locally +#' parquet_path <- "C:/Users/ABC1/Desktop/parquet_folder" +#' raw_data <- build_parquet_raw_data(parquet_path) +#' +#' # Build raw_data from EDAV +#' raw_data <- build_parquet_raw_data() +#' } +build_parquet_raw_data <- function(path = NULL, from_edav = F, container = NULL) { + + if (from_edav) { + # Default values + if (!is.null(path)) { + path <- "GID/PEB/SIR/Sandbox/parquet_sandbox" + } + if (!is.null(container)) { + container <- get_azure_storage_connection() + } + + raw_data <- build_parquet_raw_data_edav(path, container) + } else { + raw_data <- build_parquet_raw_data_local(path) + } + + return(raw_data) } # Private functions ---- @@ -28,7 +113,7 @@ create_raw_data_parquet <- function(raw_data, path){ #' #' @param name `str` Name of the column #' -#' @return `chr` A character vector of columns to partition with. +#' @returns `chr` A character vector of columns to partition with. 
#' @keywords internal #' #' @examples @@ -49,14 +134,93 @@ get_partition_cols <- function(name) { "pos.dupe" = c("place.admin.0", "yronset"), "other" = c("place.admin.0", "yronset"), "other.dupe" = c("place.admin.0", "yronset"), - "dist.pop" = c("ADM0_NAME", "year"), - "prov.pop" = c("ADM0_NAME", "year"), - "ctry.pop" = c("ADM0_NAME", "year"), + "dist.pop" = c("ADM0_NAME"), + "prov.pop" = c("ADM0_NAME"), + "ctry.pop" = c("ADM0_NAME"), "global.ctry" = c("ADM0_NAME"), "global.prov" = c("ADM0_NAME"), "global.dist" = c("ADM0_NAME"), "roads" = c("continent"), - "cities" = c("CTRY_NAME"), + "cities" = c("CNTRY_NAME"), "metadata" = "download_time" ) } + +#' Build raw_data using local parquet files +#' +#' @param path `str` A path to the parquet directory +#' +#' @returns `list` A list containing connections to the folders associated with +#' individual datasets in the original output of [get_all_polio_data()]. +#' @keywords internal +#' +build_parquet_raw_data_local <- function(path = NULL) { + + if (!dir.exists(path)) { + cli::cli_abort("Not a valid directory.") + } + + valid_values <- c("afp", "afp.dupe", "afp.epi", "para.case", "es", "es.dupe", + "sia", "sia.dupe", "pos", "pos.dupe", "other", "other.dupe", + "dist.pop", "prov.pop", "ctry.pop", "global.ctry", + "global.prov", "global.dist", "roads" , "cities", "metadata" + ) + data <- list.files(path) + data <- intersect(data, valid_values) + + raw_data <- list() + for (i in data) { + raw_data[[i]] <- arrow::open_dataset(file.path(path, i)) + } + + return(raw_data) + +} + +#' Build raw_data using EDAV files +#' +#' @param path `str` Path to EDAV folder containing parquet files. This must +#' be the absolute file path from the Blob endpoint of the container. +#' @param container `azcontainer` An instance of an Azure container to connect +#' to. Pass [get_azure_storage_connection()] using defaults if not accessing +#' using a service principal. 
+#' +#' @returns `list` A list containing connections to the folders associated with +#' individual datasets in the original output of [get_all_polio_data()]. +#' @keywords internal +#' +build_parquet_raw_data_edav <- function(path = NULL, container = NULL, ...) { + + if (is.null(container)) { + container <- get_azure_storage_connection() + } + + exist <- edav_io("exists.dir", default_dir = "", + file_loc = path, azcontainer = container) + if (!exist) { + cli::cli_abort("The directory does not exist on EDAV.") + } else { + rm(exist) + } + + cli::cli_process_start("Building raw_data from EDAV parquet files") + start <- Sys.time() + + raw_data <- NULL + # Download files locally in the temp directory first + dest <- "C:/Users/XRG9/Desktop/test" + local_pq <- file.path(dest, basename(path)) + AzureStor::multidownload_adls_file(container, + src = "GID/PEB/SIR/Sandbox/parquet_sandbox/*", + dest = local_pq, + recursive = TRUE, + overwrite = TRUE + ) + + raw_data <- build_parquet_raw_data_local(local_pq) + cli::cli_process_done() + cli::cli_process_start(paste0("Built in ", difftime(start, Sys.time(), "mins"), " mins.")) + + return(raw_data) + +} diff --git a/man/build_parquet_raw_data.Rd b/man/build_parquet_raw_data.Rd new file mode 100644 index 00000000..1eef21a7 --- /dev/null +++ b/man/build_parquet_raw_data.Rd @@ -0,0 +1,35 @@ +% Generated by roxygen2: do not edit by hand +% Please edit documentation in R/dal.parquet.R +\name{build_parquet_raw_data} +\alias{build_parquet_raw_data} +\title{Recreate raw data from local parquet folder} +\usage{ +build_parquet_raw_data(path = NULL, from_edav = F, container = NULL) +} +\arguments{ +\item{path}{\code{str} Local path to the parquet folder} + +\item{from_edav}{\code{bool} Build using local files or files in EDAV?} + +\item{container}{\code{azcontainer} An instance of an Azure container to connect +to. 
Pass \code{\link[=get_azure_storage_connection]{get_azure_storage_connection()}} using defaults if not accessing +using a service principal.} +} +\value{ +\code{list} A list containing connections to the folders associated with +individual datasets in the original output of \code{\link[=get_all_polio_data]{get_all_polio_data()}}. +} +\description{ +Recreates an output of \code{\link[=get_all_polio_data]{get_all_polio_data()}} from a folder housing +data in parquet format. +} +\examples{ +\dontrun{ +# Building raw_data locally +parquet_path <- "C:/Users/ABC1/Desktop/parquet_folder" +raw_data <- build_parquet_raw_data(parquet_path) + +# Build raw_data from EDAV +raw_data <- build_parquet_raw_data() +} +} diff --git a/man/build_parquet_raw_data_edav.Rd b/man/build_parquet_raw_data_edav.Rd new file mode 100644 index 00000000..a8af4023 --- /dev/null +++ b/man/build_parquet_raw_data_edav.Rd @@ -0,0 +1,24 @@ +% Generated by roxygen2: do not edit by hand +% Please edit documentation in R/dal.parquet.R +\name{build_parquet_raw_data_edav} +\alias{build_parquet_raw_data_edav} +\title{Build raw_data using EDAV files} +\usage{ +build_parquet_raw_data_edav(path = NULL, container = NULL, ...) +} +\arguments{ +\item{path}{\code{str} Path to EDAV folder containing parquet files. This must +be the absolute file path from the Blob endpoint of the container.} + +\item{container}{\code{azcontainer} An instance of an Azure container to connect +to. Pass \code{\link[=get_azure_storage_connection]{get_azure_storage_connection()}} using defaults if not accessing +using a service principal.} +} +\value{ +\code{list} A list containing connections to the folders associated with +individual datasets in the original output of \code{\link[=get_all_polio_data]{get_all_polio_data()}}. 
+} +\description{ +Build raw_data using EDAV files +} +\keyword{internal} diff --git a/man/build_parquet_raw_data_local.Rd b/man/build_parquet_raw_data_local.Rd new file mode 100644 index 00000000..d6b7aba5 --- /dev/null +++ b/man/build_parquet_raw_data_local.Rd @@ -0,0 +1,19 @@ +% Generated by roxygen2: do not edit by hand +% Please edit documentation in R/dal.parquet.R +\name{build_parquet_raw_data_local} +\alias{build_parquet_raw_data_local} +\title{Build raw_data using local parquet files} +\usage{ +build_parquet_raw_data_local(path = NULL) +} +\arguments{ +\item{path}{\code{str} A path to the parquet directory} +} +\value{ +\code{list} A list containing connections to the folders associated with +individual datasets in the original output of \code{\link[=get_all_polio_data]{get_all_polio_data()}}. +} +\description{ +Build raw_data using local parquet files +} +\keyword{internal} diff --git a/sirfunctions.Rproj b/sirfunctions.Rproj index fd189303..69fafd4b 100644 --- a/sirfunctions.Rproj +++ b/sirfunctions.Rproj @@ -1,5 +1,4 @@ Version: 1.0 -ProjectId: e9616991-2fba-4185-b9cb-72e1f1045eb4 RestoreWorkspace: No SaveWorkspace: No From fa59911fa74f1a1a9c9de7c80afafb27c2776479 Mon Sep 17 00:00:00 2001 From: mcuadera Date: Fri, 14 Mar 2025 14:23:09 -0400 Subject: [PATCH 03/28] create parquet folder upload fx --- NAMESPACE | 1 + R/dal.parquet.R | 50 ++++++++++++++++++++++++++++++++--- man/upload_parquet_to_edav.Rd | 28 ++++++++++++++++++++ 3 files changed, 76 insertions(+), 3 deletions(-) create mode 100644 man/upload_parquet_to_edav.Rd diff --git a/NAMESPACE b/NAMESPACE index 87202a8e..f0eee07f 100644 --- a/NAMESPACE +++ b/NAMESPACE @@ -85,6 +85,7 @@ export(send_teams_message) export(set_emergence_colors) export(test_EDAV_connection) export(upload_dr_to_github) +export(upload_parquet_to_edav) export(upload_to_sharepoint) import(dplyr) import(ggplot2) diff --git a/R/dal.parquet.R b/R/dal.parquet.R index 53036d5b..bfc272d3 100644 --- a/R/dal.parquet.R +++ 
b/R/dal.parquet.R @@ -107,6 +107,50 @@ build_parquet_raw_data <- function(path = NULL, from_edav = F, container = NULL) return(raw_data) } +#' Uploads a local parquet folder to EDAV +#' +#' Uploads a folder containing parquet files to EDAV +#' +#' @param src `str` Local path to the parquet folder. +#' @param dest `str` EDAV endpoint. +#' @param container `azcontainer` An instance of an Azure container. +#' +#' @returns None. +#' @export +#' +#' @examples +#' \dontrun{ +#' local_dir <- "C:/Users/ABC1/Desktop/parquet_folder" +#' edav_dir <- "ABC/parquet_folder" +#' upload_parquet_to_edav(local_dir, edav_dir) +#' } +upload_parquet_to_edav <- function(src, dest, container = NULL) { + if (is.null(container)) { + container <- get_azure_storage_connection() + } + + while (TRUE) { + cli::cli_alert_info(paste0("Confirm upload to: ", dest, "/", basename(src), " (y/n)")) + response <- stringr::str_to_lower(stringr::str_trim(readline("> "))) + if (!response %in% c("y", "n")) { + cli::cli_alert_warning("Invalid response. Try again.") + } else if (response == "n") { + cli::cli_alert("Upload cancelled.") + } else if (response == "y") { + break + } + } + + cli::cli_process_start("Uploading parquet folder to EDAV") + start <- Sys.time() + AzureStor::multiupload_adls_file(container, paste0(src, "/*"), dest, + recursive = TRUE) + cli::cli_process_done() + cli::cli_alert_success(c("Uploaded in: ", + round(difftime(Sys.time(), start, "mins"), 2), + " mins")) +} + # Private functions ---- #' Gets the column used to partition a column @@ -208,10 +252,10 @@ build_parquet_raw_data_edav <- function(path = NULL, container = NULL, ...) 
{ raw_data <- NULL # Download files locally in the temp directory first - dest <- "C:/Users/XRG9/Desktop/test" + dest <- tempdir() local_pq <- file.path(dest, basename(path)) AzureStor::multidownload_adls_file(container, - src = "GID/PEB/SIR/Sandbox/parquet_sandbox/*", + src = paste0(path, "/*"), dest = local_pq, recursive = TRUE, overwrite = TRUE @@ -219,7 +263,7 @@ build_parquet_raw_data_edav <- function(path = NULL, container = NULL, ...) { raw_data <- build_parquet_raw_data_local(local_pq) cli::cli_process_done() - cli::cli_process_start(paste0("Built in ", difftime(start, Sys.time(), "mins"), " mins.")) + cli::cli_process_start(paste0("Built in ", difftime(Sys.time(), start, "mins"), " mins.")) return(raw_data) diff --git a/man/upload_parquet_to_edav.Rd b/man/upload_parquet_to_edav.Rd new file mode 100644 index 00000000..41064538 --- /dev/null +++ b/man/upload_parquet_to_edav.Rd @@ -0,0 +1,28 @@ +% Generated by roxygen2: do not edit by hand +% Please edit documentation in R/dal.parquet.R +\name{upload_parquet_to_edav} +\alias{upload_parquet_to_edav} +\title{Uploads a local parquet folder to EDAV} +\usage{ +upload_parquet_to_edav(src, dest, container = NULL) +} +\arguments{ +\item{src}{\code{str} Local path to the parquet folder.} + +\item{dest}{\code{str} EDAV endpoint.} + +\item{container}{\code{azcontainer} An instance of an Azure container.} +} +\value{ +None. 
+} +\description{ +Uploads a folder containing parquet files to EDAV +} +\examples{ +\dontrun{ +local_dir <- "C:/Users/ABC1/Desktop/parquet_folder" +edav_dir <- "ABC/parquet_folder" +upload_parquet_to_edav(local_dir, edav_dir) +} +} From 9b8185d75b0cfd995c6c5362e21ae654072b3ef9 Mon Sep 17 00:00:00 2001 From: mcuadera Date: Wed, 19 Mar 2025 08:41:52 -0400 Subject: [PATCH 04/28] using storage multidownload instead of multidownload_adls_file for building the parquet raw data from EDAV --- R/dal.parquet.R | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/R/dal.parquet.R b/R/dal.parquet.R index bfc272d3..e5f30cdc 100644 --- a/R/dal.parquet.R +++ b/R/dal.parquet.R @@ -254,7 +254,7 @@ build_parquet_raw_data_edav <- function(path = NULL, container = NULL, ...) { # Download files locally in the temp directory first dest <- tempdir() local_pq <- file.path(dest, basename(path)) - AzureStor::multidownload_adls_file(container, + AzureStor::storage_multidownload(container, src = paste0(path, "/*"), dest = local_pq, recursive = TRUE, From 58632f776465f12021fc36e54794bf1e2b69e67b Mon Sep 17 00:00:00 2001 From: Mervin Keith Cuadera <40894971+mcuadera@users.noreply.github.com> Date: Mon, 30 Mar 2026 13:20:15 -0400 Subject: [PATCH 05/28] shard using ctry name ctry + year creates too much sharding --- R/dal.parquet.R | 40 ++++++++++++++++++++-------------------- 1 file changed, 20 insertions(+), 20 deletions(-) diff --git a/R/dal.parquet.R b/R/dal.parquet.R index e5f30cdc..648f6e39 100644 --- a/R/dal.parquet.R +++ b/R/dal.parquet.R @@ -166,26 +166,26 @@ upload_parquet_to_edav <- function(src, dest, container = NULL) { #' } get_partition_cols <- function(name) { switch(name, - "afp" = c("place.admin.0", "yronset"), - "afp.dupe" = c("place.admin.0", "yronset"), - "afp.epi" = c("place.admin.0", "yronset"), - "para.case" = c("place.admin.0", "yronset"), - "es" = c("ADM0_NAME", "collect.yr"), - "es.dupe" = c("ADM0_NAME", "collect.yr"), - "sia" = c("place.admin.0", 
"yr.sia"), - "sia.dupe" = c("place.admin.0", "yr.sia"), - "pos" = c("place.admin.0", "yronset"), - "pos.dupe" = c("place.admin.0", "yronset"), - "other" = c("place.admin.0", "yronset"), - "other.dupe" = c("place.admin.0", "yronset"), - "dist.pop" = c("ADM0_NAME"), - "prov.pop" = c("ADM0_NAME"), - "ctry.pop" = c("ADM0_NAME"), - "global.ctry" = c("ADM0_NAME"), - "global.prov" = c("ADM0_NAME"), - "global.dist" = c("ADM0_NAME"), - "roads" = c("continent"), - "cities" = c("CNTRY_NAME"), + "afp" = "place.admin.0", + "afp.dupe" = "place.admin.0", + "afp.epi" = "place.admin.0", + "para.case" = "place.admin.0", + "es" = "ADM0_NAME", + "es.dupe" = "ADM0_NAME", + "sia" = "place.admin.0", + "sia.dupe" = "place.admin.0", + "pos" = "place.admin.0", + "pos.dupe" = "place.admin.0", + "other" = "place.admin.0", + "other.dupe" = "place.admin.0", + "dist.pop" = "ctry", + "prov.pop" = "ctry", + "ctry.pop" = "ctry", + "global.ctry" = "ADM0_NAME", + "global.prov" = "ADM0_NAME", + "global.dist" = "ADM0_NAME", + "roads" = "continent", + "cities" = "CNTRY_NAME", "metadata" = "download_time" ) } From 580c777ccf326ae36c56bb4b5fa3b19d4c25f751 Mon Sep 17 00:00:00 2001 From: Mervin Keith Cuadera <40894971+mcuadera@users.noreply.github.com> Date: Mon, 30 Mar 2026 13:23:09 -0400 Subject: [PATCH 06/28] Add coverage datasets --- R/dal.parquet.R | 3 +++ 1 file changed, 3 insertions(+) diff --git a/R/dal.parquet.R b/R/dal.parquet.R index 648f6e39..b0580d8d 100644 --- a/R/dal.parquet.R +++ b/R/dal.parquet.R @@ -184,6 +184,9 @@ get_partition_cols <- function(name) { "global.ctry" = "ADM0_NAME", "global.prov" = "ADM0_NAME", "global.dist" = "ADM0_NAME", + "ctry.coverage" = "year", + "prov.coverage" = "year", + "dist.coverage" = "year", "roads" = "continent", "cities" = "CNTRY_NAME", "metadata" = "download_time" From 69b9980d27c70e956f6eced8d639f7134a547e82 Mon Sep 17 00:00:00 2001 From: Mervin Keith Cuadera <40894971+mcuadera@users.noreply.github.com> Date: Mon, 30 Mar 2026 14:05:46 -0400 Subject: [PATCH 
07/28] change partition column also include new helper function --- R/dal.parquet.R | 101 ++++++++++++++++++++++++++++++------------------ 1 file changed, 64 insertions(+), 37 deletions(-) diff --git a/R/dal.parquet.R b/R/dal.parquet.R index b0580d8d..e2602f3f 100644 --- a/R/dal.parquet.R +++ b/R/dal.parquet.R @@ -17,6 +17,9 @@ create_raw_data_parquet <- function(raw_data, path){ start <- Sys.time() df_names <- names(raw_data) + options(arrow.use_threads = TRUE) + on.exit(options(arrow.use_threads = old_threads), add = TRUE) + if (!dir.exists(path)) { cli::cli_abort("Directory path does not exist.") } @@ -26,17 +29,12 @@ create_raw_data_parquet <- function(raw_data, path){ for (i in df_names) { cli::cli_alert_info(paste0("Now processing: ", i)) - if (i %in% c("global.prov", "global.dist")) { - raw_data[[i]] |> - dplyr::mutate(SHAPE = sf::st_as_text(SHAPE)) |> - arrow::write_dataset(path = file.path(path, i), - partitioning = get_partition_cols(i)) - - } else if (i == "global.ctry") { + if (i %in% c("global.ctry", "global.prov", "global.dist")) { raw_data[[i]] |> dplyr::mutate(Shape = sf::st_as_text(Shape)) |> arrow::write_dataset(path = file.path(path, i), partitioning = get_partition_cols(i)) + } else if (i %in% c("cities", "roads")) { raw_data[[i]] |> dplyr::mutate(geometry = sf::st_as_text(geometry)) |> @@ -166,24 +164,24 @@ upload_parquet_to_edav <- function(src, dest, container = NULL) { #' } get_partition_cols <- function(name) { switch(name, - "afp" = "place.admin.0", - "afp.dupe" = "place.admin.0", - "afp.epi" = "place.admin.0", - "para.case" = "place.admin.0", - "es" = "ADM0_NAME", - "es.dupe" = "ADM0_NAME", - "sia" = "place.admin.0", - "sia.dupe" = "place.admin.0", - "pos" = "place.admin.0", - "pos.dupe" = "place.admin.0", - "other" = "place.admin.0", - "other.dupe" = "place.admin.0", - "dist.pop" = "ctry", - "prov.pop" = "ctry", - "ctry.pop" = "ctry", - "global.ctry" = "ADM0_NAME", - "global.prov" = "ADM0_NAME", - "global.dist" = "ADM0_NAME", + "afp" = 
"yronset", + "afp.dupe" = "yronset", + "afp.epi" = "yronset", + "para.case" = "yronset", + "es" = "collect.yr", + "es.dupe" = "collect.yr", + "sia" = "yr.sia", + "sia.dupe" = "yr.sia", + "pos" = "yronset", + "pos.dupe" = "yronset", + "other" = "yronset", + "other.dupe" = "yronset", + "dist.pop" = "year", + "prov.pop" = "year", + "ctry.pop" = "year", + "global.ctry" = "WHO_REGION", + "global.prov" = "WHO_REGION", + "global.dist" = "WHO_REGION", "ctry.coverage" = "year", "prov.coverage" = "year", "dist.coverage" = "year", @@ -210,7 +208,9 @@ build_parquet_raw_data_local <- function(path = NULL) { valid_values <- c("afp", "afp.dupe", "afp.epi", "para.case", "es", "es.dupe", "sia", "sia.dupe", "pos", "pos.dupe", "other", "other.dupe", "dist.pop", "prov.pop", "ctry.pop", "global.ctry", - "global.prov", "global.dist", "roads" , "cities", "metadata" + "global.prov", "global.dist", + "ctry.coverage", "prov.coverage", "dist.coverage", + "roads" , "cities", "metadata" ) data <- list.files(path) data <- intersect(data, valid_values) @@ -255,19 +255,46 @@ build_parquet_raw_data_edav <- function(path = NULL, container = NULL, ...) { raw_data <- NULL # Download files locally in the temp directory first - dest <- tempdir() - local_pq <- file.path(dest, basename(path)) - AzureStor::storage_multidownload(container, - src = paste0(path, "/*"), - dest = local_pq, - recursive = TRUE, - overwrite = TRUE - ) - - raw_data <- build_parquet_raw_data_local(local_pq) - cli::cli_process_done() + withr::with_tempdir({ + local_pq <- file.path(getwd(), basename(path)) + AzureStor::storage_multidownload(container, + src = paste0(path, "/*"), + dest = local_pq, + recursive = TRUE, + overwrite = TRUE + ) + + raw_data <- build_parquet_raw_data_local(local_pq) + cli::cli_process_done() + }) + cli::cli_process_start(paste0("Built in ", difftime(Sys.time(), start, "mins"), " mins.")) return(raw_data) } + +#' Drop Shape column and convert to binary +#' +#' @param x `sf` or `data.frame` Geodata. 
+#' @param geom_col `str` Name of the geometry column. +#' +#' @returns `tibble` Data without any Shape column. +#' +#' @keywords internal +#' +to_wkb_drop_sf <- function(x, geom_col) { + # Works whether x is sf or a plain data.frame with an sfc column + geom <- if (inherits(x, "sf")) { + sf::st_geometry(x) + } else { + x[[geom_col]] + } + + x[[paste0(geom_col, "_wkb")]] <- sf::st_as_binary(geom) + x[[geom_col]] <- NULL + if (inherits(x, "sf")) { + x <- sf::st_drop_geometry(x) + } + return(x) +} \ No newline at end of file From 918a58636969c7cf63776a8f568a4f9a54721a52 Mon Sep 17 00:00:00 2001 From: Mervin Keith Cuadera <40894971+mcuadera@users.noreply.github.com> Date: Mon, 30 Mar 2026 14:13:08 -0400 Subject: [PATCH 08/28] use threading to create partitions --- R/dal.parquet.R | 11 ++++++----- 1 file changed, 6 insertions(+), 5 deletions(-) diff --git a/R/dal.parquet.R b/R/dal.parquet.R index e2602f3f..755d94f9 100644 --- a/R/dal.parquet.R +++ b/R/dal.parquet.R @@ -17,6 +17,7 @@ create_raw_data_parquet <- function(raw_data, path){ start <- Sys.time() df_names <- names(raw_data) + old_threads <- getOption("arrow.use_threads") options(arrow.use_threads = TRUE) on.exit(options(arrow.use_threads = old_threads), add = TRUE) @@ -29,15 +30,15 @@ create_raw_data_parquet <- function(raw_data, path){ for (i in df_names) { cli::cli_alert_info(paste0("Now processing: ", i)) + data <- + if (i %in% c("global.ctry", "global.prov", "global.dist")) { - raw_data[[i]] |> - dplyr::mutate(Shape = sf::st_as_text(Shape)) |> + to_wkb_drop_sf(raw_data[[i]], "Shape") |> arrow::write_dataset(path = file.path(path, i), partitioning = get_partition_cols(i)) } else if (i %in% c("cities", "roads")) { - raw_data[[i]] |> - dplyr::mutate(geometry = sf::st_as_text(geometry)) |> + to_wkb_drop_sf(raw_data[[i]], "geometry") |> arrow::write_dataset(path = file.path(path, i), partitioning = get_partition_cols(i)) @@ -186,7 +187,7 @@ get_partition_cols <- function(name) { "prov.coverage" = "year", 
"dist.coverage" = "year", "roads" = "continent", - "cities" = "CNTRY_NAME", + "cities" = "POP_CLASS", "metadata" = "download_time" ) } From 105b3b7cffca6c919dc683025167c11c02f7e063 Mon Sep 17 00:00:00 2001 From: Mervin Keith Cuadera <40894971+mcuadera@users.noreply.github.com> Date: Mon, 30 Mar 2026 14:28:05 -0400 Subject: [PATCH 09/28] remove execution time code and fix to_wkb_drop_sf --- R/dal.parquet.R | 12 ++++++------ 1 file changed, 6 insertions(+), 6 deletions(-) diff --git a/R/dal.parquet.R b/R/dal.parquet.R index 755d94f9..49304cad 100644 --- a/R/dal.parquet.R +++ b/R/dal.parquet.R @@ -14,7 +14,7 @@ #' create_raw_data_parquet(raw_data, "C:/Users/ABC1/Desktop/raw_data_parquet") #' } create_raw_data_parquet <- function(raw_data, path){ - start <- Sys.time() + df_names <- names(raw_data) old_threads <- getOption("arrow.use_threads") @@ -57,10 +57,6 @@ create_raw_data_parquet <- function(raw_data, path){ } cli::cli_process_done() - cli::cli_alert_success("raw_data parquet folder created!") - cli::cli_alert_info(paste0("Data processed in: ", - round(difftime(Sys.time(), start, "mins"), 2), - " mins.")) } #' Recreate raw data from local parquet folder @@ -292,7 +288,11 @@ to_wkb_drop_sf <- function(x, geom_col) { x[[geom_col]] } - x[[paste0(geom_col, "_wkb")]] <- sf::st_as_binary(geom) + # Convert to WKB (list of raw vectors), then drop the "WKB" class + wkb <- sf::st_as_binary(geom) + wkb <- unclass(wkb) # <- key line: makes it a plain list Arrow can infer + + x[[paste0(geom_col, "_wkb")]] <- wkb x[[geom_col]] <- NULL if (inherits(x, "sf")) { x <- sf::st_drop_geometry(x) From 4638444c987b6ae44a4b5e3c1f90007a02c945e7 Mon Sep 17 00:00:00 2001 From: Mervin Keith Cuadera <40894971+mcuadera@users.noreply.github.com> Date: Mon, 30 Mar 2026 15:36:53 -0400 Subject: [PATCH 10/28] add function to convert wkb to sf --- R/dal.parquet.R | 66 ++++++++++++++++++++++++++++++++++++++++--------- 1 file changed, 54 insertions(+), 12 deletions(-) diff --git a/R/dal.parquet.R 
b/R/dal.parquet.R index 49304cad..7b27dc43 100644 --- a/R/dal.parquet.R +++ b/R/dal.parquet.R @@ -61,6 +61,7 @@ create_raw_data_parquet <- function(raw_data, path){ #' Recreate raw data from local parquet folder #' +#' @description #' Recreates an output of [get_all_polio_data()] from a folder housing #' data in parquet format. #' @@ -69,6 +70,9 @@ create_raw_data_parquet <- function(raw_data, path){ #' @param container `azcontainer` An instance of an Azure container to connect #' to. Pass [get_azure_storage_connection()] using defaults if not accessing #' using a service principal. +#' +#' @details +#' For tibbles with Shapes, pass to [from_wkb_to_sf()] first before creating maps. #' #' @returns `list` A list containing connections to the folders associated with #' individual datasets in the original output of [get_all_polio_data()]. @@ -274,28 +278,66 @@ build_parquet_raw_data_edav <- function(path = NULL, container = NULL, ...) { #' Drop Shape column and convert to binary #' #' @param x `sf` or `data.frame` Geodata. -#' @param geom_col `str` Name of the geometry column. -#' -#' @returns `tibble` Data without any Shape column. +#' +#' @details +#' This function was written using the CDC EDAV Chatbot using the model GPT-5.2. +#' @returns `tibble` dData without any Shape column. 
#' #' @keywords internal #' -to_wkb_drop_sf <- function(x, geom_col) { +to_wkb_drop_sf <- function(sf_data) { + + if ("Shape" %in% names(sf_data)) { + geom_col <- "Shape" + } else if ("geometry" %in% names(sf_data)) { + geom_col <- "geometry" + } else { + cli::cli_abort("Not an sf dataset.") + } + # Works whether x is sf or a plain data.frame with an sfc column - geom <- if (inherits(x, "sf")) { - sf::st_geometry(x) + geom <- if (inherits(sf_data, "sf")) { + sf::st_geometry(sf_data) } else { - x[[geom_col]] + sf_data[[geom_col]] } # Convert to WKB (list of raw vectors), then drop the "WKB" class wkb <- sf::st_as_binary(geom) wkb <- unclass(wkb) # <- key line: makes it a plain list Arrow can infer - x[[paste0(geom_col, "_wkb")]] <- wkb - x[[geom_col]] <- NULL - if (inherits(x, "sf")) { - x <- sf::st_drop_geometry(x) + sf_data[[geom_col]] <- wkb + if (inherits(sf_data, "sf")) { + sf_data <- sf::st_drop_geometry(sf_data) + } + return(sf_data) +} + +#' Convert WKB back to sf column +#' +#' @param sf_data `arrow connection` Geodata arrow connection. +#' +#' @returns `tibble` Geodata with `sf`. +#' +#' @export +from_wkb_to_sf <- function(sf_data) { + + + # Ensure that global shapefiles have Shape and city/roads as geometry. + # Otherwise, need to modify this function. 
+ if ("Shape" %in% names(sf_data)) { + wkb_col <- "Shape" + } else if ("geometry" %in% names(sf_data)) { + wkb_col <- "geometry" + } else { + cli::cli_abort("Not an sf dataset.") } - return(x) + + sf_data |> + dplyr::mutate(dplyr::across(dplyr::any_of(wkb_col), \(x) { + sf::st_as_sf(x, EWKB = TRUE, crs = 4326) + })) + + return(sf_data) + } \ No newline at end of file From 9e7c95bca449a8247b389158f60b95395d6ef8d5 Mon Sep 17 00:00:00 2001 From: Mervin Keith Cuadera <40894971+mcuadera@users.noreply.github.com> Date: Mon, 30 Mar 2026 16:00:35 -0400 Subject: [PATCH 11/28] fix raw data compile from EDAV --- R/dal.parquet.R | 104 +++++++++++++++++++++++------------------------- 1 file changed, 49 insertions(+), 55 deletions(-) diff --git a/R/dal.parquet.R b/R/dal.parquet.R index 7b27dc43..34c03ac3 100644 --- a/R/dal.parquet.R +++ b/R/dal.parquet.R @@ -33,12 +33,14 @@ create_raw_data_parquet <- function(raw_data, path){ data <- if (i %in% c("global.ctry", "global.prov", "global.dist")) { - to_wkb_drop_sf(raw_data[[i]], "Shape") |> + raw_data[[i]] |> + to_wkb_drop_sf() |> arrow::write_dataset(path = file.path(path, i), partitioning = get_partition_cols(i)) } else if (i %in% c("cities", "roads")) { - to_wkb_drop_sf(raw_data[[i]], "geometry") |> + raw_data[[i]] |> + to_wkb_drop_sf() |> arrow::write_dataset(path = file.path(path, i), partitioning = get_partition_cols(i)) @@ -91,10 +93,10 @@ build_parquet_raw_data <- function(path = NULL, from_edav = F, container = NULL) if (from_edav) { # Default values - if (!is.null(path)) { + if (is.null(path)) { path <- "GID/PEB/SIR/Sandbox/parquet_sandbox" } - if (!is.null(container)) { + if (is.null(container)) { container <- get_azure_storage_connection() } @@ -141,13 +143,38 @@ upload_parquet_to_edav <- function(src, dest, container = NULL) { } cli::cli_process_start("Uploading parquet folder to EDAV") - start <- Sys.time() - AzureStor::multiupload_adls_file(container, paste0(src, "/*"), dest, + 
AzureStor::multiupload_adls_file(container, paste0(src, "/*"), file.path(dest, basename(src)), recursive = TRUE) cli::cli_process_done() - cli::cli_alert_success(c("Uploaded in: ", - round(difftime(Sys.time(), start, "mins"), 2), - " mins")) +} + +#' Convert WKB back to sf column +#' +#' @param sf_data `arrow connection` Geodata arrow connection. +#' +#' @returns `tibble` Geodata with `sf`. +#' +#' @export +from_wkb_to_sf <- function(sf_data) { + + + # Ensure that global shapefiles have Shape and city/roads as geometry. + # Otherwise, need to modify this function. + if ("Shape" %in% names(sf_data)) { + wkb_col <- "Shape" + } else if ("geometry" %in% names(sf_data)) { + wkb_col <- "geometry" + } else { + cli::cli_abort("Not an sf dataset.") + } + + sf_data |> + dplyr::mutate(dplyr::across(dplyr::any_of(wkb_col), \(x) { + sf::st_as_sf(x, EWKB = TRUE, crs = 4326) + })) + + return(sf_data) + } # Private functions ---- @@ -243,8 +270,9 @@ build_parquet_raw_data_edav <- function(path = NULL, container = NULL, ...) { container <- get_azure_storage_connection() } - exist <- edav_io("exists.dir", default_dir = "", + exist <- edav_io("exists.dir", NULL, file_loc = path, azcontainer = container) + if (!exist) { cli::cli_abort("The directory does not exist on EDAV.") } else { @@ -252,24 +280,19 @@ build_parquet_raw_data_edav <- function(path = NULL, container = NULL, ...) 
{ } cli::cli_process_start("Building raw_data from EDAV parquet files") - start <- Sys.time() raw_data <- NULL - # Download files locally in the temp directory first - withr::with_tempdir({ - local_pq <- file.path(getwd(), basename(path)) - AzureStor::storage_multidownload(container, - src = paste0(path, "/*"), - dest = local_pq, - recursive = TRUE, - overwrite = TRUE - ) - - raw_data <- build_parquet_raw_data_local(local_pq) - cli::cli_process_done() - }) - - cli::cli_process_start(paste0("Built in ", difftime(Sys.time(), start, "mins"), " mins.")) + + local_pq <- file.path(rappdirs::user_data_dir("sirfunctions"), basename(path)) + AzureStor::storage_multidownload(container, + src = paste0(path, "/*"), + dest = local_pq, + recursive = TRUE, + overwrite = TRUE + ) + + raw_data <- build_parquet_raw_data_local(local_pq) + cli::cli_process_done() return(raw_data) @@ -311,33 +334,4 @@ to_wkb_drop_sf <- function(sf_data) { sf_data <- sf::st_drop_geometry(sf_data) } return(sf_data) -} - -#' Convert WKB back to sf column -#' -#' @param sf_data `arrow connection` Geodata arrow connection. -#' -#' @returns `tibble` Geodata with `sf`. -#' -#' @export -from_wkb_to_sf <- function(sf_data) { - - - # Ensure that global shapefiles have Shape and city/roads as geometry. - # Otherwise, need to modify this function. 
- if ("Shape" %in% names(sf_data)) { - wkb_col <- "Shape" - } else if ("geometry" %in% names(sf_data)) { - wkb_col <- "geometry" - } else { - cli::cli_abort("Not an sf dataset.") - } - - sf_data |> - dplyr::mutate(dplyr::across(dplyr::any_of(wkb_col), \(x) { - sf::st_as_sf(x, EWKB = TRUE, crs = 4326) - })) - - return(sf_data) - } \ No newline at end of file From ad1714a05a09df6734202cf0b1c783990a917a2c Mon Sep 17 00:00:00 2001 From: Mervin Keith Cuadera <40894971+mcuadera@users.noreply.github.com> Date: Mon, 30 Mar 2026 16:02:29 -0400 Subject: [PATCH 12/28] add docs for helper functions --- NAMESPACE | 3 ++- man/build_parquet_raw_data.Rd | 3 +++ man/from_wkb_to_sf.Rd | 17 +++++++++++++++++ man/to_wkb_drop_sf.Rd | 21 +++++++++++++++++++++ 4 files changed, 43 insertions(+), 1 deletion(-) create mode 100644 man/from_wkb_to_sf.Rd create mode 100644 man/to_wkb_drop_sf.Rd diff --git a/NAMESPACE b/NAMESPACE index 4b589f9b..7368b8d1 100644 --- a/NAMESPACE +++ b/NAMESPACE @@ -1,8 +1,8 @@ # Generated by roxygen2: do not edit by hand export(add_rolling_years) -export(check_afp_geographies) export(build_parquet_raw_data) +export(check_afp_geographies) export(check_afp_guid_ctry_data) export(check_cache) export(check_missing_rows) @@ -38,6 +38,7 @@ export(f.stool.ad.01) export(f.timely.detection.01) export(fix_ctry_data_missing_guids) export(force_load_polio_data_cache) +export(from_wkb_to_sf) export(generate_60_day_tab) export(generate_60_day_table_data) export(generate_ad_final_col) diff --git a/man/build_parquet_raw_data.Rd b/man/build_parquet_raw_data.Rd index 1eef21a7..3390b92a 100644 --- a/man/build_parquet_raw_data.Rd +++ b/man/build_parquet_raw_data.Rd @@ -23,6 +23,9 @@ individual datasets in the original output of \code{\link[=get_all_polio_data]{g Recreates an output of \code{\link[=get_all_polio_data]{get_all_polio_data()}} from a folder housing data in parquet format. 
} +\details{ +For tibbles with Shapes, pass to \code{\link[=from_wkb_to_sf]{from_wkb_to_sf()}} first before creating maps. +} \examples{ \dontrun{ # Building raw_data locally diff --git a/man/from_wkb_to_sf.Rd b/man/from_wkb_to_sf.Rd new file mode 100644 index 00000000..33920411 --- /dev/null +++ b/man/from_wkb_to_sf.Rd @@ -0,0 +1,17 @@ +% Generated by roxygen2: do not edit by hand +% Please edit documentation in R/dal.parquet.R +\name{from_wkb_to_sf} +\alias{from_wkb_to_sf} +\title{Convert WKB back to sf column} +\usage{ +from_wkb_to_sf(sf_data) +} +\arguments{ +\item{sf_data}{\verb{arrow connection} Geodata arrow connection.} +} +\value{ +\code{tibble} Geodata with \code{sf}. +} +\description{ +Convert WKB back to sf column +} diff --git a/man/to_wkb_drop_sf.Rd b/man/to_wkb_drop_sf.Rd new file mode 100644 index 00000000..d1f0f560 --- /dev/null +++ b/man/to_wkb_drop_sf.Rd @@ -0,0 +1,21 @@ +% Generated by roxygen2: do not edit by hand +% Please edit documentation in R/dal.parquet.R +\name{to_wkb_drop_sf} +\alias{to_wkb_drop_sf} +\title{Drop Shape column and convert to binary} +\usage{ +to_wkb_drop_sf(sf_data) +} +\arguments{ +\item{x}{\code{sf} or \code{data.frame} Geodata.} +} +\value{ +\code{tibble} dData without any Shape column. +} +\description{ +Drop Shape column and convert to binary +} +\details{ +This function was written using the CDC EDAV Chatbot using the model GPT-5.2. 
+} +\keyword{internal} From 355844a4f614da16690a0132039be9f72e77a39f Mon Sep 17 00:00:00 2001 From: Mervin Keith Cuadera <40894971+mcuadera@users.noreply.github.com> Date: Mon, 30 Mar 2026 16:07:08 -0400 Subject: [PATCH 13/28] format R code --- R/dal.parquet.R | 211 ++++++++++++++++++++++++++---------------------- 1 file changed, 114 insertions(+), 97 deletions(-) diff --git a/R/dal.parquet.R b/R/dal.parquet.R index 34c03ac3..30ce57e5 100644 --- a/R/dal.parquet.R +++ b/R/dal.parquet.R @@ -13,8 +13,7 @@ #' raw_data <- get_all_polio_data() #' create_raw_data_parquet(raw_data, "C:/Users/ABC1/Desktop/raw_data_parquet") #' } -create_raw_data_parquet <- function(raw_data, path){ - +create_raw_data_parquet <- function(raw_data, path) { df_names <- names(raw_data) old_threads <- getOption("arrow.use_threads") @@ -30,29 +29,35 @@ create_raw_data_parquet <- function(raw_data, path){ for (i in df_names) { cli::cli_alert_info(paste0("Now processing: ", i)) - data <- - - if (i %in% c("global.ctry", "global.prov", "global.dist")) { - raw_data[[i]] |> - to_wkb_drop_sf() |> - arrow::write_dataset(path = file.path(path, i), - partitioning = get_partition_cols(i)) - - } else if (i %in% c("cities", "roads")) { - raw_data[[i]] |> - to_wkb_drop_sf() |> - arrow::write_dataset(path = file.path(path, i), - partitioning = get_partition_cols(i)) - - } else if (i == "metadata") { - raw_data[[i]] |> - dplyr::as_tibble() |> - arrow::write_dataset(path = file.path(path, i), - partitioning = get_partition_cols(i)) - } else { - raw_data[[i]] |> arrow::write_dataset(path = file.path(path, i), - partitioning = get_partition_cols(i)) - } + data <- + if (i %in% c("global.ctry", "global.prov", "global.dist")) { + raw_data[[i]] |> + to_wkb_drop_sf() |> + arrow::write_dataset( + path = file.path(path, i), + partitioning = get_partition_cols(i) + ) + } else if (i %in% c("cities", "roads")) { + raw_data[[i]] |> + to_wkb_drop_sf() |> + arrow::write_dataset( + path = file.path(path, i), + partitioning = 
get_partition_cols(i) + ) + } else if (i == "metadata") { + raw_data[[i]] |> + dplyr::as_tibble() |> + arrow::write_dataset( + path = file.path(path, i), + partitioning = get_partition_cols(i) + ) + } else { + raw_data[[i]] |> + arrow::write_dataset( + path = file.path(path, i), + partitioning = get_partition_cols(i) + ) + } cli::cli_alert_info(paste0(iter, "/", length(df_names), " processed.")) iter <- iter + 1 @@ -72,7 +77,7 @@ create_raw_data_parquet <- function(raw_data, path){ #' @param container `azcontainer` An instance of an Azure container to connect #' to. Pass [get_azure_storage_connection()] using defaults if not accessing #' using a service principal. -#' +#' #' @details #' For tibbles with Shapes, pass to [from_wkb_to_sf()] first before creating maps. #' @@ -89,8 +94,11 @@ create_raw_data_parquet <- function(raw_data, path){ #' # Build raw_data from EDAV #' raw_data <- build_parquet_raw_data() #' } -build_parquet_raw_data <- function(path = NULL, from_edav = F, container = NULL) { - +build_parquet_raw_data <- function( + path = NULL, + from_edav = F, + container = NULL +) { if (from_edav) { # Default values if (is.null(path)) { @@ -130,21 +138,18 @@ upload_parquet_to_edav <- function(src, dest, container = NULL) { container <- get_azure_storage_connection() } - while (TRUE) { - cli::cli_alert_info(paste0("Confirm upload to: ", dest, "/", basename(src), " (y/n)")) - response <- stringr::str_to_lower(stringr::str_trim(readline("> "))) - if (!response %in% c("y", "n")) { - cli::cli_alert_warning("Invalid response. Try again.") - } else if (response == "n") { - cli::cli_alert("Upload cancelled.") - } else if (response == "y") { - break - } + dir_exists <- edav_io("exists.dir", NULL, dest) + if (!dir_exists) { + cli::cli_abort("Folder doesn't exist on EDAV. 
Unable to upload") } cli::cli_process_start("Uploading parquet folder to EDAV") - AzureStor::multiupload_adls_file(container, paste0(src, "/*"), file.path(dest, basename(src)), - recursive = TRUE) + AzureStor::multiupload_adls_file( + container, + paste0(src, "/*"), + file.path(dest, basename(src)), + recursive = TRUE + ) cli::cli_process_done() } @@ -156,9 +161,7 @@ upload_parquet_to_edav <- function(src, dest, container = NULL) { #' #' @export from_wkb_to_sf <- function(sf_data) { - - - # Ensure that global shapefiles have Shape and city/roads as geometry. + # Ensure that global shapefiles have Shape and city/roads as geometry. # Otherwise, need to modify this function. if ("Shape" %in% names(sf_data)) { wkb_col <- "Shape" @@ -172,9 +175,8 @@ from_wkb_to_sf <- function(sf_data) { dplyr::mutate(dplyr::across(dplyr::any_of(wkb_col), \(x) { sf::st_as_sf(x, EWKB = TRUE, crs = 4326) })) - - return(sf_data) + return(sf_data) } # Private functions ---- @@ -191,32 +193,33 @@ from_wkb_to_sf <- function(sf_data) { #' get_partition_cols("afp") #' } get_partition_cols <- function(name) { - switch(name, - "afp" = "yronset", - "afp.dupe" = "yronset", - "afp.epi" = "yronset", - "para.case" = "yronset", - "es" = "collect.yr", - "es.dupe" = "collect.yr", - "sia" = "yr.sia", - "sia.dupe" = "yr.sia", - "pos" = "yronset", - "pos.dupe" = "yronset", - "other" = "yronset", - "other.dupe" = "yronset", - "dist.pop" = "year", - "prov.pop" = "year", - "ctry.pop" = "year", - "global.ctry" = "WHO_REGION", - "global.prov" = "WHO_REGION", - "global.dist" = "WHO_REGION", - "ctry.coverage" = "year", - "prov.coverage" = "year", - "dist.coverage" = "year", - "roads" = "continent", - "cities" = "POP_CLASS", - "metadata" = "download_time" - ) + switch( + name, + "afp" = "yronset", + "afp.dupe" = "yronset", + "afp.epi" = "yronset", + "para.case" = "yronset", + "es" = "collect.yr", + "es.dupe" = "collect.yr", + "sia" = "yr.sia", + "sia.dupe" = "yr.sia", + "pos" = "yronset", + "pos.dupe" = "yronset", + 
"other" = "yronset", + "other.dupe" = "yronset", + "dist.pop" = "year", + "prov.pop" = "year", + "ctry.pop" = "year", + "global.ctry" = "WHO_REGION", + "global.prov" = "WHO_REGION", + "global.dist" = "WHO_REGION", + "ctry.coverage" = "year", + "prov.coverage" = "year", + "dist.coverage" = "year", + "roads" = "continent", + "cities" = "POP_CLASS", + "metadata" = "download_time" + ) } #' Build raw_data using local parquet files @@ -228,18 +231,36 @@ get_partition_cols <- function(name) { #' @keywords internal #' build_parquet_raw_data_local <- function(path = NULL) { - if (!dir.exists(path)) { cli::cli_abort("Not a valid directory.") } - valid_values <- c("afp", "afp.dupe", "afp.epi", "para.case", "es", "es.dupe", - "sia", "sia.dupe", "pos", "pos.dupe", "other", "other.dupe", - "dist.pop", "prov.pop", "ctry.pop", "global.ctry", - "global.prov", "global.dist", - "ctry.coverage", "prov.coverage", "dist.coverage", - "roads" , "cities", "metadata" - ) + valid_values <- c( + "afp", + "afp.dupe", + "afp.epi", + "para.case", + "es", + "es.dupe", + "sia", + "sia.dupe", + "pos", + "pos.dupe", + "other", + "other.dupe", + "dist.pop", + "prov.pop", + "ctry.pop", + "global.ctry", + "global.prov", + "global.dist", + "ctry.coverage", + "prov.coverage", + "dist.coverage", + "roads", + "cities", + "metadata" + ) data <- list.files(path) data <- intersect(data, valid_values) @@ -249,7 +270,6 @@ build_parquet_raw_data_local <- function(path = NULL) { } return(raw_data) - } #' Build raw_data using EDAV files @@ -265,14 +285,12 @@ build_parquet_raw_data_local <- function(path = NULL) { #' @keywords internal #' build_parquet_raw_data_edav <- function(path = NULL, container = NULL, ...) 
{ - if (is.null(container)) { container <- get_azure_storage_connection() } - exist <- edav_io("exists.dir", NULL, - file_loc = path, azcontainer = container) - + exist <- edav_io("exists.dir", NULL, file_loc = path, azcontainer = container) + if (!exist) { cli::cli_abort("The directory does not exist on EDAV.") } else { @@ -284,32 +302,31 @@ build_parquet_raw_data_edav <- function(path = NULL, container = NULL, ...) { raw_data <- NULL local_pq <- file.path(rappdirs::user_data_dir("sirfunctions"), basename(path)) - AzureStor::storage_multidownload(container, - src = paste0(path, "/*"), - dest = local_pq, - recursive = TRUE, - overwrite = TRUE - ) + AzureStor::storage_multidownload( + container, + src = paste0(path, "/*"), + dest = local_pq, + recursive = TRUE, + overwrite = TRUE + ) raw_data <- build_parquet_raw_data_local(local_pq) cli::cli_process_done() return(raw_data) - } #' Drop Shape column and convert to binary #' #' @param x `sf` or `data.frame` Geodata. -#' +#' #' @details #' This function was written using the CDC EDAV Chatbot using the model GPT-5.2. #' @returns `tibble` dData without any Shape column. 
#' #' @keywords internal -#' +#' to_wkb_drop_sf <- function(sf_data) { - if ("Shape" %in% names(sf_data)) { geom_col <- "Shape" } else if ("geometry" %in% names(sf_data)) { @@ -323,15 +340,15 @@ to_wkb_drop_sf <- function(sf_data) { sf::st_geometry(sf_data) } else { sf_data[[geom_col]] - } + } # Convert to WKB (list of raw vectors), then drop the "WKB" class wkb <- sf::st_as_binary(geom) - wkb <- unclass(wkb) # <- key line: makes it a plain list Arrow can infer + wkb <- unclass(wkb) # <- key line: makes it a plain list Arrow can infer sf_data[[geom_col]] <- wkb if (inherits(sf_data, "sf")) { - sf_data <- sf::st_drop_geometry(sf_data) + sf_data <- sf::st_drop_geometry(sf_data) } return(sf_data) } \ No newline at end of file From a8eed2220b23c8847210ffa5620f4f58de46f846 Mon Sep 17 00:00:00 2001 From: Mervin Keith Cuadera <40894971+mcuadera@users.noreply.github.com> Date: Mon, 30 Mar 2026 16:08:44 -0400 Subject: [PATCH 14/28] don't default to a file path --- R/dal.parquet.R | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/R/dal.parquet.R b/R/dal.parquet.R index 30ce57e5..a12a068f 100644 --- a/R/dal.parquet.R +++ b/R/dal.parquet.R @@ -102,7 +102,7 @@ build_parquet_raw_data <- function( if (from_edav) { # Default values if (is.null(path)) { - path <- "GID/PEB/SIR/Sandbox/parquet_sandbox" + cli::cli_abort("Please pass a file path to the parquet folder") } if (is.null(container)) { container <- get_azure_storage_connection() From 79e669e78883d3b670f937be36fdfb77c4246de5 Mon Sep 17 00:00:00 2001 From: Mervin Keith Cuadera <40894971+mcuadera@users.noreply.github.com> Date: Mon, 30 Mar 2026 16:30:53 -0400 Subject: [PATCH 15/28] provide ability to extract specific datasets from raw data --- R/dal.parquet.R | 34 ++++++++++++++++++++++------- man/build_parquet_raw_data.Rd | 7 +++++- man/build_parquet_raw_data_edav.Rd | 2 +- man/build_parquet_raw_data_local.Rd | 2 +- 4 files changed, 34 insertions(+), 11 deletions(-) diff --git a/R/dal.parquet.R 
b/R/dal.parquet.R index a12a068f..d00bd89e 100644 --- a/R/dal.parquet.R +++ b/R/dal.parquet.R @@ -97,6 +97,7 @@ create_raw_data_parquet <- function(raw_data, path) { build_parquet_raw_data <- function( path = NULL, from_edav = F, + dataset = "all", container = NULL ) { if (from_edav) { @@ -108,9 +109,9 @@ build_parquet_raw_data <- function( container <- get_azure_storage_connection() } - raw_data <- build_parquet_raw_data_edav(path, container) + raw_data <- build_parquet_raw_data_edav(path, dataset, container) } else { - raw_data <- build_parquet_raw_data_local(path) + raw_data <- build_parquet_raw_data_local(path, dataset) } return(raw_data) @@ -225,12 +226,13 @@ get_partition_cols <- function(name) { #' Build raw_data using local parquet files #' #' @param path `str` A path to the parquet directory +#' @param dataset `str` A specific dataset. Defaults to `"all"`. Otherwise, can specify any dataset in the list returned by [get_all_polio_data()]. #' #' @returns `list` A list containing connections to the folders associated with #' individual datasets in the original output of [get_all_polio_data()]. #' @keywords internal #' -build_parquet_raw_data_local <- function(path = NULL) { +build_parquet_raw_data_local <- function(path = NULL, dataset = "all") { if (!dir.exists(path)) { cli::cli_abort("Not a valid directory.") } @@ -262,7 +264,11 @@ build_parquet_raw_data_local <- function(path = NULL) { "metadata" ) data <- list.files(path) - data <- intersect(data, valid_values) + if (dataset == "all") { + data <- intersect(data, valid_values) + } else { + data <- intersect(data, dataset) + } raw_data <- list() for (i in data) { @@ -276,6 +282,7 @@ build_parquet_raw_data_local <- function(path = NULL) { #' #' @param path `str` Path to EDAV folder containing parquet files. This must #' be the absolute file path from the Blob endpoint of the container. +#' @param dataset `str` A specific dataset. Defaults to `"all"`. 
Otherwise, can specify any dataset in the list returned by [get_all_polio_data()]. #' @param container `azcontainer` An instance of an Azure container to connect #' to. Pass [get_azure_storage_connection()] using defaults if not accessing #' using a service principal. @@ -284,7 +291,7 @@ build_parquet_raw_data_local <- function(path = NULL) { #' individual datasets in the original output of [get_all_polio_data()]. #' @keywords internal #' -build_parquet_raw_data_edav <- function(path = NULL, container = NULL, ...) { +build_parquet_raw_data_edav <- function(path = NULL, dataset = "all", container = NULL) { if (is.null(container)) { container <- get_azure_storage_connection() } @@ -301,19 +308,30 @@ build_parquet_raw_data_edav <- function(path = NULL, container = NULL, ...) { raw_data <- NULL + if (dataset == "all") { + source_path <- paste0(path, "raw_data_parquet/*") + } else { + source_path <- paste0(file.path(path, "raw_data_parquet", dataset), "/*") + } + local_pq <- file.path(rappdirs::user_data_dir("sirfunctions"), basename(path)) AzureStor::storage_multidownload( container, - src = paste0(path, "/*"), + src = source_path, dest = local_pq, recursive = TRUE, overwrite = TRUE ) - raw_data <- build_parquet_raw_data_local(local_pq) + raw_data <- build_parquet_raw_data_local(local_pq, dataset) cli::cli_process_done() + + if (length(raw_data) == 1) { + return(raw_data[[1]]) + } else { + return(raw_data) + } - return(raw_data) } #' Drop Shape column and convert to binary diff --git a/man/build_parquet_raw_data.Rd b/man/build_parquet_raw_data.Rd index 3390b92a..6f5de4cb 100644 --- a/man/build_parquet_raw_data.Rd +++ b/man/build_parquet_raw_data.Rd @@ -4,7 +4,12 @@ \alias{build_parquet_raw_data} \title{Recreate raw data from local parquet folder} \usage{ -build_parquet_raw_data(path = NULL, from_edav = F, container = NULL) +build_parquet_raw_data( + path = NULL, + from_edav = F, + dataset = "all", + container = NULL +) } \arguments{ \item{path}{\code{str} Local path to 
the parquet folder} diff --git a/man/build_parquet_raw_data_edav.Rd b/man/build_parquet_raw_data_edav.Rd index a8af4023..a61f0a83 100644 --- a/man/build_parquet_raw_data_edav.Rd +++ b/man/build_parquet_raw_data_edav.Rd @@ -4,7 +4,7 @@ \alias{build_parquet_raw_data_edav} \title{Build raw_data using EDAV files} \usage{ -build_parquet_raw_data_edav(path = NULL, container = NULL, ...) +build_parquet_raw_data_edav(path = NULL, container = NULL, dataset = "all") } \arguments{ \item{path}{\code{str} Path to EDAV folder containing parquet files. This must diff --git a/man/build_parquet_raw_data_local.Rd b/man/build_parquet_raw_data_local.Rd index d6b7aba5..fd6e95df 100644 --- a/man/build_parquet_raw_data_local.Rd +++ b/man/build_parquet_raw_data_local.Rd @@ -4,7 +4,7 @@ \alias{build_parquet_raw_data_local} \title{Build raw_data using local parquet files} \usage{ -build_parquet_raw_data_local(path = NULL) +build_parquet_raw_data_local(path = NULL, dataset = "all") } \arguments{ \item{path}{\code{str} A path to the parquet directory} From 1a330be2ec6de6774b25cc721d2b83e673917ec6 Mon Sep 17 00:00:00 2001 From: Mervin Keith Cuadera <40894971+mcuadera@users.noreply.github.com> Date: Tue, 31 Mar 2026 10:28:54 -0400 Subject: [PATCH 16/28] add logic for loading specific datasets --- R/dal.parquet.R | 150 ++++++++++++++++++++++++---- man/build_parquet_raw_data_edav.Rd | 4 +- man/build_parquet_raw_data_local.Rd | 2 + 3 files changed, 134 insertions(+), 22 deletions(-) diff --git a/R/dal.parquet.R b/R/dal.parquet.R index d00bd89e..72d9f6d6 100644 --- a/R/dal.parquet.R +++ b/R/dal.parquet.R @@ -226,7 +226,7 @@ get_partition_cols <- function(name) { #' Build raw_data using local parquet files #' #' @param path `str` A path to the parquet directory -#' @param dataset `str` A specific dataset. Defaults to `"all"`. Otherwise, can specify any dataset in the list returned by [get_all_polio_data()]. +#' @param dataset `str` A specific dataset. Defaults to `"all"`. 
Otherwise, can specify any dataset in the list returned by [get_all_polio_data()]. #' #' @returns `list` A list containing connections to the folders associated with #' individual datasets in the original output of [get_all_polio_data()]. @@ -264,15 +264,52 @@ build_parquet_raw_data_local <- function(path = NULL, dataset = "all") { "metadata" ) data <- list.files(path) - if (dataset == "all") { - data <- intersect(data, valid_values) - } else { - data <- intersect(data, dataset) - } - raw_data <- list() - for (i in data) { - raw_data[[i]] <- arrow::open_dataset(file.path(path, i)) + if (length(dataset) == 1 && dataset == "all") { + raw_data <- list() + + for (i in valid_values) { + + tryCatch({ + raw_data[[i]] <- arrow::open_dataset(file.path(path, i)) + }, error = \(e) { + cli::cli_alert_info(paste0("Dataset not found and won't be added: ", i)) + raw_data[[i]] <- NULL + }) + + } + } else if (length(dataset) > 1) { + invalid <- setdiff(dataset, valid_values) + + if (length(invalid) > 0) { + cli::cli_alert_info("The following type passed are invalid and won't be loaded: ") + cli::cli_li(invalid) + } + + valid <- dataset[!dataset %in% invalid] + + if (length(valid) == 0) { + cli::cli_abort("All the dataset passed are invalid.") + } + + has_all <- sum(stringr::str_detect(valid, "all")) + + if (has_all >= 1) { + cli::cli_abort("Please pass only 'all'.") + } + + raw_data <- list() + + for (i in valid) { + tryCatch({ + raw_data[[i]] <- arrow::open_dataset(file.path(path, i)) + }, error = \(e) { + cli::cli_alert_info(paste0("Dataset not found and won't be added: ", i)) + raw_data[[i]] <- NULL + }) + } + } else if (length(dataset) == 1 && dataset %in% valid_values) { + raw_data <- arrow::open_dataset(file.path(path, dataset)) } return(raw_data) @@ -296,6 +333,33 @@ build_parquet_raw_data_edav <- function(path = NULL, dataset = "all", container container <- get_azure_storage_connection() } + valid_values <- c( + "afp", + "afp.dupe", + "afp.epi", + "para.case", + "es", + 
"es.dupe", + "sia", + "sia.dupe", + "pos", + "pos.dupe", + "other", + "other.dupe", + "dist.pop", + "prov.pop", + "ctry.pop", + "global.ctry", + "global.prov", + "global.dist", + "ctry.coverage", + "prov.coverage", + "dist.coverage", + "roads", + "cities", + "metadata" + ) + exist <- edav_io("exists.dir", NULL, file_loc = path, azcontainer = container) if (!exist) { @@ -308,29 +372,73 @@ build_parquet_raw_data_edav <- function(path = NULL, dataset = "all", container raw_data <- NULL - if (dataset == "all") { - source_path <- paste0(path, "raw_data_parquet/*") - } else { + if (length(dataset) == 1 && dataset == "all") { + source_path <- file.path(path, "raw_data_parquet/*") + local_pq <- file.path(rappdirs::user_data_dir("sirfunctions"), "raw_data_parquet") + } else if (length(dataset) > 1) { + + invalid <- setdiff(dataset, valid_values) + + if (length(invalid) > 0) { + cli::cli_alert_info( + "The following type passed are invalid and won't be loaded: " + ) + cli::cli_li(invalid) + } + + valid <- dataset[!dataset %in% invalid] + + if (length(valid) == 0) { + cli::cli_abort("All the dataset passed are invalid.") + } + + has_all <- sum(stringr::str_detect(valid, "all")) + + if (has_all >= 1) { + cli::cli_abort("Please pass only 'all'.") + } + + source_path <- paste0(file.path(path, "raw_data_parquet"), "/", valid, "/*") + local_pq <- paste0(file.path(rappdirs::user_data_dir("sirfunctions"), "raw_data_parquet"), "/", valid) + } else if (length(dataset) == 1 && dataset %in% valid_values) { source_path <- paste0(file.path(path, "raw_data_parquet", dataset), "/*") + local_pq <- paste0(file.path(rappdirs::user_data_dir("sirfunctions"), "raw_data_parquet"), "/", dataset) } - local_pq <- file.path(rappdirs::user_data_dir("sirfunctions"), basename(path)) - AzureStor::storage_multidownload( + for (i in local_pq) { + + + unlink(i, recursive = TRUE, force = TRUE) + dir.create(i, recursive = TRUE) + + + } + + if (length(source_path) > 1) { + for (i in length(source_path)) { + + 
AzureStor::storage_multidownload( + container, + src = source_path[i], + dest = local_pq[i], + recursive = TRUE, + overwrite = TRUE + ) + } + } else { + AzureStor::storage_multidownload( container, src = source_path, dest = local_pq, recursive = TRUE, overwrite = TRUE ) + } - raw_data <- build_parquet_raw_data_local(local_pq, dataset) + raw_data <- build_parquet_raw_data_local(file.path(rappdirs::user_data_dir("sirfunctions"), "raw_data_parquet"), dataset) cli::cli_process_done() - - if (length(raw_data) == 1) { - return(raw_data[[1]]) - } else { - return(raw_data) - } + + return(raw_data) } diff --git a/man/build_parquet_raw_data_edav.Rd b/man/build_parquet_raw_data_edav.Rd index a61f0a83..849f2ea6 100644 --- a/man/build_parquet_raw_data_edav.Rd +++ b/man/build_parquet_raw_data_edav.Rd @@ -4,12 +4,14 @@ \alias{build_parquet_raw_data_edav} \title{Build raw_data using EDAV files} \usage{ -build_parquet_raw_data_edav(path = NULL, container = NULL, dataset = "all") +build_parquet_raw_data_edav(path = NULL, dataset = "all", container = NULL) } \arguments{ \item{path}{\code{str} Path to EDAV folder containing parquet files. This must be the absolute file path from the Blob endpoint of the container.} +\item{dataset}{\code{str} A specific dataset. Defaults to \code{"all"}. Otherwise, can specify any dataset in the list returned by \code{\link[=get_all_polio_data]{get_all_polio_data()}}.} + \item{container}{\code{azcontainer} An instance of an Azure container to connect to. 
Pass \code{\link[=get_azure_storage_connection]{get_azure_storage_connection()}} using defaults if not accessing using a service principal.} diff --git a/man/build_parquet_raw_data_local.Rd b/man/build_parquet_raw_data_local.Rd index fd6e95df..f4480ca1 100644 --- a/man/build_parquet_raw_data_local.Rd +++ b/man/build_parquet_raw_data_local.Rd @@ -8,6 +8,8 @@ build_parquet_raw_data_local(path = NULL, dataset = "all") } \arguments{ \item{path}{\code{str} A path to the parquet directory} + +\item{dataset}{\code{str} A specific dataset. Defaults to \code{"all"}. Otherwise, can specify any dataset in the list returned by \code{\link[=get_all_polio_data]{get_all_polio_data()}}.} } \value{ \code{list} A list containing connections to the folders associated with From ee3e18b1ab02325c10ae39da895543544e71a5c6 Mon Sep 17 00:00:00 2001 From: Mervin Keith Cuadera <40894971+mcuadera@users.noreply.github.com> Date: Tue, 31 Mar 2026 11:16:28 -0400 Subject: [PATCH 17/28] fix issue with saving sf columns --- R/dal.parquet.R | 22 ++++++++++++---------- 1 file changed, 12 insertions(+), 10 deletions(-) diff --git a/R/dal.parquet.R b/R/dal.parquet.R index 72d9f6d6..5304473d 100644 --- a/R/dal.parquet.R +++ b/R/dal.parquet.R @@ -164,19 +164,23 @@ upload_parquet_to_edav <- function(src, dest, container = NULL) { from_wkb_to_sf <- function(sf_data) { # Ensure that global shapefiles have Shape and city/roads as geometry. # Otherwise, need to modify this function. 
+ + if (inherits(sf_data, "ArrowObject")) { + cli::cli_abort("Please run dplyr::collect() first prior to passing to the function.") + } + if ("Shape" %in% names(sf_data)) { - wkb_col <- "Shape" + sf_data <- sf_data |> + dplyr::mutate(Shape = sf::st_as_sfc(Shape, EWKB = TRUE, crs = 4326)) |> + sf::st_as_sf() } else if ("geometry" %in% names(sf_data)) { - wkb_col <- "geometry" + sf_data <- sf_data |> + dplyr::mutate(geometry = sf::st_as_sfc(geometry, EWKB = TRUE, crs = 4326)) |> + sf::st_as_sf() } else { cli::cli_abort("Not an sf dataset.") } - sf_data |> - dplyr::mutate(dplyr::across(dplyr::any_of(wkb_col), \(x) { - sf::st_as_sf(x, EWKB = TRUE, crs = 4326) - })) - return(sf_data) } @@ -473,8 +477,6 @@ to_wkb_drop_sf <- function(sf_data) { wkb <- unclass(wkb) # <- key line: makes it a plain list Arrow can infer sf_data[[geom_col]] <- wkb - if (inherits(sf_data, "sf")) { - sf_data <- sf::st_drop_geometry(sf_data) - } + return(sf_data) } \ No newline at end of file From ade9e88339de4c513b3717c81af3178c27b92046 Mon Sep 17 00:00:00 2001 From: Mervin Keith Cuadera <40894971+mcuadera@users.noreply.github.com> Date: Tue, 31 Mar 2026 13:36:19 -0400 Subject: [PATCH 18/28] add example of use from_wkb_to_sf --- R/dal.parquet.R | 8 ++++++++ man/from_wkb_to_sf.Rd | 9 +++++++++ 2 files changed, 17 insertions(+) diff --git a/R/dal.parquet.R b/R/dal.parquet.R index 5304473d..7f04aeb2 100644 --- a/R/dal.parquet.R +++ b/R/dal.parquet.R @@ -161,6 +161,14 @@ upload_parquet_to_edav <- function(src, dest, container = NULL) { #' @returns `tibble` Geodata with `sf`. #' #' @export +#' @examples +#' \dontrun { +#' raw_data <- build_parquet_raw_data() +#' kenya_ctry_sf <- raw_data$global.ctry |> +#' dplyr::filter(ctry == "KENYA") |> +#' dplyr::collect() |> +#' from_wkb_to_sf() +#' } from_wkb_to_sf <- function(sf_data) { # Ensure that global shapefiles have Shape and city/roads as geometry. # Otherwise, need to modify this function. 
diff --git a/man/from_wkb_to_sf.Rd b/man/from_wkb_to_sf.Rd index 33920411..e172bfdd 100644 --- a/man/from_wkb_to_sf.Rd +++ b/man/from_wkb_to_sf.Rd @@ -15,3 +15,12 @@ from_wkb_to_sf(sf_data) \description{ Convert WKB back to sf column } +\examples{ +\dontrun { +raw_data <- build_parquet_raw_data() +kenya_ctry_sf <- raw_data$global.ctry |> + dplyr::filter(ctry == "KENYA") |> + dplyr::collect() |> + from_wkb_to_sf() +} +} From 38a131f713db62011a984c5911d2c8fdf4668b50 Mon Sep 17 00:00:00 2001 From: Mervin Keith Cuadera <40894971+mcuadera@users.noreply.github.com> Date: Tue, 31 Mar 2026 15:48:53 -0400 Subject: [PATCH 19/28] add docs --- R/dal.parquet.R | 22 ++++++++-------------- man/build_parquet_raw_data.Rd | 12 ++++++------ man/from_wkb_to_sf.Rd | 2 +- 3 files changed, 15 insertions(+), 21 deletions(-) diff --git a/R/dal.parquet.R b/R/dal.parquet.R index 7f04aeb2..403e2828 100644 --- a/R/dal.parquet.R +++ b/R/dal.parquet.R @@ -72,9 +72,9 @@ create_raw_data_parquet <- function(raw_data, path) { #' Recreates an output of [get_all_polio_data()] from a folder housing #' data in parquet format. #' -#' @param path `str` Local path to the parquet folder -#' @param from_edav `bool` Build using local files or files in EDAV? -#' @param container `azcontainer` An instance of an Azure container to connect +#' @param path `str` Absolute path to the parquet folder. +#' @param from_edav `bool` Build using local files or files in EDAV? Defaults to TRUE. +#' @param container `azcontainer` An instance of an Azure container to connect. #' to. Pass [get_azure_storage_connection()] using defaults if not accessing #' using a service principal. 
#' @@ -95,19 +95,16 @@ create_raw_data_parquet <- function(raw_data, path) { #' raw_data <- build_parquet_raw_data() #' } build_parquet_raw_data <- function( - path = NULL, - from_edav = F, + path = "GID/PEB/SIR/Data/analytic", + from_edav = TRUE, dataset = "all", - container = NULL + container = get_azure_storage_connection() ) { if (from_edav) { # Default values if (is.null(path)) { cli::cli_abort("Please pass a file path to the parquet folder") } - if (is.null(container)) { - container <- get_azure_storage_connection() - } raw_data <- build_parquet_raw_data_edav(path, dataset, container) } else { @@ -162,7 +159,7 @@ upload_parquet_to_edav <- function(src, dest, container = NULL) { #' #' @export #' @examples -#' \dontrun { +#' \dontrun{ #' raw_data <- build_parquet_raw_data() #' kenya_ctry_sf <- raw_data$global.ctry |> #' dplyr::filter(ctry == "KENYA") |> @@ -340,10 +337,7 @@ build_parquet_raw_data_local <- function(path = NULL, dataset = "all") { #' individual datasets in the original output of [get_all_polio_data()]. 
#' @keywords internal #' -build_parquet_raw_data_edav <- function(path = NULL, dataset = "all", container = NULL) { - if (is.null(container)) { - container <- get_azure_storage_connection() - } +build_parquet_raw_data_edav <- function(path = NULL, dataset = "all", container = get_azure_storage_connection()) { valid_values <- c( "afp", diff --git a/man/build_parquet_raw_data.Rd b/man/build_parquet_raw_data.Rd index 6f5de4cb..96c6240e 100644 --- a/man/build_parquet_raw_data.Rd +++ b/man/build_parquet_raw_data.Rd @@ -5,18 +5,18 @@ \title{Recreate raw data from local parquet folder} \usage{ build_parquet_raw_data( - path = NULL, - from_edav = F, + path = "GID/PEB/SIR/Data/analytic", + from_edav = TRUE, dataset = "all", - container = NULL + container = get_azure_storage_connection() ) } \arguments{ -\item{path}{\code{str} Local path to the parquet folder} +\item{path}{\code{str} Absolute path to the parquet folder.} -\item{from_edav}{\code{bool} Build using local files or files in EDAV?} +\item{from_edav}{\code{bool} Build using local files or files in EDAV? Defaults to TRUE.} -\item{container}{\code{azcontainer} An instance of an Azure container to connect +\item{container}{\code{azcontainer} An instance of an Azure container to connect. to. 
Pass \code{\link[=get_azure_storage_connection]{get_azure_storage_connection()}} using defaults if not accessing using a service principal.} } diff --git a/man/from_wkb_to_sf.Rd b/man/from_wkb_to_sf.Rd index e172bfdd..e1623f3b 100644 --- a/man/from_wkb_to_sf.Rd +++ b/man/from_wkb_to_sf.Rd @@ -16,7 +16,7 @@ from_wkb_to_sf(sf_data) Convert WKB back to sf column } \examples{ -\dontrun { +\dontrun{ raw_data <- build_parquet_raw_data() kenya_ctry_sf <- raw_data$global.ctry |> dplyr::filter(ctry == "KENYA") |> From 5ef7953de11786f33ed391bfd801bf73a0a73e46 Mon Sep 17 00:00:00 2001 From: Mervin Keith Cuadera <40894971+mcuadera@users.noreply.github.com> Date: Tue, 31 Mar 2026 15:58:25 -0400 Subject: [PATCH 20/28] simplify parameters and functions --- R/dal.parquet.R | 7 ++----- man/build_parquet_raw_data_edav.Rd | 6 +++++- 2 files changed, 7 insertions(+), 6 deletions(-) diff --git a/R/dal.parquet.R b/R/dal.parquet.R index 403e2828..f9078808 100644 --- a/R/dal.parquet.R +++ b/R/dal.parquet.R @@ -96,8 +96,8 @@ create_raw_data_parquet <- function(raw_data, path) { #' } build_parquet_raw_data <- function( path = "GID/PEB/SIR/Data/analytic", - from_edav = TRUE, dataset = "all", + from_edav = TRUE, container = get_azure_storage_connection() ) { if (from_edav) { @@ -131,10 +131,7 @@ build_parquet_raw_data <- function( #' edav_dir <- "ABC/parquet_folder" #' upload_parquet_to_edav(local_dir, edav_dir) #' } -upload_parquet_to_edav <- function(src, dest, container = NULL) { - if (is.null(container)) { - container <- get_azure_storage_connection() - } +upload_parquet_to_edav <- function(src, dest, container = get_azure_storage_connection()) { dir_exists <- edav_io("exists.dir", NULL, dest) if (!dir_exists) { diff --git a/man/build_parquet_raw_data_edav.Rd b/man/build_parquet_raw_data_edav.Rd index 849f2ea6..3ed377ff 100644 --- a/man/build_parquet_raw_data_edav.Rd +++ b/man/build_parquet_raw_data_edav.Rd @@ -4,7 +4,11 @@ \alias{build_parquet_raw_data_edav} \title{Build raw_data using 
EDAV files} \usage{ -build_parquet_raw_data_edav(path = NULL, dataset = "all", container = NULL) +build_parquet_raw_data_edav( + path = NULL, + dataset = "all", + container = get_azure_storage_connection() +) } \arguments{ \item{path}{\code{str} Path to EDAV folder containing parquet files. This must From 8b9b3511826d3a140595e00c7b3fbfca6cd66949 Mon Sep 17 00:00:00 2001 From: Mervin Keith Cuadera <40894971+mcuadera@users.noreply.github.com> Date: Tue, 31 Mar 2026 16:05:13 -0400 Subject: [PATCH 21/28] move get_all_polio_data on its own R script --- R/dal.R | 949 ----------------------------------------- R/get_all_polio_data.R | 947 ++++++++++++++++++++++++++++++++++++++++ 2 files changed, 947 insertions(+), 949 deletions(-) create mode 100644 R/get_all_polio_data.R diff --git a/R/dal.R b/R/dal.R index 2ec10532..daa7aa25 100644 --- a/R/dal.R +++ b/R/dal.R @@ -897,955 +897,6 @@ normalize_format <- function(fmt) { #### 2) Key data pull functions #### - -#' Retrieve all pre-processed polio data -#' -#' @description Download POLIS data from the CDC pre-processed endpoint. By default -#' this function will return a "small" or recent dataset. This is primarily for data -#' that is from the past six years. You can specify a "medium" sized dataset for data -#' that is from 2016 onwards. Finally the "large" sized dataset will provide information -#' from 2000 onwards. Regular pulls form the data will recreate the "small" dataset -#' when new information is available and the Data Management Team can force the -#' creation of the "medium" and "large" static datasets as necessary. -#' -#' @param size `str` Size of data to download. Defaults to `"small"`. -#' - `"small"`: Data from the last six years. -#' - `"medium"`: Data from 2016-present. -#' - `"large"`: Data from 2000-present. -#' @param data_folder `str` Location of the data folder containing pre-processed POLIS data, -#' spatial files, coverage data, and population data. Defaults to `"GID/PEB/SIR/Data"`. 
-#' @param polis_folder `str` Location of the POLIS folder. Defaults to `"GID/PEB/SIR/POLIS"`. -#' @param core_ready_folder `str` Which core ready folder to use. Defaults to `"Core_Ready_Files"`. -#' @param force.new.run `logical` Default `FALSE`, if `TRUE` will run recent data and cache. -#' @param recreate.static.files `logical` Default `FALSE`, if `TRUE` will run all data and cache. -#' @param attach.spatial.data `logical` Default `TRUE`, adds spatial data to downloaded object. -#' @param use_edav `logical` Build raw data list using EDAV files. Defaults to `TRUE`. -#' @param archive Logical. Whether to archive previous output directories -#' before overwriting. Default is `TRUE`. -#' @param keep_n_archives Numeric. Number of archive folders to retain. -#' Defaults to `Inf`, which keeps all archives. Set to a finite number -#' (e.g., 3) to automatically delete older archives beyond the N most recent. -#' @param output_format str: output_format to save files as. -#' Available formats include 'rds' and 'qs2'. Defaults is 'rds'. -#' @param local_caching `logical` Enable local caching so data is stored locally and -#' only downloaded when there is updated data from EDAV. -#' @param use_archived_data `logical` Allows the ability to recreate the raw data file using previous -#' preprocessed data. If -#' @returns Named `list` containing polio data that is relevant to CDC. 
-#' @examples -#' \dontrun{ -#' raw.data <- get_all_polio_data() # downloads data for last 6 years, including spatial files -#' raw.data <- get_all_polio_data(size = "small", attach.spatial.data = FALSE) # exclude spatial data -#' } -#' -#' @export -get_all_polio_data <- function( - size = "small", - data_folder = "GID/PEB/SIR/Data", - polis_folder = "GID/PEB/SIR/POLIS", - core_ready_folder = "Core_Ready_Files", - force.new.run = FALSE, - recreate.static.files = FALSE, - attach.spatial.data = TRUE, - use_edav = TRUE, - use_archived_data = FALSE, - archive = TRUE, - keep_n_archives = Inf, - output_format = "rds", - local_caching = TRUE) { - - # check to see that size parameter is appropriate - if (!size %in% c("small", "medium", "large")) { - stop("The parameter 'size' must be either 'small', 'medium', or 'large'") - } - - # Check output format - if (!output_format %in% c("rds", "qs2")) { - stop("Only rds and qs2 is supported at this time.") - } - -# normalize and validate both output formats -output_format <- normalize_format(output_format) - -# Fail safe in instances where EDAV connection fails -if (use_edav) { - verify_edav <- tryCatch( - { - invisible(capture.output(test_EDAV_connection())) - cli::cli_alert_success("Connect to EDAV successful.") - TRUE - }, - error = \(e) { - cli::cli_alert_info("Connection to EDAV unsuccessful.") - FALSE - } - ) - - if (!verify_edav) { - cli::cli_alert_info("Unable to obtain data from EDAV. Loading from local cache instead.") - cli::cli_alert_info("NOTE: Data may be stale. 
Please review the global polio dataset metadata for information on when the data was last processed.") - raw.data <- force_load_polio_data_cache(attach.spatial.data, output_format) - return(raw.data) - } -} - -# Constant variables -# Each file comes out of these folders -analytic_folder <- file.path(data_folder, "analytic") -polis_data_folder <- file.path(data_folder, "polis") -spatial_folder <- file.path(data_folder, "spatial") -coverage_folder <- file.path(data_folder, "coverage") -pop_folder <- file.path(data_folder, "pop") - -# Year cutoffs for the different datasets -current_year <- lubridate::year(Sys.Date()) -small_year <- current_year - 5 -med_year <- 2016 #hardcode to 2016 because it's an important point in time - -# Required files -raw_data_recent_name <- paste0("raw.data.recent", output_format) -raw_data_medium_name <- paste0("raw.data.", med_year, ".", small_year - 1, output_format) -raw_data_2000_name <- paste0("raw.data.2000.", med_year - 1, output_format) -spatial_data_name <- paste0("spatial.data", output_format) -global_ctry_sf_name <- "global.ctry.rds" -global_prov_sf_name <- "global.prov.rds" -global_dist_sf_name <- "global.dist.rds" - -# Perform check to build using the archived polis folder -if (use_archived_data) { - cli::cli_alert_info("Using archived data") - cli::cli_alert_info("NOTE: the metadata will be for the most recent pull") - polis_data_folder <- get_archived_polis_data( - data_folder, - use_edav, - keep_n_archives - ) - recreate.static.files <- TRUE -} - -# look to see if the recent raw data rds is in the analytic folder -prev_table <- sirfunctions_io("list", NULL, analytic_folder, - edav = use_edav -) - -if (nrow(prev_table) > 0) { - prev_table <- prev_table |> - dplyr::filter(grepl(raw_data_recent_name, name)) |> - dplyr::select("file" = "name", "size", "ctime" = "lastModified") -} else { - # if empty, make sure to recreate tibble to the right format - prev_table <- tibble( - "file" = NA, - "size" = NA, - "ctime" = NA - ) |> - 
dplyr::mutate(file = as.character(file), - size = as.double(size), - ctime = as_datetime(ctime)) |> - dplyr::filter(!is.na(file)) -} - -if (recreate.static.files | force.new.run) { - force.new.run <- T - create.cache <- T -} - - -if (!force.new.run) { - - # Check if using the local cache is sufficient - if (use_edav & size == "small" & local_caching) { - if (!recache_raw_data(analytic_folder, use_edav, output_format)) { - - raw.data <- sirfunctions_io("read", NULL, file.path(rappdirs::user_data_dir("sirfunctions"), - paste0("raw_data", output_format)), - edav = FALSE) - - cli::cli_process_start("Checking for duplicates in datasets.") - raw.data <- duplicate_check(raw.data) - cli::cli_process_done() - if (attach.spatial.data) { - if (!recache_spatial_data(analytic_folder, spatial_folder, - use_edav, output_format)) { - spatial.data <- sirfunctions_io("read", NULL, file.path(rappdirs::user_data_dir("sirfunctions"), - paste0("spatial_data", output_format)), - edav = FALSE) - raw.data$global.ctry <- spatial.data$global.ctry - raw.data$global.prov <- spatial.data$global.prov - raw.data$global.dist <- spatial.data$global.dist - raw.data$roads <- spatial.data$roads - raw.data$cities <- spatial.data$cities - - return(raw.data) - } else { - spatial.data <- sirfunctions_io("read", NULL, file.path(analytic_folder, spatial_data_name), - edav = use_edav) - sirfunctions_io("write", NULL, file.path(rappdirs::user_data_dir("sirfunctions"), - paste0("spatial_data", output_format)), - obj = spatial.data, - edav = FALSE) - edav_spatial_timestamp <- sirfunctions_io( - "read", - NULL, - file.path(analytic_folder, paste0("spatial_timestamp", output_format)), - edav = use_edav - ) - sirfunctions_io("write", NULL, file.path(rappdirs::user_data_dir("sirfunctions"), - paste0("spatial_timestamp", output_format)), - obj = edav_spatial_timestamp, - edav = FALSE) - - raw.data$global.ctry <- spatial.data$global.ctry - raw.data$global.prov <- spatial.data$global.prov - raw.data$global.dist <- 
spatial.data$global.dist - raw.data$roads <- spatial.data$roads - raw.data$cities <- spatial.data$cities - - return(raw.data) - } - } else { - return(raw.data) - } - } - } - - if (use_edav) { - cli::cli_alert_info(paste0("Downloading most recent active polio data from ", small_year," onwards")) - } else { - cli::cli_alert_info(paste0("Loading most recent active polio data from ", small_year," onwards")) - } - - raw.data.small.pull <- sirfunctions_io("read", NULL, prev_table$file, edav = use_edav) - - if (size == "small") { - raw.data <- raw.data.small.pull - } - - if (size == "medium") { - prev_table <- sirfunctions_io("list", NULL, analytic_folder, edav = use_edav) |> - dplyr::filter(grepl(raw_data_medium_name, name)) |> - dplyr::select("file" = "name", "size", "ctime" = "lastModified") - - if (use_edav) { - cli::cli_alert_info(paste0("Downloading static polio data from ", med_year, "-", small_year)) - } else { - cli::cli_alert_info(paste0("Loading static polio data from ", med_year, "-", small_year)) - } - - raw.data.medium.pull <- sirfunctions_io("read", NULL, prev_table$file, edav = use_edav) - - raw.data <- split_concat_raw_data( - action = "concat", - raw.data.small.pull = raw.data.small.pull, - raw.data.medium.pull = raw.data.medium.pull - ) - } - - if (size == "large") { - prev_table <- sirfunctions_io("list", NULL, analytic_folder, - edav = use_edav, full_names = TRUE - ) |> - dplyr::filter(grepl(raw_data_medium_name, name)) |> - dplyr::select("file" = "name", "size", "ctime" = "lastModified") - - if (use_edav) { - cli::cli_alert_info(paste0("Downloading static polio data from ", med_year, "-", small_year)) - } else { - cli::cli_alert_info(paste0("Loading static polio data from ", med_year, "-", small_year)) - } - - raw.data.medium.pull <- sirfunctions_io("read", NULL, prev_table$file, edav = use_edav) - - prev_table <- sirfunctions_io("list", NULL, analytic_folder, edav = use_edav) |> - dplyr::filter(grepl(raw_data_2000_name, name)) |> - 
dplyr::select("file" = "name", "size", "ctime" = "lastModified") - - if (use_edav) { - cli::cli_alert_info(paste0("Downloading static polio data from 2001-", med_year)) - } else { - cli::cli_alert_info(paste0("Loading static polio data from 2001-", med_year)) - } - - raw.data.large.pull <- sirfunctions_io("read", NULL, prev_table$file, edav = use_edav) - - raw.data <- split_concat_raw_data( - action = "concat", - raw.data.small.pull = raw.data.small.pull, - raw.data.medium.pull = raw.data.medium.pull, - raw.data.large.pull = raw.data.large.pull - ) - } - - # Only cache the small dataset, which we use in 90% of the case - if (use_edav & local_caching) { - raw_data_timestamp_exists <- invisible(sirfunctions_io( - "exists.file", - NULL, - file.path(analytic_folder, paste0("raw_data_timestamp", output_format)), - edav = use_edav - )) - - } else { - raw_data_timestamp_exists <- FALSE - } - if (size == "small" & raw_data_timestamp_exists & local_caching) { - cli::cli_process_start("Caching global polio data locally") - - if (!dir.exists(rappdirs::user_data_dir("sirfunctions"))) { - dir.create(rappdirs::user_data_dir("sirfunctions"), recursive = TRUE) - } - - sirfunctions_io("write", NULL, - file.path(rappdirs::user_data_dir("sirfunctions"), paste0("raw_data", output_format)), - obj = raw.data, - edav = FALSE) - # Add edav tag file to local cache dir - edav_raw_data_timestamp <- sirfunctions_io( - "read", - NULL, - file.path(analytic_folder, paste0("raw_data_timestamp", output_format)), - edav = use_edav - ) - - sirfunctions_io("write", NULL, - file.path(rappdirs::user_data_dir("sirfunctions"), paste0("raw_data_timestamp", output_format)), - obj = edav_raw_data_timestamp, - edav = FALSE) - - cli::cli_process_done() - } - - cli::cli_process_done() - - cli::cli_process_start("Checking for duplicates in datasets.") - raw.data <- duplicate_check(raw.data) - cli::cli_process_done() - - if (attach.spatial.data) { - - # Don't recache spatial if up to date - if 
(!recache_spatial_data(analytic_folder, spatial_folder, - use_edav, output_format) & local_caching) { - spatial.data <- sirfunctions_io("read", NULL, file.path(rappdirs::user_data_dir("sirfunctions"), - paste0("spatial_data", output_format)), - edav = FALSE) - raw.data$global.ctry <- spatial.data$global.ctry - raw.data$global.prov <- spatial.data$global.prov - raw.data$global.dist <- spatial.data$global.dist - raw.data$roads <- spatial.data$roads - raw.data$cities <- spatial.data$cities - - return(raw.data) - } - - if (use_edav) { - cli::cli_process_start("Downloading and attaching spatial data") - } else { - cli::cli_process_start("Loading and attaching spatial data") - } - - spatial.data <- sirfunctions_io("read", NULL, - file.path(analytic_folder, spatial_data_name), - edav = use_edav - ) - - raw.data$global.ctry <- spatial.data$global.ctry - raw.data$global.prov <- spatial.data$global.prov - raw.data$global.dist <- spatial.data$global.dist - raw.data$roads <- spatial.data$roads - raw.data$cities <- spatial.data$cities - - cli::cli_process_done() - - if (use_edav & local_caching) { - spatial_timestamp_exists <- sirfunctions_io( - "exists.file", - NULL, - file.path(analytic_folder, paste0("spatial_timestamp", output_format)), - edav = use_edav - ) - } else { - spatial_timestamp_exists <- FALSE - } - - if (recache_spatial_data(analytic_folder, spatial_folder, - use_edav, output_format) & spatial_timestamp_exists & local_caching) { - sirfunctions_io("write", - NULL, - file.path(rappdirs::user_data_dir("sirfunctions"), - paste0("spatial_data", - output_format)), - obj = spatial.data, - edav = FALSE) - - spatial_processed_tag <- sirfunctions_io("read", - NULL, - file.path(analytic_folder, - paste0("spatial_timestamp", output_format)), - edav = use_edav) - sirfunctions_io("write", - NULL, - file.path(rappdirs::user_data_dir("sirfunctions"), - paste0("spatial_timestamp", output_format)), - obj = spatial_processed_tag, - edav = FALSE) - } - } - - return(raw.data) - -} 
else { - - # Check that the required folders have data - for (folder in c(analytic_folder, polis_data_folder, spatial_folder, - coverage_folder, pop_folder)) { - - # get_all_polio_data will recreate the analytic folder if it's missing - switch(basename(folder), - "analytic" = { - if (!sirfunctions_io("exists.dir", NULL, folder, edav = use_edav)) { - cli::cli_alert_info("No analytics folder found. Will create a new one.") - sirfunctions_io("create.dir", NULL, folder, edav = use_edav) - } - }, - "polis" = { - if (!sirfunctions_io("exists.dir", NULL, folder, edav = use_edav)) { - cli::cli_alert_info("Creating polis folder in the data folder") - sirfunctions_io("create.dir", NULL, folder, edav = use_edav) - } else { - cli::cli_alert_info("Moving updated polis data to the data folder") - } - - - create_polis_data_folder( - data_folder, - polis_folder, - core_ready_folder, - use_edav, - archive, - keep_n_archives - ) - - }, - "spatial" = { - if (!sirfunctions_io("exists.dir", NULL, folder, edav = use_edav)) { - cli::cli_abort(paste0("No spatial data found in the data folder.", - " Ensure that the output folder when running ", - " tidypolis::process_spatial() is ", - spatial_folder), - ) - } - }, - "coverage" = { - if (!sirfunctions_io("exists.dir", NULL, folder, edav = use_edav)) { - cli::cli_abort(paste0("Coverage data not found.", - "Please create and add coverage data in: ", - folder)) - } - }, - "pop" = { - if (!sirfunctions_io("exists.dir", NULL, folder, edav = use_edav)) { - cli::cli_abort(paste0("Population data not found. ", - "Preprocessing of population files may be required. 
", - "Please create a pop data folder and add data in: ", - folder)) - } - } - ) - } - - if (use_edav) { - cli::cli_h1("Testing download times") - download_metrics <- test_EDAV_connection(return_list = T) - } - - # use the truncated AFP file - afp.trunc <- T - - if (recreate.static.files) { - afp.trunc <- F - } - - dl_table <- dplyr::bind_rows( - sirfunctions_io("list", NULL, polis_data_folder, edav = use_edav), - sirfunctions_io("list", NULL, spatial_folder, edav = use_edav), - sirfunctions_io("list", NULL, coverage_folder, edav = use_edav), - sirfunctions_io("list", NULL, pop_folder, edav = use_edav), - sirfunctions_io("list", NULL, polis_folder, edav = use_edav) |> - dplyr::filter(grepl("cache", name)) - ) |> - dplyr::filter(!is.na(size)) |> - dplyr::select("file" = "name", "size") - - if (use_edav) { - dl_table <- dl_table |> - dplyr::mutate( - "dl_time_sec" = size / download_metrics$size * download_metrics$d - ) - } - - if (afp.trunc) { - dl_table <- dl_table |> - dplyr::filter(!grepl("afp_linelist_2001", file)) - } else { - dl_table <- dl_table |> - dplyr::filter(!grepl("afp_linelist_2019", file)) - } - - file_size <- dl_table$size |> sum() - - if (use_edav) { - download_time <- dl_table$dl_time_sec |> sum() - } - - if (use_edav) { - cli::cli_h1("Downloading POLIS Data") - } else { - cli::cli_h1("Loading POLIS Data") - } - - raw.data <- list() - spatial.data <- list() - - # Check if spatial data needs to be redownloaded from the analytics folder - spatial_timestamp_exists <- sirfunctions_io( - "exists.file", - NULL, - file.path(analytic_folder, paste0("spatial_timestamp", output_format)), - edav = use_edav - ) - - if (spatial_timestamp_exists) { - # Check if it's recent or needs updating - edav_spatial_timestamp <- sirfunctions_io( - "read", - NULL, - file.path(analytic_folder, paste0("spatial_timestamp", output_format)), - edav = use_edav - ) |> - dplyr::select(name, lastModifiedEDAV = lastModified) - - edav_spatial_folder_info <- sirfunctions_io( - "list", 
- NULL, - file.path(spatial_folder), - edav = use_edav - ) |> - dplyr::select(name, lastModified) - - spatial_timestamp_comparison <- dplyr::left_join(edav_spatial_timestamp, - edav_spatial_folder_info) |> - dplyr::mutate(updated = ifelse(lastModifiedEDAV == lastModified, TRUE, FALSE)) |> - dplyr::pull(updated) |> sum(na.rm = TRUE) - } else { - - spatial_timestamp_comparison <- 0 - - } - - if (spatial_timestamp_comparison == 3) { - cli::cli_alert_success("Spatial data in the analytic folder is up to date. Loading from cache...") - spatial.data <- sirfunctions_io( - "read", - NULL, - file.path(analytic_folder, spatial_data_name), - edav = use_edav - ) - } else { - if (spatial_timestamp_exists) { - cli::cli_alert_warning("Spatial data in the analytic folder is outdated. Recreating from the spatial folder") - } else { - cli::cli_alert_warning("No spatial timestamp exists. Recreating from the spatial folder") - } - - cli::cli_process_start("1) Loading country shape files") - spatial.data$global.ctry <- load_clean_ctry_sp( - fp = file.path(spatial_folder, global_ctry_sf_name), - edav = use_edav - ) - cli::cli_process_done() - - cli::cli_process_start("2) Loading province shape files") - spatial.data$global.prov <- load_clean_prov_sp( - fp = file.path(spatial_folder, global_prov_sf_name), - edav = use_edav - ) - cli::cli_process_done() - - cli::cli_process_start("3) Loading district shape files") - spatial.data$global.dist <- load_clean_dist_sp( - fp = file.path(spatial_folder, global_dist_sf_name), - edav = use_edav - ) - cli::cli_process_done() - } - - cli::cli_process_start("4) Loading AFP line list data (This file is almost 3GB and can take a while)") - raw.data$afp <- - sirfunctions_io("read", NULL, file_loc = dplyr::filter( - dl_table, - grepl("afp", file) - ) |> - dplyr::pull(file), edav = use_edav) |> - dplyr::filter(surveillancetypename == "AFP") |> - dplyr::mutate( - cdc.classification.all2 = dplyr::case_when( - final.cell.culture.result == "Not received in 
lab" & - cdc.classification.all == "PENDING" ~ "LAB PENDING", - TRUE ~ cdc.classification.all - ), - hot.case = ifelse( - paralysis.asymmetric == "Yes" & - paralysis.onset.fever == "Yes" & - paralysis.rapid.progress == "Yes", - 1, - 0 - ), - hot.case = ifelse(is.na(hot.case), 99, hot.case) - ) - - cli::cli_process_done() - - cli::cli_process_start("Processing AFP data for analysis") - - raw.data$afp.epi <- raw.data$afp |> - dplyr::mutate(epi.week = lubridate::epiweek(dateonset)) |> - dplyr::group_by(place.admin.0, epi.week, yronset, cdc.classification.all2) |> - dplyr::summarize(afp.cases = dplyr::n(), - .groups = "drop") |> - dplyr::mutate(epiweek.year = paste(yronset, epi.week, sep = "-")) |> - # manual fix of epi week - dplyr::mutate(epi.week = ifelse(epi.week == 52 & - yronset == 2022, 1, epi.week)) - - # factoring cdc classification to have an order we like in stacked bar chart - raw.data$afp.epi$cdc.classification.all2 <- - factor( - raw.data$afp.epi$cdc.classification.all2, - levels = c( - "WILD 1", - "cVDPV 2", - "VDPV 2", - "cVDPV 1", - "VDPV 1", - "COMPATIBLE", - "PENDING", - "LAB PENDING", - "NPAFP", - "NOT-AFP", - "UNKNOWN", - "aVDPV 1", - "aVDPV 3", - "cVDPV1andcVDPV2", - "CombinationWild1-cVDPV 2", - "aVDPV 2", - "VDPV 3", - "iVDPV 2", - "VDPV1andcVDPV2", - "VAPP", - "cVDPV 3", - "iVDPV 3", - "WILD 3", - "WILD1andWILD3", - "iVDPV 1", - "cVDPV2andcVDPV3" - ), - labels = c( - "WILD 1", - "cVDPV 2", - "VDPV 2", - "cVDPV 1", - "VDPV 1", - "COMPATIBLE", - "PENDING", - "LAB PENDING", - "NPAFP", - "NOT-AFP", - "UNKNOWN", - "aVDPV 1", - "aVDPV 3", - "cVDPV1andcVDPV2", - "CombinationWild1-cVDPV 2", - "aVDPV 2", - "VDPV 3", - "iVDPV 2", - "VDPV1andcVDPV2", - "VAPP", - "cVDPV 3", - "iVDPV 3", - "WILD 3", - "WILD1andWILD3", - "iVDPV 1", - "cVDPV2andcVDPV3" - ) - ) - - raw.data$para.case <- raw.data$afp |> - dplyr::filter( - stringr::str_detect(cdc.classification.all2, "VDPV|WILD|COMPATIBLE") - ) |> - dplyr::mutate(yronset = ifelse(is.na(yronset) == T, 2022, 
yronset)) # this fix was for the manually added MOZ case - cli::cli_process_done() - - - cli::cli_process_start("5) Loading population data") - raw.data$dist.pop <- - sirfunctions_io("read", NULL, - dplyr::filter(dl_table, grepl("dist.pop", file)) |> - dplyr::pull(file), - edav = use_edav - ) |> - dplyr::ungroup() - - raw.data$prov.pop <- - sirfunctions_io("read", NULL, - file_loc = dplyr::filter(dl_table, grepl("prov.pop", file)) |> - dplyr::pull(file), edav = use_edav - ) |> - dplyr::ungroup() - - raw.data$ctry.pop <- - sirfunctions_io("read", NULL, - dplyr::filter(dl_table, grepl("ctry.pop", file)) |> - dplyr::pull(file), - edav = use_edav - ) |> - dplyr::ungroup() - cli::cli_process_done() - - - cli::cli_process_start("6) Loading coverage data") - raw.data$ctry.coverage <- sirfunctions_io("read", NULL, - file_loc = dplyr::filter(dl_table, grepl("ctry_cov", file)) |> - dplyr::pull(file), edav = use_edav - ) - - raw.data$prov.coverage <- sirfunctions_io("read", NULL, - file_loc = dplyr::filter(dl_table, grepl("prov_cov", file)) |> - dplyr::pull(file), edav = use_edav - ) - - raw.data$dist.coverage <- sirfunctions_io("read", NULL, - file_loc = dplyr::filter(dl_table, grepl("dist_cov", file)) |> - dplyr::pull(file), edav = use_edav - ) - - cli::cli_process_done() - - cli::cli_process_start("7) Loading ES data") - - raw.data$es <- - sirfunctions_io("read", NULL, - file_loc = dplyr::filter(dl_table, grepl("/es_2001", file)) |> - dplyr::pull(file), edav = use_edav - ) - cli::cli_process_done() - - cli::cli_process_start("8) Loading SIA data") - raw.data$sia <- - sirfunctions_io("read", NULL, - file_loc = dplyr::filter(dl_table, grepl("sia", file)) |> - dplyr::pull(file), edav = use_edav - ) - - cli::cli_process_done() - - cli::cli_process_start("9) Loading all positives") - raw.data$pos <- - sirfunctions_io("read", NULL, - file_loc = dplyr::filter(dl_table, grepl("/pos", file)) |> - dplyr::pull(file), edav = use_edav - ) - - cli::cli_process_done() - - 
cli::cli_process_start("10) Loading other surveillance linelist") - raw.data$other <- - sirfunctions_io("read", NULL, - file_loc = dplyr::filter(dl_table, grepl("/other", file)) |> - dplyr::pull(file), edav = use_edav - ) - - cli::cli_process_done() - - cli::cli_process_start("11) Loading road network data") - spatial.data$roads <- sirfunctions_io("read", NULL, - file_loc = dplyr::filter(dl_table, grepl("roads.rds", file)) |> - dplyr::pull(file), edav = use_edav - ) - cli::cli_process_done() - - cli::cli_process_start("12) Loading city spatial data") - spatial.data$cities <- sirfunctions_io("read", NULL, - file_loc = dplyr::filter(dl_table, grepl("cities.rds", file)) |> - dplyr::pull(file), edav = use_edav - ) - cli::cli_process_done() - - cli::cli_process_start("13) Creating Metadata object") - - polis.cache <- sirfunctions_io("read", NULL, - file_loc = dplyr::filter(dl_table, grepl("cache.rds", file)) |> - dplyr::pull(file), edav = use_edav - ) |> - dplyr::mutate(last_sync = as.Date(last_sync)) - - raw.data$metadata$download_time <- max(polis.cache$last_sync, na.rm = TRUE) - - raw.data$metadata$processed_time <- sirfunctions_io("list", NULL, - file.path(polis_folder, "data", core_ready_folder), - edav = use_edav - ) |> - dplyr::filter(grepl("positives_2001-01-01", name)) |> - dplyr::select("ctime" = "lastModified") |> - dplyr::mutate(ctime = as.Date(ctime)) |> - dplyr::pull(ctime) - - raw.data$metadata$user <- polis.cache |> - dplyr::filter(table == "virus") |> - dplyr::pull(last_user) - - raw.data$metadata$most_recent_pos <- max(raw.data$pos$dateonset, na.rm = TRUE) - raw.data$metadata$most_recent_pos_loc <- raw.data$pos |> - dplyr::arrange(dplyr::desc(dateonset)) |> - dplyr::slice(1) |> - dplyr::pull(place.admin.0) - - - raw.data$metadata$most_recent_afp <- max(raw.data$afp$dateonset, na.rm = TRUE) - raw.data$metadata$most_recent_afp_loc <- raw.data$afp |> - dplyr::arrange(dplyr::desc(dateonset)) |> - dplyr::slice(1) |> - dplyr::pull(place.admin.0) - - - 
raw.data$metadata$most_recent_env <- max(raw.data$es$collect.date, na.rm = TRUE) - raw.data$metadata$most_recent_env_loc <- raw.data$es |> - dplyr::arrange(dplyr::desc(collect.date)) |> - dplyr::slice(1) |> - dplyr::pull(ADM0_NAME) - - - raw.data$metadata$most_recent_sia <- max(raw.data$sia$sub.activity.start.date) - raw.data$metadata$most_recent_sia_code <- raw.data$sia |> - dplyr::arrange(dplyr::desc(sub.activity.start.date)) |> - dplyr::slice(1) |> - dplyr::pull(sia.code) - raw.data$metadata$most_recent_sia_location <- raw.data$sia |> - dplyr::arrange(dplyr::desc(sub.activity.start.date)) |> - dplyr::slice(1) |> - dplyr::pull(place.admin.0) - raw.data$metadata$most_recent_sia_vax <- raw.data$sia |> - dplyr::arrange(dplyr::desc(sub.activity.start.date)) |> - dplyr::slice(1) |> - dplyr::pull(vaccine.type) - - raw.data$metadata$most_recent_vdpv_class_change_date <- raw.data$pos$vdpvclassificationchangedate |> - lubridate::as_date() |> - max(na.rm = T) - - rm(polis.cache) - - cli::cli_process_done() - - cli::cli_process_start("14) Clearing out unused memory") - gc() - cli::cli_process_done() -} - -if (create.cache) { - cli::cli_process_start("15) Caching processed data") - - out <- split_concat_raw_data(action = "split", split.years = c(2000, med_year, small_year), raw.data.all = raw.data) - - out_files <- out$split.years |> - dplyr::mutate( - file_name = ifelse(grepl(current_year, tag), "recent", stringr::str_replace_all(tag, "-", ".")), - file_name = paste0("raw.data.", file_name, output_format) - ) - - if (!recreate.static.files) { - out_files <- out_files |> dplyr::filter(grepl("recent", file_name)) - } - - if (!use_archived_data) { - for (i in 1:nrow(out_files)) { - sirfunctions_io("write", NULL, - file_loc = file.path(analytic_folder, dplyr::pull(out_files[i, ], file_name)), - obj = out[[dplyr::pull(out_files[i, ], tag)]], - edav = use_edav - )} - } - -# set up path for spatial df - sp_file_path <- file.path(analytic_folder, paste0("spatial.data", 
output_format)) - - sirfunctions_io("write", NULL, - file_loc = sp_file_path, - obj = spatial.data, edav = use_edav - ) - - # Create tags only if not using "archived" version - if (use_edav & !use_archived_data) { - # Create raw data file tag for future comparisons - sirfunctions_io("write", NULL, - file_loc = file.path(analytic_folder, paste0("raw_data_timestamp", output_format)), - obj = Sys.time()) - - # Create spatial data file tag for future comparisons - spatial_files <- sirfunctions_io("list", - NULL, - spatial_folder, - edav = use_edav, - full_names = TRUE) - - edav_spatial_timestamp <- spatial_files |> - dplyr::filter(stringr::str_detect(name, "global."), - stringr::str_ends(name, output_format)) |> - dplyr::select(name, lastModified) - - sirfunctions_io( - "write", - NULL, - file.path(analytic_folder, paste0("spatial_timestamp", output_format)), - obj = edav_spatial_timestamp, - edav = use_edav - ) - } - - cli::cli_process_done() -} - -raw_data_cut_size <- switch(size, - "small" = small_year, - "medium" = med_year, - "large" = 2000) - -raw.data <- split_concat_raw_data(action = "split", - split.years = raw_data_cut_size, - raw.data.all = raw.data)[[1]] - -cli::cli_process_start("Checking for duplicates in datasets.") -raw.data <- duplicate_check(raw.data) -cli::cli_process_done() - -if (attach.spatial.data) { - raw.data$global.ctry <- spatial.data$global.ctry - raw.data$global.prov <- spatial.data$global.prov - raw.data$global.dist <- spatial.data$global.dist - raw.data$roads <- spatial.data$roads - raw.data$cities <- spatial.data$cities -} - -if (use_archived_data) { - cli::cli_alert_success(paste0("Successfully recreated global polio data from ", - basename(polis_data_folder))) -} - -return(raw.data) - -} - #' Assess duplicates in the get_all_polio_data() output #' #' @description diff --git a/R/get_all_polio_data.R b/R/get_all_polio_data.R new file mode 100644 index 00000000..9630661f --- /dev/null +++ b/R/get_all_polio_data.R @@ -0,0 +1,947 @@ +#' 
Retrieve all pre-processed polio data +#' +#' @description Download POLIS data from the CDC pre-processed endpoint. By default +#' this function will return a "small" or recent dataset. This is primarily for data +#' that is from the past six years. You can specify a "medium" sized dataset for data +#' that is from 2016 onwards. Finally the "large" sized dataset will provide information +#' from 2000 onwards. Regular pulls form the data will recreate the "small" dataset +#' when new information is available and the Data Management Team can force the +#' creation of the "medium" and "large" static datasets as necessary. +#' +#' @param size `str` Size of data to download. Defaults to `"small"`. +#' - `"small"`: Data from the last six years. +#' - `"medium"`: Data from 2016-present. +#' - `"large"`: Data from 2000-present. +#' @param data_folder `str` Location of the data folder containing pre-processed POLIS data, +#' spatial files, coverage data, and population data. Defaults to `"GID/PEB/SIR/Data"`. +#' @param polis_folder `str` Location of the POLIS folder. Defaults to `"GID/PEB/SIR/POLIS"`. +#' @param core_ready_folder `str` Which core ready folder to use. Defaults to `"Core_Ready_Files"`. +#' @param force.new.run `logical` Default `FALSE`, if `TRUE` will run recent data and cache. +#' @param recreate.static.files `logical` Default `FALSE`, if `TRUE` will run all data and cache. +#' @param attach.spatial.data `logical` Default `TRUE`, adds spatial data to downloaded object. +#' @param use_edav `logical` Build raw data list using EDAV files. Defaults to `TRUE`. +#' @param archive Logical. Whether to archive previous output directories +#' before overwriting. Default is `TRUE`. +#' @param keep_n_archives Numeric. Number of archive folders to retain. +#' Defaults to `Inf`, which keeps all archives. Set to a finite number +#' (e.g., 3) to automatically delete older archives beyond the N most recent. +#' @param output_format str: output_format to save files as. 
+#' Available formats include 'rds' and 'qs2'. Defaults is 'rds'. +#' @param local_caching `logical` Enable local caching so data is stored locally and +#' only downloaded when there is updated data from EDAV. +#' @param use_archived_data `logical` Allows the ability to recreate the raw data file using previous +#' preprocessed data. If +#' @returns Named `list` containing polio data that is relevant to CDC. +#' @examples +#' \dontrun{ +#' raw.data <- get_all_polio_data() # downloads data for last 6 years, including spatial files +#' raw.data <- get_all_polio_data(size = "small", attach.spatial.data = FALSE) # exclude spatial data +#' } +#' +#' @export +get_all_polio_data <- function( + size = "small", + data_folder = "GID/PEB/SIR/Data", + polis_folder = "GID/PEB/SIR/POLIS", + core_ready_folder = "Core_Ready_Files", + force.new.run = FALSE, + recreate.static.files = FALSE, + attach.spatial.data = TRUE, + use_edav = TRUE, + use_archived_data = FALSE, + archive = TRUE, + keep_n_archives = Inf, + output_format = "rds", + local_caching = TRUE) { + + # check to see that size parameter is appropriate + if (!size %in% c("small", "medium", "large")) { + stop("The parameter 'size' must be either 'small', 'medium', or 'large'") + } + + # Check output format + if (!output_format %in% c("rds", "qs2")) { + stop("Only rds and qs2 is supported at this time.") + } + +# normalize and validate both output formats +output_format <- normalize_format(output_format) + +# Fail safe in instances where EDAV connection fails +if (use_edav) { + verify_edav <- tryCatch( + { + invisible(capture.output(test_EDAV_connection())) + cli::cli_alert_success("Connect to EDAV successful.") + TRUE + }, + error = \(e) { + cli::cli_alert_info("Connection to EDAV unsuccessful.") + FALSE + } + ) + + if (!verify_edav) { + cli::cli_alert_info("Unable to obtain data from EDAV. Loading from local cache instead.") + cli::cli_alert_info("NOTE: Data may be stale. 
Please review the global polio dataset metadata for information on when the data was last processed.") + raw.data <- force_load_polio_data_cache(attach.spatial.data, output_format) + return(raw.data) + } +} + +# Constant variables +# Each file comes out of these folders +analytic_folder <- file.path(data_folder, "analytic") +polis_data_folder <- file.path(data_folder, "polis") +spatial_folder <- file.path(data_folder, "spatial") +coverage_folder <- file.path(data_folder, "coverage") +pop_folder <- file.path(data_folder, "pop") + +# Year cutoffs for the different datasets +current_year <- lubridate::year(Sys.Date()) +small_year <- current_year - 5 +med_year <- 2016 #hardcode to 2016 because it's an important point in time + +# Required files +raw_data_recent_name <- paste0("raw.data.recent", output_format) +raw_data_medium_name <- paste0("raw.data.", med_year, ".", small_year - 1, output_format) +raw_data_2000_name <- paste0("raw.data.2000.", med_year - 1, output_format) +spatial_data_name <- paste0("spatial.data", output_format) +global_ctry_sf_name <- "global.ctry.rds" +global_prov_sf_name <- "global.prov.rds" +global_dist_sf_name <- "global.dist.rds" + +# Perform check to build using the archived polis folder +if (use_archived_data) { + cli::cli_alert_info("Using archived data") + cli::cli_alert_info("NOTE: the metadata will be for the most recent pull") + polis_data_folder <- get_archived_polis_data( + data_folder, + use_edav, + keep_n_archives + ) + recreate.static.files <- TRUE +} + +# look to see if the recent raw data rds is in the analytic folder +prev_table <- sirfunctions_io("list", NULL, analytic_folder, + edav = use_edav +) + +if (nrow(prev_table) > 0) { + prev_table <- prev_table |> + dplyr::filter(grepl(raw_data_recent_name, name)) |> + dplyr::select("file" = "name", "size", "ctime" = "lastModified") +} else { + # if empty, make sure to recreate tibble to the right format + prev_table <- tibble( + "file" = NA, + "size" = NA, + "ctime" = NA + ) |> + 
dplyr::mutate(file = as.character(file), + size = as.double(size), + ctime = as_datetime(ctime)) |> + dplyr::filter(!is.na(file)) +} + +if (recreate.static.files | force.new.run) { + force.new.run <- T + create.cache <- T +} + + +if (!force.new.run) { + + # Check if using the local cache is sufficient + if (use_edav & size == "small" & local_caching) { + if (!recache_raw_data(analytic_folder, use_edav, output_format)) { + + raw.data <- sirfunctions_io("read", NULL, file.path(rappdirs::user_data_dir("sirfunctions"), + paste0("raw_data", output_format)), + edav = FALSE) + + cli::cli_process_start("Checking for duplicates in datasets.") + raw.data <- duplicate_check(raw.data) + cli::cli_process_done() + if (attach.spatial.data) { + if (!recache_spatial_data(analytic_folder, spatial_folder, + use_edav, output_format)) { + spatial.data <- sirfunctions_io("read", NULL, file.path(rappdirs::user_data_dir("sirfunctions"), + paste0("spatial_data", output_format)), + edav = FALSE) + raw.data$global.ctry <- spatial.data$global.ctry + raw.data$global.prov <- spatial.data$global.prov + raw.data$global.dist <- spatial.data$global.dist + raw.data$roads <- spatial.data$roads + raw.data$cities <- spatial.data$cities + + return(raw.data) + } else { + spatial.data <- sirfunctions_io("read", NULL, file.path(analytic_folder, spatial_data_name), + edav = use_edav) + sirfunctions_io("write", NULL, file.path(rappdirs::user_data_dir("sirfunctions"), + paste0("spatial_data", output_format)), + obj = spatial.data, + edav = FALSE) + edav_spatial_timestamp <- sirfunctions_io( + "read", + NULL, + file.path(analytic_folder, paste0("spatial_timestamp", output_format)), + edav = use_edav + ) + sirfunctions_io("write", NULL, file.path(rappdirs::user_data_dir("sirfunctions"), + paste0("spatial_timestamp", output_format)), + obj = edav_spatial_timestamp, + edav = FALSE) + + raw.data$global.ctry <- spatial.data$global.ctry + raw.data$global.prov <- spatial.data$global.prov + raw.data$global.dist <- 
spatial.data$global.dist + raw.data$roads <- spatial.data$roads + raw.data$cities <- spatial.data$cities + + return(raw.data) + } + } else { + return(raw.data) + } + } + } + + if (use_edav) { + cli::cli_alert_info(paste0("Downloading most recent active polio data from ", small_year," onwards")) + } else { + cli::cli_alert_info(paste0("Loading most recent active polio data from ", small_year," onwards")) + } + + raw.data.small.pull <- sirfunctions_io("read", NULL, prev_table$file, edav = use_edav) + + if (size == "small") { + raw.data <- raw.data.small.pull + } + + if (size == "medium") { + prev_table <- sirfunctions_io("list", NULL, analytic_folder, edav = use_edav) |> + dplyr::filter(grepl(raw_data_medium_name, name)) |> + dplyr::select("file" = "name", "size", "ctime" = "lastModified") + + if (use_edav) { + cli::cli_alert_info(paste0("Downloading static polio data from ", med_year, "-", small_year)) + } else { + cli::cli_alert_info(paste0("Loading static polio data from ", med_year, "-", small_year)) + } + + raw.data.medium.pull <- sirfunctions_io("read", NULL, prev_table$file, edav = use_edav) + + raw.data <- split_concat_raw_data( + action = "concat", + raw.data.small.pull = raw.data.small.pull, + raw.data.medium.pull = raw.data.medium.pull + ) + } + + if (size == "large") { + prev_table <- sirfunctions_io("list", NULL, analytic_folder, + edav = use_edav, full_names = TRUE + ) |> + dplyr::filter(grepl(raw_data_medium_name, name)) |> + dplyr::select("file" = "name", "size", "ctime" = "lastModified") + + if (use_edav) { + cli::cli_alert_info(paste0("Downloading static polio data from ", med_year, "-", small_year)) + } else { + cli::cli_alert_info(paste0("Loading static polio data from ", med_year, "-", small_year)) + } + + raw.data.medium.pull <- sirfunctions_io("read", NULL, prev_table$file, edav = use_edav) + + prev_table <- sirfunctions_io("list", NULL, analytic_folder, edav = use_edav) |> + dplyr::filter(grepl(raw_data_2000_name, name)) |> + 
dplyr::select("file" = "name", "size", "ctime" = "lastModified") + + if (use_edav) { + cli::cli_alert_info(paste0("Downloading static polio data from 2001-", med_year)) + } else { + cli::cli_alert_info(paste0("Loading static polio data from 2001-", med_year)) + } + + raw.data.large.pull <- sirfunctions_io("read", NULL, prev_table$file, edav = use_edav) + + raw.data <- split_concat_raw_data( + action = "concat", + raw.data.small.pull = raw.data.small.pull, + raw.data.medium.pull = raw.data.medium.pull, + raw.data.large.pull = raw.data.large.pull + ) + } + + # Only cache the small dataset, which we use in 90% of the case + if (use_edav & local_caching) { + raw_data_timestamp_exists <- invisible(sirfunctions_io( + "exists.file", + NULL, + file.path(analytic_folder, paste0("raw_data_timestamp", output_format)), + edav = use_edav + )) + + } else { + raw_data_timestamp_exists <- FALSE + } + if (size == "small" & raw_data_timestamp_exists & local_caching) { + cli::cli_process_start("Caching global polio data locally") + + if (!dir.exists(rappdirs::user_data_dir("sirfunctions"))) { + dir.create(rappdirs::user_data_dir("sirfunctions"), recursive = TRUE) + } + + sirfunctions_io("write", NULL, + file.path(rappdirs::user_data_dir("sirfunctions"), paste0("raw_data", output_format)), + obj = raw.data, + edav = FALSE) + # Add edav tag file to local cache dir + edav_raw_data_timestamp <- sirfunctions_io( + "read", + NULL, + file.path(analytic_folder, paste0("raw_data_timestamp", output_format)), + edav = use_edav + ) + + sirfunctions_io("write", NULL, + file.path(rappdirs::user_data_dir("sirfunctions"), paste0("raw_data_timestamp", output_format)), + obj = edav_raw_data_timestamp, + edav = FALSE) + + cli::cli_process_done() + } + + cli::cli_process_done() + + cli::cli_process_start("Checking for duplicates in datasets.") + raw.data <- duplicate_check(raw.data) + cli::cli_process_done() + + if (attach.spatial.data) { + + # Don't recache spatial if up to date + if 
(!recache_spatial_data(analytic_folder, spatial_folder, + use_edav, output_format) & local_caching) { + spatial.data <- sirfunctions_io("read", NULL, file.path(rappdirs::user_data_dir("sirfunctions"), + paste0("spatial_data", output_format)), + edav = FALSE) + raw.data$global.ctry <- spatial.data$global.ctry + raw.data$global.prov <- spatial.data$global.prov + raw.data$global.dist <- spatial.data$global.dist + raw.data$roads <- spatial.data$roads + raw.data$cities <- spatial.data$cities + + return(raw.data) + } + + if (use_edav) { + cli::cli_process_start("Downloading and attaching spatial data") + } else { + cli::cli_process_start("Loading and attaching spatial data") + } + + spatial.data <- sirfunctions_io("read", NULL, + file.path(analytic_folder, spatial_data_name), + edav = use_edav + ) + + raw.data$global.ctry <- spatial.data$global.ctry + raw.data$global.prov <- spatial.data$global.prov + raw.data$global.dist <- spatial.data$global.dist + raw.data$roads <- spatial.data$roads + raw.data$cities <- spatial.data$cities + + cli::cli_process_done() + + if (use_edav & local_caching) { + spatial_timestamp_exists <- sirfunctions_io( + "exists.file", + NULL, + file.path(analytic_folder, paste0("spatial_timestamp", output_format)), + edav = use_edav + ) + } else { + spatial_timestamp_exists <- FALSE + } + + if (recache_spatial_data(analytic_folder, spatial_folder, + use_edav, output_format) & spatial_timestamp_exists & local_caching) { + sirfunctions_io("write", + NULL, + file.path(rappdirs::user_data_dir("sirfunctions"), + paste0("spatial_data", + output_format)), + obj = spatial.data, + edav = FALSE) + + spatial_processed_tag <- sirfunctions_io("read", + NULL, + file.path(analytic_folder, + paste0("spatial_timestamp", output_format)), + edav = use_edav) + sirfunctions_io("write", + NULL, + file.path(rappdirs::user_data_dir("sirfunctions"), + paste0("spatial_timestamp", output_format)), + obj = spatial_processed_tag, + edav = FALSE) + } + } + + return(raw.data) + +} 
else { + + # Check that the required folders have data + for (folder in c(analytic_folder, polis_data_folder, spatial_folder, + coverage_folder, pop_folder)) { + + # get_all_polio_data will recreate the analytic folder if it's missing + switch(basename(folder), + "analytic" = { + if (!sirfunctions_io("exists.dir", NULL, folder, edav = use_edav)) { + cli::cli_alert_info("No analytics folder found. Will create a new one.") + sirfunctions_io("create.dir", NULL, folder, edav = use_edav) + } + }, + "polis" = { + if (!sirfunctions_io("exists.dir", NULL, folder, edav = use_edav)) { + cli::cli_alert_info("Creating polis folder in the data folder") + sirfunctions_io("create.dir", NULL, folder, edav = use_edav) + } else { + cli::cli_alert_info("Moving updated polis data to the data folder") + } + + + create_polis_data_folder( + data_folder, + polis_folder, + core_ready_folder, + use_edav, + archive, + keep_n_archives + ) + + }, + "spatial" = { + if (!sirfunctions_io("exists.dir", NULL, folder, edav = use_edav)) { + cli::cli_abort(paste0("No spatial data found in the data folder.", + " Ensure that the output folder when running ", + " tidypolis::process_spatial() is ", + spatial_folder), + ) + } + }, + "coverage" = { + if (!sirfunctions_io("exists.dir", NULL, folder, edav = use_edav)) { + cli::cli_abort(paste0("Coverage data not found.", + "Please create and add coverage data in: ", + folder)) + } + }, + "pop" = { + if (!sirfunctions_io("exists.dir", NULL, folder, edav = use_edav)) { + cli::cli_abort(paste0("Population data not found. ", + "Preprocessing of population files may be required. 
", + "Please create a pop data folder and add data in: ", + folder)) + } + } + ) + } + + if (use_edav) { + cli::cli_h1("Testing download times") + download_metrics <- test_EDAV_connection(return_list = T) + } + + # use the truncated AFP file + afp.trunc <- T + + if (recreate.static.files) { + afp.trunc <- F + } + + dl_table <- dplyr::bind_rows( + sirfunctions_io("list", NULL, polis_data_folder, edav = use_edav), + sirfunctions_io("list", NULL, spatial_folder, edav = use_edav), + sirfunctions_io("list", NULL, coverage_folder, edav = use_edav), + sirfunctions_io("list", NULL, pop_folder, edav = use_edav), + sirfunctions_io("list", NULL, polis_folder, edav = use_edav) |> + dplyr::filter(grepl("cache", name)) + ) |> + dplyr::filter(!is.na(size)) |> + dplyr::select("file" = "name", "size") + + if (use_edav) { + dl_table <- dl_table |> + dplyr::mutate( + "dl_time_sec" = size / download_metrics$size * download_metrics$d + ) + } + + if (afp.trunc) { + dl_table <- dl_table |> + dplyr::filter(!grepl("afp_linelist_2001", file)) + } else { + dl_table <- dl_table |> + dplyr::filter(!grepl("afp_linelist_2019", file)) + } + + file_size <- dl_table$size |> sum() + + if (use_edav) { + download_time <- dl_table$dl_time_sec |> sum() + } + + if (use_edav) { + cli::cli_h1("Downloading POLIS Data") + } else { + cli::cli_h1("Loading POLIS Data") + } + + raw.data <- list() + spatial.data <- list() + + # Check if spatial data needs to be redownloaded from the analytics folder + spatial_timestamp_exists <- sirfunctions_io( + "exists.file", + NULL, + file.path(analytic_folder, paste0("spatial_timestamp", output_format)), + edav = use_edav + ) + + if (spatial_timestamp_exists) { + # Check if it's recent or needs updating + edav_spatial_timestamp <- sirfunctions_io( + "read", + NULL, + file.path(analytic_folder, paste0("spatial_timestamp", output_format)), + edav = use_edav + ) |> + dplyr::select(name, lastModifiedEDAV = lastModified) + + edav_spatial_folder_info <- sirfunctions_io( + "list", 
+ NULL, + file.path(spatial_folder), + edav = use_edav + ) |> + dplyr::select(name, lastModified) + + spatial_timestamp_comparison <- dplyr::left_join(edav_spatial_timestamp, + edav_spatial_folder_info) |> + dplyr::mutate(updated = ifelse(lastModifiedEDAV == lastModified, TRUE, FALSE)) |> + dplyr::pull(updated) |> sum(na.rm = TRUE) + } else { + + spatial_timestamp_comparison <- 0 + + } + + if (spatial_timestamp_comparison == 3) { + cli::cli_alert_success("Spatial data in the analytic folder is up to date. Loading from cache...") + spatial.data <- sirfunctions_io( + "read", + NULL, + file.path(analytic_folder, spatial_data_name), + edav = use_edav + ) + } else { + if (spatial_timestamp_exists) { + cli::cli_alert_warning("Spatial data in the analytic folder is outdated. Recreating from the spatial folder") + } else { + cli::cli_alert_warning("No spatial timestamp exists. Recreating from the spatial folder") + } + + cli::cli_process_start("1) Loading country shape files") + spatial.data$global.ctry <- load_clean_ctry_sp( + fp = file.path(spatial_folder, global_ctry_sf_name), + edav = use_edav + ) + cli::cli_process_done() + + cli::cli_process_start("2) Loading province shape files") + spatial.data$global.prov <- load_clean_prov_sp( + fp = file.path(spatial_folder, global_prov_sf_name), + edav = use_edav + ) + cli::cli_process_done() + + cli::cli_process_start("3) Loading district shape files") + spatial.data$global.dist <- load_clean_dist_sp( + fp = file.path(spatial_folder, global_dist_sf_name), + edav = use_edav + ) + cli::cli_process_done() + } + + cli::cli_process_start("4) Loading AFP line list data (This file is almost 3GB and can take a while)") + raw.data$afp <- + sirfunctions_io("read", NULL, file_loc = dplyr::filter( + dl_table, + grepl("afp", file) + ) |> + dplyr::pull(file), edav = use_edav) |> + dplyr::filter(surveillancetypename == "AFP") |> + dplyr::mutate( + cdc.classification.all2 = dplyr::case_when( + final.cell.culture.result == "Not received in 
lab" & + cdc.classification.all == "PENDING" ~ "LAB PENDING", + TRUE ~ cdc.classification.all + ), + hot.case = ifelse( + paralysis.asymmetric == "Yes" & + paralysis.onset.fever == "Yes" & + paralysis.rapid.progress == "Yes", + 1, + 0 + ), + hot.case = ifelse(is.na(hot.case), 99, hot.case) + ) + + cli::cli_process_done() + + cli::cli_process_start("Processing AFP data for analysis") + + raw.data$afp.epi <- raw.data$afp |> + dplyr::mutate(epi.week = lubridate::epiweek(dateonset)) |> + dplyr::group_by(place.admin.0, epi.week, yronset, cdc.classification.all2) |> + dplyr::summarize(afp.cases = dplyr::n(), + .groups = "drop") |> + dplyr::mutate(epiweek.year = paste(yronset, epi.week, sep = "-")) |> + # manual fix of epi week + dplyr::mutate(epi.week = ifelse(epi.week == 52 & + yronset == 2022, 1, epi.week)) + + # factoring cdc classification to have an order we like in stacked bar chart + raw.data$afp.epi$cdc.classification.all2 <- + factor( + raw.data$afp.epi$cdc.classification.all2, + levels = c( + "WILD 1", + "cVDPV 2", + "VDPV 2", + "cVDPV 1", + "VDPV 1", + "COMPATIBLE", + "PENDING", + "LAB PENDING", + "NPAFP", + "NOT-AFP", + "UNKNOWN", + "aVDPV 1", + "aVDPV 3", + "cVDPV1andcVDPV2", + "CombinationWild1-cVDPV 2", + "aVDPV 2", + "VDPV 3", + "iVDPV 2", + "VDPV1andcVDPV2", + "VAPP", + "cVDPV 3", + "iVDPV 3", + "WILD 3", + "WILD1andWILD3", + "iVDPV 1", + "cVDPV2andcVDPV3" + ), + labels = c( + "WILD 1", + "cVDPV 2", + "VDPV 2", + "cVDPV 1", + "VDPV 1", + "COMPATIBLE", + "PENDING", + "LAB PENDING", + "NPAFP", + "NOT-AFP", + "UNKNOWN", + "aVDPV 1", + "aVDPV 3", + "cVDPV1andcVDPV2", + "CombinationWild1-cVDPV 2", + "aVDPV 2", + "VDPV 3", + "iVDPV 2", + "VDPV1andcVDPV2", + "VAPP", + "cVDPV 3", + "iVDPV 3", + "WILD 3", + "WILD1andWILD3", + "iVDPV 1", + "cVDPV2andcVDPV3" + ) + ) + + raw.data$para.case <- raw.data$afp |> + dplyr::filter( + stringr::str_detect(cdc.classification.all2, "VDPV|WILD|COMPATIBLE") + ) |> + dplyr::mutate(yronset = ifelse(is.na(yronset) == T, 2022, 
yronset)) # this fix was for the manually added MOZ case + cli::cli_process_done() + + + cli::cli_process_start("5) Loading population data") + raw.data$dist.pop <- + sirfunctions_io("read", NULL, + dplyr::filter(dl_table, grepl("dist.pop", file)) |> + dplyr::pull(file), + edav = use_edav + ) |> + dplyr::ungroup() + + raw.data$prov.pop <- + sirfunctions_io("read", NULL, + file_loc = dplyr::filter(dl_table, grepl("prov.pop", file)) |> + dplyr::pull(file), edav = use_edav + ) |> + dplyr::ungroup() + + raw.data$ctry.pop <- + sirfunctions_io("read", NULL, + dplyr::filter(dl_table, grepl("ctry.pop", file)) |> + dplyr::pull(file), + edav = use_edav + ) |> + dplyr::ungroup() + cli::cli_process_done() + + + cli::cli_process_start("6) Loading coverage data") + raw.data$ctry.coverage <- sirfunctions_io("read", NULL, + file_loc = dplyr::filter(dl_table, grepl("ctry_cov", file)) |> + dplyr::pull(file), edav = use_edav + ) + + raw.data$prov.coverage <- sirfunctions_io("read", NULL, + file_loc = dplyr::filter(dl_table, grepl("prov_cov", file)) |> + dplyr::pull(file), edav = use_edav + ) + + raw.data$dist.coverage <- sirfunctions_io("read", NULL, + file_loc = dplyr::filter(dl_table, grepl("dist_cov", file)) |> + dplyr::pull(file), edav = use_edav + ) + + cli::cli_process_done() + + cli::cli_process_start("7) Loading ES data") + + raw.data$es <- + sirfunctions_io("read", NULL, + file_loc = dplyr::filter(dl_table, grepl("/es_2001", file)) |> + dplyr::pull(file), edav = use_edav + ) + cli::cli_process_done() + + cli::cli_process_start("8) Loading SIA data") + raw.data$sia <- + sirfunctions_io("read", NULL, + file_loc = dplyr::filter(dl_table, grepl("sia", file)) |> + dplyr::pull(file), edav = use_edav + ) + + cli::cli_process_done() + + cli::cli_process_start("9) Loading all positives") + raw.data$pos <- + sirfunctions_io("read", NULL, + file_loc = dplyr::filter(dl_table, grepl("/pos", file)) |> + dplyr::pull(file), edav = use_edav + ) + + cli::cli_process_done() + + 
cli::cli_process_start("10) Loading other surveillance linelist") + raw.data$other <- + sirfunctions_io("read", NULL, + file_loc = dplyr::filter(dl_table, grepl("/other", file)) |> + dplyr::pull(file), edav = use_edav + ) + + cli::cli_process_done() + + cli::cli_process_start("11) Loading road network data") + spatial.data$roads <- sirfunctions_io("read", NULL, + file_loc = dplyr::filter(dl_table, grepl("roads.rds", file)) |> + dplyr::pull(file), edav = use_edav + ) + cli::cli_process_done() + + cli::cli_process_start("12) Loading city spatial data") + spatial.data$cities <- sirfunctions_io("read", NULL, + file_loc = dplyr::filter(dl_table, grepl("cities.rds", file)) |> + dplyr::pull(file), edav = use_edav + ) + cli::cli_process_done() + + cli::cli_process_start("13) Creating Metadata object") + + polis.cache <- sirfunctions_io("read", NULL, + file_loc = dplyr::filter(dl_table, grepl("cache.rds", file)) |> + dplyr::pull(file), edav = use_edav + ) |> + dplyr::mutate(last_sync = as.Date(last_sync)) + + raw.data$metadata$download_time <- max(polis.cache$last_sync, na.rm = TRUE) + + raw.data$metadata$processed_time <- sirfunctions_io("list", NULL, + file.path(polis_folder, "data", core_ready_folder), + edav = use_edav + ) |> + dplyr::filter(grepl("positives_2001-01-01", name)) |> + dplyr::select("ctime" = "lastModified") |> + dplyr::mutate(ctime = as.Date(ctime)) |> + dplyr::pull(ctime) + + raw.data$metadata$user <- polis.cache |> + dplyr::filter(table == "virus") |> + dplyr::pull(last_user) + + raw.data$metadata$most_recent_pos <- max(raw.data$pos$dateonset, na.rm = TRUE) + raw.data$metadata$most_recent_pos_loc <- raw.data$pos |> + dplyr::arrange(dplyr::desc(dateonset)) |> + dplyr::slice(1) |> + dplyr::pull(place.admin.0) + + + raw.data$metadata$most_recent_afp <- max(raw.data$afp$dateonset, na.rm = TRUE) + raw.data$metadata$most_recent_afp_loc <- raw.data$afp |> + dplyr::arrange(dplyr::desc(dateonset)) |> + dplyr::slice(1) |> + dplyr::pull(place.admin.0) + + + 
raw.data$metadata$most_recent_env <- max(raw.data$es$collect.date, na.rm = TRUE) + raw.data$metadata$most_recent_env_loc <- raw.data$es |> + dplyr::arrange(dplyr::desc(collect.date)) |> + dplyr::slice(1) |> + dplyr::pull(ADM0_NAME) + + + raw.data$metadata$most_recent_sia <- max(raw.data$sia$sub.activity.start.date) + raw.data$metadata$most_recent_sia_code <- raw.data$sia |> + dplyr::arrange(dplyr::desc(sub.activity.start.date)) |> + dplyr::slice(1) |> + dplyr::pull(sia.code) + raw.data$metadata$most_recent_sia_location <- raw.data$sia |> + dplyr::arrange(dplyr::desc(sub.activity.start.date)) |> + dplyr::slice(1) |> + dplyr::pull(place.admin.0) + raw.data$metadata$most_recent_sia_vax <- raw.data$sia |> + dplyr::arrange(dplyr::desc(sub.activity.start.date)) |> + dplyr::slice(1) |> + dplyr::pull(vaccine.type) + + raw.data$metadata$most_recent_vdpv_class_change_date <- raw.data$pos$vdpvclassificationchangedate |> + lubridate::as_date() |> + max(na.rm = T) + + rm(polis.cache) + + cli::cli_process_done() + + cli::cli_process_start("14) Clearing out unused memory") + gc() + cli::cli_process_done() +} + +if (create.cache) { + cli::cli_process_start("15) Caching processed data") + + out <- split_concat_raw_data(action = "split", split.years = c(2000, med_year, small_year), raw.data.all = raw.data) + + out_files <- out$split.years |> + dplyr::mutate( + file_name = ifelse(grepl(current_year, tag), "recent", stringr::str_replace_all(tag, "-", ".")), + file_name = paste0("raw.data.", file_name, output_format) + ) + + if (!recreate.static.files) { + out_files <- out_files |> dplyr::filter(grepl("recent", file_name)) + } + + if (!use_archived_data) { + for (i in 1:nrow(out_files)) { + sirfunctions_io("write", NULL, + file_loc = file.path(analytic_folder, dplyr::pull(out_files[i, ], file_name)), + obj = out[[dplyr::pull(out_files[i, ], tag)]], + edav = use_edav + )} + } + +# set up path for spatial df + sp_file_path <- file.path(analytic_folder, paste0("spatial.data", 
output_format)) + + sirfunctions_io("write", NULL, + file_loc = sp_file_path, + obj = spatial.data, edav = use_edav + ) + + # Create tags only if not using "archived" version + if (use_edav & !use_archived_data) { + # Create raw data file tag for future comparisons + sirfunctions_io("write", NULL, + file_loc = file.path(analytic_folder, paste0("raw_data_timestamp", output_format)), + obj = Sys.time()) + + # Create spatial data file tag for future comparisons + spatial_files <- sirfunctions_io("list", + NULL, + spatial_folder, + edav = use_edav, + full_names = TRUE) + + edav_spatial_timestamp <- spatial_files |> + dplyr::filter(stringr::str_detect(name, "global."), + stringr::str_ends(name, output_format)) |> + dplyr::select(name, lastModified) + + sirfunctions_io( + "write", + NULL, + file.path(analytic_folder, paste0("spatial_timestamp", output_format)), + obj = edav_spatial_timestamp, + edav = use_edav + ) + } + + cli::cli_process_done() +} + +raw_data_cut_size <- switch(size, + "small" = small_year, + "medium" = med_year, + "large" = 2000) + +raw.data <- split_concat_raw_data(action = "split", + split.years = raw_data_cut_size, + raw.data.all = raw.data)[[1]] + +cli::cli_process_start("Checking for duplicates in datasets.") +raw.data <- duplicate_check(raw.data) +cli::cli_process_done() + +if (attach.spatial.data) { + raw.data$global.ctry <- spatial.data$global.ctry + raw.data$global.prov <- spatial.data$global.prov + raw.data$global.dist <- spatial.data$global.dist + raw.data$roads <- spatial.data$roads + raw.data$cities <- spatial.data$cities +} + +if (use_archived_data) { + cli::cli_alert_success(paste0("Successfully recreated global polio data from ", + basename(polis_data_folder))) +} + +return(raw.data) + +} \ No newline at end of file From c128e56795165429ff64e27e3f2e40ca8dab07f7 Mon Sep 17 00:00:00 2001 From: Mervin Keith Cuadera <40894971+mcuadera@users.noreply.github.com> Date: Tue, 31 Mar 2026 16:06:02 -0400 Subject: [PATCH 22/28] update docs to 
reflect get_all_polio_data moving to its own script --- man/build_parquet_raw_data.Rd | 2 +- man/get_all_polio_data.Rd | 2 +- man/upload_parquet_to_edav.Rd | 2 +- 3 files changed, 3 insertions(+), 3 deletions(-) diff --git a/man/build_parquet_raw_data.Rd b/man/build_parquet_raw_data.Rd index 96c6240e..c0d0a7a6 100644 --- a/man/build_parquet_raw_data.Rd +++ b/man/build_parquet_raw_data.Rd @@ -6,8 +6,8 @@ \usage{ build_parquet_raw_data( path = "GID/PEB/SIR/Data/analytic", - from_edav = TRUE, dataset = "all", + from_edav = TRUE, container = get_azure_storage_connection() ) } diff --git a/man/get_all_polio_data.Rd b/man/get_all_polio_data.Rd index 93b44408..a09aa17f 100644 --- a/man/get_all_polio_data.Rd +++ b/man/get_all_polio_data.Rd @@ -1,5 +1,5 @@ % Generated by roxygen2: do not edit by hand -% Please edit documentation in R/dal.R +% Please edit documentation in R/get_all_polio_data.R \name{get_all_polio_data} \alias{get_all_polio_data} \title{Retrieve all pre-processed polio data} diff --git a/man/upload_parquet_to_edav.Rd b/man/upload_parquet_to_edav.Rd index 41064538..97c0a633 100644 --- a/man/upload_parquet_to_edav.Rd +++ b/man/upload_parquet_to_edav.Rd @@ -4,7 +4,7 @@ \alias{upload_parquet_to_edav} \title{Uploads a local parquet folder to EDAV} \usage{ -upload_parquet_to_edav(src, dest, container = NULL) +upload_parquet_to_edav(src, dest, container = get_azure_storage_connection()) } \arguments{ \item{src}{\code{str} Local path to the parquet folder.} From 36c0ae787f8746a8a373206fea1fcd44a1fc9335 Mon Sep 17 00:00:00 2001 From: Mervin Keith Cuadera <40894971+mcuadera@users.noreply.github.com> Date: Wed, 1 Apr 2026 11:51:45 -0400 Subject: [PATCH 23/28] minimal example of get_all_polio_data() using parquet --- NAMESPACE | 1 + R/get_all_polio_data_2.R | 640 ++++++++++++++++++++++ man/cache_raw_data.Rd | 22 + man/check_data_folder.Rd | 32 ++ man/check_spatial_data_for_processing.Rd | 20 + man/create_raw_data_tags.Rd | 20 + man/get_all_polio_data_2.Rd | 45 ++ 
man/list_required_files_for_processing.Rd | 22 +
 man/process_afp_epi_raw_data.Rd | 18 +
 man/process_afp_raw_data.Rd | 20 +
 man/process_metadata_raw_data.Rd | 32 ++
 man/process_paralytic_raw_data.Rd | 18 +
 man/pull_data_from_dl_table.Rd | 22 +
 man/reprocess_polio_data.Rd | 32 ++
 14 files changed, 944 insertions(+)
 create mode 100644 R/get_all_polio_data_2.R
 create mode 100644 man/cache_raw_data.Rd
 create mode 100644 man/check_data_folder.Rd
 create mode 100644 man/check_spatial_data_for_processing.Rd
 create mode 100644 man/create_raw_data_tags.Rd
 create mode 100644 man/get_all_polio_data_2.Rd
 create mode 100644 man/list_required_files_for_processing.Rd
 create mode 100644 man/process_afp_epi_raw_data.Rd
 create mode 100644 man/process_afp_raw_data.Rd
 create mode 100644 man/process_metadata_raw_data.Rd
 create mode 100644 man/process_paralytic_raw_data.Rd
 create mode 100644 man/pull_data_from_dl_table.Rd
 create mode 100644 man/reprocess_polio_data.Rd

diff --git a/NAMESPACE b/NAMESPACE
index 7368b8d1..07935ca6 100644
--- a/NAMESPACE
+++ b/NAMESPACE
@@ -102,6 +102,7 @@ export(generate_timeliness_maps)
 export(generate_timely_det_violin)
 export(generate_timely_ship_violin)
 export(get_all_polio_data)
+export(get_all_polio_data_2)
 export(get_azure_storage_connection)
 export(get_cdc_childvaxview_data)
 export(get_constant)
diff --git a/R/get_all_polio_data_2.R b/R/get_all_polio_data_2.R
new file mode 100644
index 00000000..3a0b939a
--- /dev/null
+++ b/R/get_all_polio_data_2.R
@@ -0,0 +1,640 @@
+# Helper functions
+
+#' Checks for required subfolders in the data folder
+#'
+#' Creates the `analytic` and `polis` subfolders when they are missing, always
+#' calls `create_polis_data_folder()` for the `polis` subfolder, and aborts when
+#' the `spatial`, `coverage` or `pop` subfolders are missing.
+#'
+#' @param data_folder `str` Path to the data folder.
+#' @param polis_folder `str` POLIS folder with preprocessed data.
+#' @param core_ready_folder `str` Name of the core ready folder. Need to be specified if preprocessing specific regions, which have their own core ready folder.
+#' @param use_edav `logical` Whether to use EDAV or not.
+#' @param cache `logical` Whether to cache the preprocessed data to data/polis subfolder.
+#'
+#' @returns `list` List of paths to the specific subfolders.
+#'
+#' @keywords internal
+check_data_folder <- function(data_folder, polis_folder, core_ready_folder, use_edav, cache) {
+
+  folders <- list(
+    analytic_folder = file.path(data_folder, "analytic"),
+    polis_data_folder = file.path(data_folder, "polis"),
+    spatial_folder = file.path(data_folder, "spatial"),
+    coverage_folder = file.path(data_folder, "coverage"),
+    pop_folder = file.path(data_folder, "pop")
+  )
+
+  dir_exists <- function(folder) {
+    sirfunctions_io("exists.dir", NULL, folder, edav = use_edav)
+  }
+
+  # get_all_polio_data will recreate the analytic folder if it's missing
+  if (!dir_exists(folders$analytic_folder)) {
+    cli::cli_alert_info("No analytics folder found. Will create a new one.")
+    sirfunctions_io("create.dir", NULL, folders$analytic_folder, edav = use_edav)
+  }
+
+  if (!dir_exists(folders$polis_data_folder)) {
+    cli::cli_alert_info("Creating polis folder in the data folder")
+    sirfunctions_io("create.dir", NULL, folders$polis_data_folder, edav = use_edav)
+  } else {
+    cli::cli_alert_info("Moving updated polis data to the data folder")
+  }
+
+  # Runs whether or not the polis subfolder already existed.
+  create_polis_data_folder(
+    data_folder,
+    polis_folder,
+    core_ready_folder,
+    use_edav,
+    cache,
+    Inf
+  )
+
+  # The remaining folders hold inputs this function cannot create itself,
+  # so a missing one is a hard error.
+  if (!dir_exists(folders$spatial_folder)) {
+    cli::cli_abort(paste0("No spatial data found in the data folder.",
+                          " Ensure that the output folder when running",
+                          " tidypolis::process_spatial() is ",
+                          folders$spatial_folder))
+  }
+
+  if (!dir_exists(folders$coverage_folder)) {
+    cli::cli_abort(paste0("Coverage data not found. ",
+                          "Please create and add coverage data in: ",
+                          folders$coverage_folder))
+  }
+
+  if (!dir_exists(folders$pop_folder)) {
+    cli::cli_abort(paste0("Population data not found. ",
+                          "Preprocessing of population files may be required. ",
+                          "Please create a pop data folder and add data in: ",
+                          folders$pop_folder))
+  }
+
+  return(folders)
+
+}
+
+#' Creates the "download table", with paths to files required for recreating static files
+#'
+#' Lists the polis, spatial, coverage and pop data subfolders plus the entries of
+#' `polis_folder` whose name contains "cache", drops entries without a size and
+#' the "afp_linelist_2019" files, and keeps the file name and size columns.
+#'
+#' @param data_folders_paths `list` Output of [check_data_folder()].
+#' @param polis_folder `str` POLIS folder containing preprocessed data. NOT the subfolder under the data folder.
+#' @param use_edav `logical` Whether to use EDAV or not.
+#'
+#' @returns `tibble` Dataset containing paths to required files.
+#'
+#' @keywords internal
+list_required_files_for_processing <- function(data_folders_paths, polis_folder, use_edav) {
+  list_folder <- function(folder) {
+    sirfunctions_io("list", NULL, folder, edav = use_edav)
+  }
+
+  data_listings <- lapply(
+    list(
+      data_folders_paths$polis_data_folder,
+      data_folders_paths$spatial_folder,
+      data_folders_paths$coverage_folder,
+      data_folders_paths$pop_folder
+    ),
+    list_folder
+  )
+
+  # Only the cache entries of the top-level POLIS folder are needed.
+  cache_listing <- list_folder(polis_folder) |>
+    dplyr::filter(grepl("cache", name))
+
+  dl_table <- dplyr::bind_rows(data_listings, cache_listing) |>
+    dplyr::filter(!is.na(size), !grepl("afp_linelist_2019", name)) |>
+    dplyr::select("file" = "name", "size")
+
+  return(dl_table)
+}
+
+#' Create the spatial data for processing
+#'
+#' @param data_folder `str` Path to the data folder.
+#' @param use_edav `logical` Use EDAV or not.
+#'
+#' @returns `list` Contains spatial datasets.
+#'
+#' @keywords internal
+check_spatial_data_for_processing <- function(data_folder, use_edav) {
+  spatial_folder <- file.path(data_folder, "spatial")
+  analytic_folder <- file.path(data_folder, "analytic")
+  spatial_timestamp_path <- file.path(analytic_folder, "spatial_timestamp.parquet")
+  global_ctry_sf_name <- "global.ctry.rds"
+  global_prov_sf_name <- "global.prov.rds"
+  global_dist_sf_name <- "global.dist.rds"
+  # The cache counts as current only when all three global shape files
+  # (country, province, district) still carry the recorded timestamp.
+  n_global_files <- 3
+
+  # Check if spatial data needs to be redownloaded from the analytics folder
+  spatial_timestamp_exists <- sirfunctions_io(
+    "exists.file",
+    NULL,
+    spatial_timestamp_path,
+    edav = use_edav
+  )
+
+  n_unchanged <- 0
+  if (spatial_timestamp_exists) {
+    # Check if it's recent or needs updating
+    recorded_timestamps <- sirfunctions_io(
+      "read",
+      NULL,
+      spatial_timestamp_path,
+      edav = use_edav
+    ) |>
+      dplyr::select(name, lastModifiedEDAV = lastModified)
+
+    current_timestamps <- sirfunctions_io(
+      "list",
+      NULL,
+      spatial_folder,
+      edav = use_edav
+    ) |>
+      dplyr::select(name, lastModified)
+
+    # NOTE(review): this match relies on `name` having the same form in the
+    # recorded timestamp file and in this listing - confirm against the writer.
+    n_unchanged <- dplyr::left_join(
+      recorded_timestamps,
+      current_timestamps,
+      by = "name"
+    ) |>
+      dplyr::mutate(updated = lastModifiedEDAV == lastModified) |>
+      dplyr::pull(updated) |>
+      sum(na.rm = TRUE)
+  }
+
+  if (n_unchanged == n_global_files) {
+    cli::cli_alert_success(
+      "Spatial data in the analytic folder is up to date. Loading from cache..."
+    )
+    spatial_data <- build_parquet_raw_data(
+      analytic_folder,
+      dataset = c("global.ctry", "global.prov", "global.dist"),
+      from_edav = use_edav
+    )
+    return(spatial_data)
+  }
+
+  if (spatial_timestamp_exists) {
+    cli::cli_alert_warning(
+      "Spatial data in the analytic folder is outdated. Recreating from the spatial folder"
+    )
+  } else {
+    cli::cli_alert_warning(
+      "No spatial timestamp exists. Recreating from the spatial folder"
+    )
+  }
+
+  spatial_data <- list()
+
+  cli::cli_process_start("1) Loading country shape files")
+  spatial_data$global.ctry <- load_clean_ctry_sp(
+    fp = file.path(spatial_folder, global_ctry_sf_name),
+    edav = use_edav
+  )
+  cli::cli_process_done()
+
+  cli::cli_process_start("2) Loading province shape files")
+  spatial_data$global.prov <- load_clean_prov_sp(
+    fp = file.path(spatial_folder, global_prov_sf_name),
+    edav = use_edav
+  )
+  cli::cli_process_done()
+
+  cli::cli_process_start("3) Loading district shape files")
+  spatial_data$global.dist <- load_clean_dist_sp(
+    fp = file.path(spatial_folder, global_dist_sf_name),
+    edav = use_edav
+  )
+  cli::cli_process_done()
+
+  return(spatial_data)
+
+}
+
+#' Creates the AFP dataset of raw_data
+#'
+#' Reads the file in `dl_table` whose path contains "afp", keeps records with
+#' `surveillancetypename == "AFP"`, and adds the `cdc.classification.all2` and
+#' `hot.case` columns.
+#'
+#' @param dl_table `tibble` Output of [list_required_files_for_processing()].
+#' @param use_edav `logical` Whether to use EDAV or not.
+#'
+#' @returns `tibble` AFP dataset.
+#'
+#' @keywords internal
+process_afp_raw_data <- function(dl_table, use_edav) {
+
+  afp_file <- dl_table |>
+    dplyr::filter(grepl("afp", file)) |>
+    dplyr::pull(file)
+
+  afp <- sirfunctions_io("read", NULL, file_loc = afp_file, edav = use_edav) |>
+    dplyr::filter(surveillancetypename == "AFP") |>
+    dplyr::mutate(
+      # Pending cases whose specimen never reached the lab get their own label.
+      cdc.classification.all2 = dplyr::case_when(
+        final.cell.culture.result == "Not received in lab" &
+          cdc.classification.all == "PENDING" ~ "LAB PENDING",
+        TRUE ~ cdc.classification.all
+      ),
+      # 1 = all three paralysis criteria are "Yes", 0 = otherwise,
+      # 99 = undeterminable because of missing values.
+      hot.case = ifelse(
+        paralysis.asymmetric == "Yes" &
+          paralysis.onset.fever == "Yes" &
+          paralysis.rapid.progress == "Yes",
+        1,
+        0
+      ),
+      hot.case = ifelse(is.na(hot.case), 99, hot.case)
+    )
+
+  return(afp)
+
+}
+
+#' Creates afp.epi dataset
+#'
+#' @param afp `tibble` Output of [process_afp_raw_data()].
+#'
+#' @returns `tibble` Summary of AFP cases by year/epi-week per country.
+#'
+#' @keywords internal
+process_afp_epi_raw_data <- function(afp) {
+
+  # factoring cdc classification to have an order we like in stacked bar chart
+  classification_order <- c(
+    "WILD 1",
+    "cVDPV 2",
+    "VDPV 2",
+    "cVDPV 1",
+    "VDPV 1",
+    "COMPATIBLE",
+    "PENDING",
+    "LAB PENDING",
+    "NPAFP",
+    "NOT-AFP",
+    "UNKNOWN",
+    "aVDPV 1",
+    "aVDPV 3",
+    "cVDPV1andcVDPV2",
+    "CombinationWild1-cVDPV 2",
+    "aVDPV 2",
+    "VDPV 3",
+    "iVDPV 2",
+    "VDPV1andcVDPV2",
+    "VAPP",
+    "cVDPV 3",
+    "iVDPV 3",
+    "WILD 3",
+    "WILD1andWILD3",
+    "iVDPV 1",
+    "cVDPV2andcVDPV3"
+  )
+
+  weekly_counts <- afp |>
+    dplyr::mutate(epi.week = lubridate::epiweek(dateonset)) |>
+    dplyr::group_by(place.admin.0, epi.week, yronset, cdc.classification.all2) |>
+    dplyr::summarize(afp.cases = dplyr::n(),
+                     .groups = "drop")
+
+  afp.epi <- weekly_counts |>
+    dplyr::mutate(
+      # Built before the manual week fix below, so it keeps the raw week number.
+      epiweek.year = paste(yronset, epi.week, sep = "-"),
+      # manual fix of epi week
+      epi.week = ifelse(epi.week == 52 & yronset == 2022, 1, epi.week),
+      # Labels default to the levels, so only the ordering is supplied.
+      cdc.classification.all2 = factor(
+        cdc.classification.all2,
+        levels = classification_order
+      )
+    )
+
+  return(afp.epi)
+}
+
+#' Creates paralytics cases dataset
+#'
+#' @inheritParams process_afp_epi_raw_data
+#'
+#' @returns `tibble` Dataset with paralytic cases only.
+#'
+#' @keywords internal
+process_paralytic_raw_data <- function(afp) {
+  para.case <- afp |>
+    dplyr::filter(
+      stringr::str_detect(cdc.classification.all2, "VDPV|WILD|COMPATIBLE")
+    ) |>
+    # this fix was for the manually added MOZ case
+    dplyr::mutate(yronset = ifelse(is.na(yronset), 2022, yronset))
+
+  return(para.case)
+}
+
+#' Pull data listed in the download table
+#'
+#' Reads the file in `dl_table` whose path matches `grepl_pattern` and returns it
+#' ungrouped. Aborts when no file matches.
+#'
+#' @param dl_table `tibble` Output of [list_required_files_for_processing()].
+#' @param grepl_pattern `str` Pattern to use to filter the `dl_table`. It is
+#' passed to [grepl()] as a regular expression.
+#' @param use_edav `logical` Whether to use EDAV or not.
+#'
+#' @returns `tibble` One of the datasets listed in `dl_table`.
+#'
+#' @keywords internal
+pull_data_from_dl_table <- function(dl_table, grepl_pattern, use_edav) {
+  file_loc <- dl_table |>
+    dplyr::filter(grepl(grepl_pattern, file)) |>
+    dplyr::pull(file)
+
+  # Fail with a readable message instead of handing an empty path to the reader.
+  if (length(file_loc) == 0) {
+    cli::cli_abort(
+      "No file matching {.val {grepl_pattern}} was found in the download table."
+    )
+  }
+
+  pulled_data <- sirfunctions_io(
+    "read",
+    NULL,
+    file_loc = file_loc,
+    edav = use_edav
+  ) |>
+    dplyr::ungroup()
+
+  return(pulled_data)
+
+}
+
+#' Creates metadata tag
+#'
+#' @param dl_table `tibble` Output of [list_required_files_for_processing()].
+#' @param raw_data `list` Processed data combining all polio data.
+#' @param polis_folder `str` Path to POLIS folder.
+#' @param core_ready_folder `str` Name of the core ready folder.
+#' @param use_edav `logical` Whether to use EDAV or not.
+#'
+#' @returns `list` Metadata list.
+#'
+#' @keywords internal
+process_metadata_raw_data <- function(dl_table, raw_data, polis_folder, core_ready_folder, use_edav) {
+  metadata <- list()
+
+  cache_file <- dl_table |>
+    dplyr::filter(grepl("cache.rds", file)) |>
+    dplyr::pull(file)
+
+  polis.cache <- sirfunctions_io("read", NULL, file_loc = cache_file, edav = use_edav) |>
+    dplyr::mutate(last_sync = as.Date(last_sync))
+
+  metadata$download_time <- max(polis.cache$last_sync, na.rm = TRUE)
+
+  # Last-modified date of the core ready "positives_2001-01-01" file.
+  metadata$processed_time <- sirfunctions_io("list", NULL,
+    file.path(polis_folder, "data", core_ready_folder),
+    edav = use_edav
+  ) |>
+    dplyr::filter(grepl("positives_2001-01-01", name)) |>
+    dplyr::select("ctime" = "lastModified") |>
+    dplyr::mutate(ctime = as.Date(ctime)) |>
+    dplyr::pull(ctime)
+
+  metadata$user <- polis.cache |>
+    dplyr::filter(table == "virus") |>
+    dplyr::pull(last_user)
+
+  metadata$most_recent_pos <- max(raw_data$pos$dateonset, na.rm = TRUE)
+  metadata$most_recent_pos_loc <- raw_data$pos |>
+    dplyr::arrange(dplyr::desc(dateonset)) |>
+    dplyr::slice(1) |>
+    dplyr::pull(place.admin.0)
+
+  metadata$most_recent_afp <- max(raw_data$afp$dateonset, na.rm = TRUE)
+  metadata$most_recent_afp_loc <- raw_data$afp |>
+    dplyr::arrange(dplyr::desc(dateonset)) |>
+    dplyr::slice(1) |>
+    dplyr::pull(place.admin.0)
+
+  metadata$most_recent_env <- max(raw_data$es$collect.date, na.rm = TRUE)
+  metadata$most_recent_env_loc <- raw_data$es |>
+    dplyr::arrange(dplyr::desc(collect.date)) |>
+    dplyr::slice(1) |>
+    dplyr::pull(ADM0_NAME)
+
+  # na.rm = TRUE keeps this consistent with the other "most recent" dates, so a
+  # single missing start date does not turn the result into NA.
+  metadata$most_recent_sia <- max(raw_data$sia$sub.activity.start.date, na.rm = TRUE)
+
+  # Sort once and reuse the top row for the three SIA descriptors.
+  latest_sia <- raw_data$sia |>
+    dplyr::arrange(dplyr::desc(sub.activity.start.date)) |>
+    dplyr::slice(1)
+  metadata$most_recent_sia_code <- dplyr::pull(latest_sia, sia.code)
+  metadata$most_recent_sia_location <- dplyr::pull(latest_sia, place.admin.0)
+  metadata$most_recent_sia_vax <- dplyr::pull(latest_sia, vaccine.type)
+
+  metadata$most_recent_vdpv_class_change_date <- raw_data$pos$vdpvclassificationchangedate |>
+    lubridate::as_date() |>
+    max(na.rm = TRUE)
+
+  return(metadata)
+
+}
+
+#' Cache the raw data
+#'
+#' Writes `raw_data` with `create_raw_data_parquet()`. With EDAV the files are
+#' first written to a temporary directory and then uploaded with
+#' `upload_parquet_to_edav()`; otherwise they are written straight to
+#' `analytic_folder_path`.
+#'
+#' @param raw_data `list` Processed list of all polio data.
+#' @param analytic_folder_path `str` Path to analytic folder.
+#' @param use_edav `logical` Whether to use EDAV or not.
+#'
+#' @returns `NULL`, invisibly.
+#'
+#' @keywords internal
+cache_raw_data <- function(raw_data, analytic_folder_path, use_edav) {
+
+  if (!use_edav) {
+    create_raw_data_parquet(raw_data, analytic_folder_path)
+    return(invisible(NULL))
+  }
+
+  # Inside with_tempdir() the working directory is the temporary directory,
+  # so getwd() is both the local build target and the upload source.
+  withr::with_tempdir({
+
+    create_raw_data_parquet(raw_data, getwd())
+    upload_parquet_to_edav(getwd(), analytic_folder_path, get_azure_storage_connection())
+
+  })
+
+  invisible(NULL)
+}
+
+#' Create timestamps for raw data and spatial data
+#'
+#' @param data_folders_paths `list` Output of [check_data_folder()].
+#' @param use_edav `logical` Whether to use EDAV or not.
+#'
+#' @returns `NULL`, invisibly.
+#'
+#' @keywords internal
+create_raw_data_tags <- function(data_folders_paths, use_edav) {
+
+  # Tags are only written when working against EDAV.
+  if (!use_edav) {
+    return(invisible(NULL))
+  }
+
+  analytic_folder <- data_folders_paths$analytic_folder
+  # The spatial timestamp is read back as "spatial_timestamp.parquet".
+  timestamp_ext <- ".parquet"
+  # The global shape files in the spatial folder are loaded as "global.*.rds".
+  # NOTE(review): assumes the timestamp should cover those .rds files - confirm.
+  spatial_file_ext <- ".rds"
+
+  # Create raw data file tag for future comparisons
+  sirfunctions_io("write", NULL,
+                  file_loc = file.path(analytic_folder, paste0("raw_data_timestamp", timestamp_ext)),
+                  obj = Sys.time(),
+                  edav = use_edav)
+
+  # Create spatial data file tag for future comparisons
+  # NOTE(review): this listing uses full_names = TRUE; verify that the resulting
+  # `name` values match those used when the timestamp is compared on read.
+  spatial_files <- sirfunctions_io("list",
+                                   NULL,
+                                   data_folders_paths$spatial_folder,
+                                   edav = use_edav,
+                                   full_names = TRUE)
+
+  edav_spatial_timestamp <- spatial_files |>
+    dplyr::filter(stringr::str_detect(name, "global."),
+                  stringr::str_ends(name, spatial_file_ext)) |>
+    dplyr::select(name, lastModified)
+
+  sirfunctions_io(
+    "write",
+    NULL,
+    file.path(analytic_folder, paste0("spatial_timestamp", timestamp_ext)),
+    obj = edav_spatial_timestamp,
+    edav = use_edav
+  )
+
+  invisible(NULL)
+
+}
+
+#' Reprocess the global polio dataset
+#'
+#' @param data_folder `str` Path to the data folder.
+#' @param polis_folder `str` Path to the POLIS folder.
+#' @param core_ready_folder `str` Name of the core ready folder.
+#' @param use_edav `logical` Whether to use EDAV or not.
+#' @param cache `logical` Whether to cache the preprocessed data to the data/polis subfolder.
+#'
+#' @returns `list` Processed raw data.
+#'
+#' @keywords internal
+reprocess_polio_data <- function(data_folder, polis_folder, core_ready_folder, use_edav, cache) {
+
+  # NOTE: we will need to add mechanism for retrieving and loading archived parquet folders
+  data_folders_paths <- check_data_folder(data_folder, polis_folder, core_ready_folder, use_edav, cache)
+
+  # List files required for processing
+  dl_table <- list_required_files_for_processing(data_folders_paths, polis_folder, use_edav)
+
+  # Obtain spatial data information
+  spatial_data <- check_spatial_data_for_processing(data_folder, use_edav)
+
+  # Process raw.data
+  # NOTE(review): as of this patch `raw_data` is assigned into without being
+  # created first, and a `browser()` call remains further down. Both are
+  # addressed by the follow-up patches in this series (24/28 and 25/28).
+  raw_data$afp <- process_afp_raw_data(dl_table, use_edav)
+  # afp.epi and para.case are both derived from the AFP linelist loaded above.
+  raw_data$afp.epi <- process_afp_epi_raw_data(raw_data$afp)
+  raw_data$para.case <- process_paralytic_raw_data(raw_data$afp)
+  # The remaining datasets are read as-is; the string is a pattern matched
+  # against the file paths in dl_table.
+  raw_data$ctry.pop <- pull_data_from_dl_table(dl_table, "ctry.pop", use_edav)
+  raw_data$prov.pop <- pull_data_from_dl_table(dl_table, "prov.pop", use_edav)
+  raw_data$dist.pop <- pull_data_from_dl_table(dl_table, "dist.pop", use_edav)
+  raw_data$ctry.coverage <- pull_data_from_dl_table(dl_table, "ctry_cov", use_edav)
+  raw_data$prov.coverage <- pull_data_from_dl_table(dl_table, "prov_cov", use_edav)
+  raw_data$dist.coverage <- pull_data_from_dl_table(dl_table, "dist_cov", use_edav)
+  raw_data$es <- pull_data_from_dl_table(dl_table, "/es_2001", use_edav)
+  raw_data$sia <- pull_data_from_dl_table(dl_table, "sia", use_edav)
+  raw_data$pos <- pull_data_from_dl_table(dl_table, "/pos", use_edav)
+  raw_data$other <- pull_data_from_dl_table(dl_table, "/other", use_edav)
+
+  # Add spatial data to raw_data
+  raw_data$global.ctry <- spatial_data$global.ctry
+  raw_data$global.prov <- spatial_data$global.prov
+  raw_data$global.dist <- spatial_data$global.dist
+  raw_data$roads <- pull_data_from_dl_table(dl_table, "roads.rds", use_edav)
+  raw_data$cities <- pull_data_from_dl_table(dl_table, "cities.rds", use_edav)
+
+  # Create metadata
+  # Must run after pos, afp, es and sia are loaded: the metadata is computed
+  # from those elements of raw_data.
+  raw_data$metadata <- process_metadata_raw_data(dl_table, raw_data, polis_folder, core_ready_folder, use_edav)
+
+  # Check for duplicates
+  raw_data <- duplicate_check(raw_data)
+
+  # Cache processed data only if we aren't using the archived version
+  cache_raw_data(raw_data,data_folders_paths$analytic_folder, use_edav)
+
+  browser()
+
+  # Create data tags only if we aren't using the archived version
+  create_raw_data_tags(data_folders_paths, use_edav)
+
+  return(raw_data)
+
+}
+
+# Main function
+
+#' Pull global polio dataset
+#'
+#' @param dataset `str` Name of the dataset. Defaults to 'all'.
+#' @param data_folder `str` Path to data folder.
+#' @param polis_folder `str` Path to the POLIS folder.
+#' @param core_ready_folder `str` Name of the core ready folder.
+#' @param recreate.static.files `logical` Whether to reprocess global polio data.
+#' @param use_edav `logical` Whether to use EDAV or not.
+#' @param azcontainer `azcontainer` Azure container object.
+#' @param cache `logical` Whether to cache the preprocessed datasets in the `data/polis` folder.
+#'
+#' @returns `list` Global polio datasets.
+#'
+#' @export
+#' @examples
+#' \dontrun{
+#' raw_data <- get_all_polio_data_2()
+#' }
+get_all_polio_data_2 <- function(dataset = "all",
+                                 data_folder = "GID/PEB/SIR/Data",
+                                 polis_folder = "GID/PEB/SIR/POLIS",
+                                 core_ready_folder = "Core_Ready_Files",
+                                 recreate.static.files = FALSE,
+                                 use_edav = TRUE,
+                                 azcontainer = get_azure_storage_connection(),
+                                 cache = TRUE) {
+
+  # Either rebuild everything from the source files, or read the parquet
+  # cache in the analytic folder. `dataset` and `azcontainer` are only used
+  # on the cached path.
+  raw_data <- if (recreate.static.files) {
+    reprocess_polio_data(data_folder, polis_folder, core_ready_folder, use_edav, cache)
+  } else {
+    build_parquet_raw_data(
+      path = file.path(data_folder, "analytic"),
+      dataset = dataset,
+      from_edav = use_edav,
+      container = azcontainer
+    )
+  }
+
+  return(raw_data)
+
+}
+
diff --git a/man/cache_raw_data.Rd b/man/cache_raw_data.Rd
new file mode 100644
index 00000000..3bde69c9
--- /dev/null
+++ b/man/cache_raw_data.Rd
@@ -0,0 +1,22 @@
+% Generated by roxygen2: do not edit by hand
+% Please edit documentation in R/get_all_polio_data_2.R
+\name{cache_raw_data}
+\alias{cache_raw_data}
+\title{Cache the raw data}
+\usage{
+cache_raw_data(raw_data, analytic_folder_path, use_edav)
+}
+\arguments{
+\item{raw_data}{\code{list} Processed list of all polio data.}
+
+\item{analytic_folder_path}{\code{str} Path to analytic folder.}
+
+\item{use_edav}{\code{logical} Whether to use EDAV or not.}
+}
+\value{
+\code{NULL}, invisibly.
+} +\description{ +Cache the raw data +} +\keyword{internal} diff --git a/man/check_data_folder.Rd b/man/check_data_folder.Rd new file mode 100644 index 00000000..6e50559e --- /dev/null +++ b/man/check_data_folder.Rd @@ -0,0 +1,32 @@ +% Generated by roxygen2: do not edit by hand +% Please edit documentation in R/get_all_polio_data_2.R +\name{check_data_folder} +\alias{check_data_folder} +\title{Checks for required subfolders in the data folder} +\usage{ +check_data_folder( + data_folder, + polis_folder, + core_ready_folder, + use_edav, + cache +) +} +\arguments{ +\item{data_folder}{\code{str} Path to the data folder.} + +\item{polis_folder}{\code{str} POLIS folder with preprocessed data.} + +\item{core_ready_folder}{\code{str} Name of the core ready folder. Need to be specified if preprocessing specific regions, which have their own core ready folder.} + +\item{use_edav}{\code{logical} Whether to use EDAV or not.} + +\item{cache}{\code{logical} Whether to cache the preprocessed data to data/polis subfolder.} +} +\value{ +\code{list} List of paths to the specific subfolders. +} +\description{ +Checks for required subfolders in the data folder +} +\keyword{internal} diff --git a/man/check_spatial_data_for_processing.Rd b/man/check_spatial_data_for_processing.Rd new file mode 100644 index 00000000..b0bc6998 --- /dev/null +++ b/man/check_spatial_data_for_processing.Rd @@ -0,0 +1,20 @@ +% Generated by roxygen2: do not edit by hand +% Please edit documentation in R/get_all_polio_data_2.R +\name{check_spatial_data_for_processing} +\alias{check_spatial_data_for_processing} +\title{Create the spatial data for processing} +\usage{ +check_spatial_data_for_processing(data_folder, use_edav) +} +\arguments{ +\item{data_folder}{\code{str} Path to the data folder.} + +\item{use_edav}{\code{logical} Use EDAV or not.} +} +\value{ +\code{list} Contains spatial datasets. 
+} +\description{ +Create the spatial data for processing +} +\keyword{internal} diff --git a/man/create_raw_data_tags.Rd b/man/create_raw_data_tags.Rd new file mode 100644 index 00000000..33f205f8 --- /dev/null +++ b/man/create_raw_data_tags.Rd @@ -0,0 +1,20 @@ +% Generated by roxygen2: do not edit by hand +% Please edit documentation in R/get_all_polio_data_2.R +\name{create_raw_data_tags} +\alias{create_raw_data_tags} +\title{Create timestamps for raw data and spatial data} +\usage{ +create_raw_data_tags(data_folders_paths, use_edav) +} +\arguments{ +\item{data_folders_paths}{\code{list} Output of \code{\link[=check_data_folder]{check_data_folder()}}.} + +\item{use_edav}{\code{logical} Whether to use EDAV or not.} +} +\value{ +\code{NULL}, invisibly. +} +\description{ +Create timestamps for raw data and spatial data +} +\keyword{internal} diff --git a/man/get_all_polio_data_2.Rd b/man/get_all_polio_data_2.Rd new file mode 100644 index 00000000..0980df03 --- /dev/null +++ b/man/get_all_polio_data_2.Rd @@ -0,0 +1,45 @@ +% Generated by roxygen2: do not edit by hand +% Please edit documentation in R/get_all_polio_data_2.R +\name{get_all_polio_data_2} +\alias{get_all_polio_data_2} +\title{Pull global polio dataset} +\usage{ +get_all_polio_data_2( + dataset = "all", + data_folder = "GID/PEB/SIR/Data", + polis_folder = "GID/PEB/SIR/POLIS", + core_ready_folder = "Core_Ready_Files", + recreate.static.files = FALSE, + use_edav = TRUE, + azcontainer = get_azure_storage_connection(), + cache = TRUE +) +} +\arguments{ +\item{dataset}{\code{str} Name of the dataset. 
Defaults to 'all'.} + +\item{data_folder}{\code{str} Path to data folder.} + +\item{polis_folder}{\code{str} Path to the POLIS folder.} + +\item{core_ready_folder}{\code{str} Name of the core ready folder.} + +\item{recreate.static.files}{\code{logical} Whether to reprocess global polio data.} + +\item{use_edav}{\code{logical} Whether to use EDAV or not.} + +\item{azcontainer}{\code{azcontainer} Azure container object.} + +\item{cache}{\code{logical} Whether to cache the preprocessed datasets in the \code{data/polis} folder.} +} +\value{ +\code{list} Global polio datasets. +} +\description{ +Pull global polio dataset +} +\examples{ +\dontrun{ +raw_data <- get_all_polio_data_2() +} +} diff --git a/man/list_required_files_for_processing.Rd b/man/list_required_files_for_processing.Rd new file mode 100644 index 00000000..9a06e5e3 --- /dev/null +++ b/man/list_required_files_for_processing.Rd @@ -0,0 +1,22 @@ +% Generated by roxygen2: do not edit by hand +% Please edit documentation in R/get_all_polio_data_2.R +\name{list_required_files_for_processing} +\alias{list_required_files_for_processing} +\title{Creates the "download table", with paths to files required for recreating static files} +\usage{ +list_required_files_for_processing(data_folders_paths, polis_folder, use_edav) +} +\arguments{ +\item{data_folders_paths}{\code{list} Output of \code{\link[=check_data_folder]{check_data_folder()}}.} + +\item{polis_folder}{\code{str} POLIS folder containing preprocessed data. NOT the subfolder under the data folder.} + +\item{use_edav}{\code{logical} Whether to use EDAV or not.} +} +\value{ +\code{tibble} Dataset containing paths to required files. 
+} +\description{ +Creates the "download table", with paths to files required for recreating static files +} +\keyword{internal} diff --git a/man/process_afp_epi_raw_data.Rd b/man/process_afp_epi_raw_data.Rd new file mode 100644 index 00000000..8bba2c7b --- /dev/null +++ b/man/process_afp_epi_raw_data.Rd @@ -0,0 +1,18 @@ +% Generated by roxygen2: do not edit by hand +% Please edit documentation in R/get_all_polio_data_2.R +\name{process_afp_epi_raw_data} +\alias{process_afp_epi_raw_data} +\title{Creates afp.epi dataset} +\usage{ +process_afp_epi_raw_data(afp) +} +\arguments{ +\item{afp}{\code{tibble} Output of \code{\link[=process_afp_raw_data]{process_afp_raw_data()}}.} +} +\value{ +\code{tibble} Summary of AFP cases by year/epi-week per country. +} +\description{ +Creates afp.epi dataset +} +\keyword{internal} diff --git a/man/process_afp_raw_data.Rd b/man/process_afp_raw_data.Rd new file mode 100644 index 00000000..502e8796 --- /dev/null +++ b/man/process_afp_raw_data.Rd @@ -0,0 +1,20 @@ +% Generated by roxygen2: do not edit by hand +% Please edit documentation in R/get_all_polio_data_2.R +\name{process_afp_raw_data} +\alias{process_afp_raw_data} +\title{Creates the AFP dataset of raw_data} +\usage{ +process_afp_raw_data(dl_table, use_edav) +} +\arguments{ +\item{dl_table}{\code{tibble} Output of \code{\link[=list_required_files_for_processing]{list_required_files_for_processing()}}.} + +\item{use_edav}{\code{logical} Whether to use EDAV or not.} +} +\value{ +\code{tibble} AFP dataset. 
+} +\description{ +Creates the AFP dataset of raw_data +} +\keyword{internal} diff --git a/man/process_metadata_raw_data.Rd b/man/process_metadata_raw_data.Rd new file mode 100644 index 00000000..ebf919fa --- /dev/null +++ b/man/process_metadata_raw_data.Rd @@ -0,0 +1,32 @@ +% Generated by roxygen2: do not edit by hand +% Please edit documentation in R/get_all_polio_data_2.R +\name{process_metadata_raw_data} +\alias{process_metadata_raw_data} +\title{Creates metadata tag} +\usage{ +process_metadata_raw_data( + dl_table, + raw_data, + polis_folder, + core_ready_folder, + use_edav +) +} +\arguments{ +\item{dl_table}{\code{tibble} Output of \code{\link[=list_required_files_for_processing]{list_required_files_for_processing()}}.} + +\item{raw_data}{\code{list} Processed data combining all polio data.} + +\item{polis_folder}{\code{str} Path to POLIS folder.} + +\item{core_ready_folder}{\code{str} Name of the core ready folder.} + +\item{use_edav}{\code{logical} Whether to use EDAV or not.} +} +\value{ +\code{tibble} Metadata tibble. +} +\description{ +Creates metadata tag +} +\keyword{internal} diff --git a/man/process_paralytic_raw_data.Rd b/man/process_paralytic_raw_data.Rd new file mode 100644 index 00000000..b9c29de6 --- /dev/null +++ b/man/process_paralytic_raw_data.Rd @@ -0,0 +1,18 @@ +% Generated by roxygen2: do not edit by hand +% Please edit documentation in R/get_all_polio_data_2.R +\name{process_paralytic_raw_data} +\alias{process_paralytic_raw_data} +\title{Creates paralytics cases dataset} +\usage{ +process_paralytic_raw_data(afp) +} +\arguments{ +\item{afp}{\code{tibble} Output of \code{\link[=process_afp_raw_data]{process_afp_raw_data()}}.} +} +\value{ +\code{tibble} Dataset with paralytic cases only. 
+} +\description{ +Creates paralytics cases dataset +} +\keyword{internal} diff --git a/man/pull_data_from_dl_table.Rd b/man/pull_data_from_dl_table.Rd new file mode 100644 index 00000000..be0d0378 --- /dev/null +++ b/man/pull_data_from_dl_table.Rd @@ -0,0 +1,22 @@ +% Generated by roxygen2: do not edit by hand +% Please edit documentation in R/get_all_polio_data_2.R +\name{pull_data_from_dl_table} +\alias{pull_data_from_dl_table} +\title{Pull data listed in the download table} +\usage{ +pull_data_from_dl_table(dl_table, grepl_pattern, use_edav) +} +\arguments{ +\item{dl_table}{\code{tibble} Output of \code{\link[=list_required_files_for_processing]{list_required_files_for_processing()}}.} + +\item{grepl_pattern}{\code{str} Pattern to use to filter the \code{dl_table}.} + +\item{use_edav}{\code{logical} Whether to use EDAV or not.} +} +\value{ +\code{tibble} One of the datasets listed in \code{dl_table}. +} +\description{ +Pull data listed in the download table +} +\keyword{internal} diff --git a/man/reprocess_polio_data.Rd b/man/reprocess_polio_data.Rd new file mode 100644 index 00000000..54564147 --- /dev/null +++ b/man/reprocess_polio_data.Rd @@ -0,0 +1,32 @@ +% Generated by roxygen2: do not edit by hand +% Please edit documentation in R/get_all_polio_data_2.R +\name{reprocess_polio_data} +\alias{reprocess_polio_data} +\title{Reprocess the global polio dataset} +\usage{ +reprocess_polio_data( + data_folder, + polis_folder, + core_ready_folder, + use_edav, + cache +) +} +\arguments{ +\item{data_folder}{\code{str} Path to the data folder.} + +\item{polis_folder}{\code{str} Path to the POLIS folder.} + +\item{core_ready_folder}{\code{str} Name of the core ready folder.} + +\item{use_edav}{\code{logical} Whether to use EDAV or not.} + +\item{cache}{\code{logical} Whether to cache the preprocessed data to the data/polis subfolder.} +} +\value{ +\code{list} Processed raw data. 
+} +\description{ +Reprocess the global polio dataset +} +\keyword{internal} From 40d46e63cc2a1c9ae57145dba8a1e748f3404d19 Mon Sep 17 00:00:00 2001 From: Mervin Keith Cuadera <40894971+mcuadera@users.noreply.github.com> Date: Wed, 1 Apr 2026 11:54:14 -0400 Subject: [PATCH 24/28] remove browser --- R/get_all_polio_data_2.R | 2 -- 1 file changed, 2 deletions(-) diff --git a/R/get_all_polio_data_2.R b/R/get_all_polio_data_2.R index 3a0b939a..ff7f695e 100644 --- a/R/get_all_polio_data_2.R +++ b/R/get_all_polio_data_2.R @@ -590,8 +590,6 @@ reprocess_polio_data <- function(data_folder, polis_folder, core_ready_folder, u # Cache processed data only if we aren't using the archived version cache_raw_data(raw_data,data_folders_paths$analytic_folder, use_edav) - browser() - # Create data tags only if we aren't using the archived version create_raw_data_tags(data_folders_paths, use_edav) From 633b31e8be28b9561547a6fe7a41dfa9ac542c35 Mon Sep 17 00:00:00 2001 From: Mervin Keith Cuadera <40894971+mcuadera@users.noreply.github.com> Date: Wed, 1 Apr 2026 11:57:31 -0400 Subject: [PATCH 25/28] Update get_all_polio_data_2.R --- R/get_all_polio_data_2.R | 1 + 1 file changed, 1 insertion(+) diff --git a/R/get_all_polio_data_2.R b/R/get_all_polio_data_2.R index ff7f695e..4b54b0c9 100644 --- a/R/get_all_polio_data_2.R +++ b/R/get_all_polio_data_2.R @@ -550,6 +550,7 @@ create_raw_data_tags <- function(data_folders_paths, use_edav) { #' @keywords internal reprocess_polio_data <- function(data_folder, polis_folder, core_ready_folder, use_edav, cache) { + raw_data <- list() # NOTE: we will need to add mechanism for retrieving and loading archived parquet folders data_folders_paths <- check_data_folder(data_folder, polis_folder, core_ready_folder, use_edav, cache) From ae3163f8642a9d3506f3c4ecd5c4473a4c2c31fc Mon Sep 17 00:00:00 2001 From: Mervin Keith Cuadera <40894971+mcuadera@users.noreply.github.com> Date: Wed, 1 Apr 2026 12:10:10 -0400 Subject: [PATCH 26/28] fix raw data parquet folder 
name --- R/dal.parquet.R | 2 +- R/get_all_polio_data_2.R | 4 ++-- 2 files changed, 3 insertions(+), 3 deletions(-) diff --git a/R/dal.parquet.R b/R/dal.parquet.R index f9078808..8709f434 100644 --- a/R/dal.parquet.R +++ b/R/dal.parquet.R @@ -142,7 +142,7 @@ upload_parquet_to_edav <- function(src, dest, container = get_azure_storage_conn AzureStor::multiupload_adls_file( container, paste0(src, "/*"), - file.path(dest, basename(src)), + file.path(dest, "raw_data_parquet"), recursive = TRUE ) cli::cli_process_done() diff --git a/R/get_all_polio_data_2.R b/R/get_all_polio_data_2.R index 4b54b0c9..e767b181 100644 --- a/R/get_all_polio_data_2.R +++ b/R/get_all_polio_data_2.R @@ -511,7 +511,7 @@ create_raw_data_tags <- function(data_folders_paths, use_edav) { if (use_edav) { # Create raw data file tag for future comparisons sirfunctions_io("write", NULL, - file_loc = file.path(data_folders_paths$analytic_folder, paste0("raw_data_timestamp.parquet")), + file_loc = file.path(data_folders_paths$analytic_folder, paste0("raw_data_timestamp.rds")), obj = Sys.time()) # Create spatial data file tag for future comparisons @@ -529,7 +529,7 @@ create_raw_data_tags <- function(data_folders_paths, use_edav) { sirfunctions_io( "write", NULL, - file.path(data_folder_paths$analytic_folder, paste0("spatial_timestamp", output_format)), + file.path(data_folder_paths$analytic_folder, paste0("spatial_timestamp.rds")), obj = edav_spatial_timestamp, edav = use_edav ) From c0cf0bff0611994565318fa8f622511146df1e55 Mon Sep 17 00:00:00 2001 From: Mervin Keith Cuadera <40894971+mcuadera@users.noreply.github.com> Date: Wed, 1 Apr 2026 12:15:45 -0400 Subject: [PATCH 27/28] Update get_all_polio_data_2.R --- R/get_all_polio_data_2.R | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/R/get_all_polio_data_2.R b/R/get_all_polio_data_2.R index e767b181..646962d0 100644 --- a/R/get_all_polio_data_2.R +++ b/R/get_all_polio_data_2.R @@ -523,7 +523,7 @@ create_raw_data_tags <- 
function(data_folders_paths, use_edav) { edav_spatial_timestamp <- spatial_files |> dplyr::filter(stringr::str_detect(name, "global."), - stringr::str_ends(name, output_format)) |> + stringr::str_ends(name, "parquet")) |> dplyr::select(name, lastModified) sirfunctions_io( From 7df83afcb2ac5777c5b48e4ad912c455baa5435f Mon Sep 17 00:00:00 2001 From: Mervin Keith Cuadera <40894971+mcuadera@users.noreply.github.com> Date: Wed, 1 Apr 2026 12:20:49 -0400 Subject: [PATCH 28/28] Update get_all_polio_data_2.R --- R/get_all_polio_data_2.R | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/R/get_all_polio_data_2.R b/R/get_all_polio_data_2.R index 646962d0..6b27a8f3 100644 --- a/R/get_all_polio_data_2.R +++ b/R/get_all_polio_data_2.R @@ -529,7 +529,7 @@ create_raw_data_tags <- function(data_folders_paths, use_edav) { sirfunctions_io( "write", NULL, - file.path(data_folder_paths$analytic_folder, paste0("spatial_timestamp.rds")), + file.path(data_folders_paths$analytic_folder, paste0("spatial_timestamp.rds")), obj = edav_spatial_timestamp, edav = use_edav )