diff --git a/DESCRIPTION b/DESCRIPTION index 5d2e4a51e..1adf095ff 100644 --- a/DESCRIPTION +++ b/DESCRIPTION @@ -64,6 +64,7 @@ Collate: 'categories.R' 'category.R' 'change-category-id.R' + 'clean-array.R' 'combine-categories.R' 'compare-categories.R' 'compare-datasets.R' diff --git a/NAMESPACE b/NAMESPACE index 5bd276fdb..4b0062a91 100644 --- a/NAMESPACE +++ b/NAMESPACE @@ -109,6 +109,7 @@ export(categoriesFromLevels) export(cd) export(changeCategoryID) export(checkForNewVersion) +export(cleanImportedArray) export(cleanseBatches) export(collapseCategories) export(combine) diff --git a/R/add-variable.R b/R/add-variable.R index 03bf1c9fb..648a374fe 100644 --- a/R/add-variable.R +++ b/R/add-variable.R @@ -81,6 +81,7 @@ validateVarDefRows <- function(vardef, numrows) { POSTNewVariable <- function(catalog_url, variable) { do.POST <- function(x) crPOST(catalog_url, body = toJSON(x, digits = 15)) + is_binddef <- FALSE if (!any(c("expr", "derivation") %in% names(variable))) { ## If deriving a variable, skip this and go straight to POSTing if (variable$type %in% c("multiple_response", "categorical_array")) { @@ -102,6 +103,12 @@ POSTNewVariable <- function(catalog_url, variable) { } is_binddef <- is.character(variable$subvariables) && !("categories" %in% names(variable)) + if (is_binddef) { + # Pop the magic flag off + # TODO: allow setting this magic flag in makeArray() + do_post_bind_magic <- variable$autonames %||% FALSE + variable$autonames <- NULL + } is_arraydef <- is_catvardef(variable) && !any(vapply(variable$subvariables, is_catvardef, logical(1))) case3 <- !(is_binddef | is_arraydef) @@ -131,6 +138,11 @@ POSTNewVariable <- function(catalog_url, variable) { } } out <- do.POST(variable) + if (is_binddef && do_post_bind_magic) { + # Look for common variable name stems and clean that + var <- VariableEntity(crGET(out)) + cleanImportedArray(var) + } invisible(out) } diff --git a/R/clean-array.R b/R/clean-array.R new file mode 100644 index 000000000..99e5ec36c --- /dev/null +++ b/R/clean-array.R @@ -0,0 +1,60 @@ +#' Clean up an array variable imported from a lossy file format +#' +#' Array and multiple-response variables coming in from SPSS or other file +#' formats generally need some work to reconstruct the "right" metadata because +#' they have to shove both parent and subvariable metadata into the "varlabels" +#' of the subvariables. This often follows a pattern of having varlabels with a +#' prefix containing the parent question wording (description) and a suffix that +#' is the actual response label. +#' +#' This function detects this prefix and reconstructs what may have been the +#' original array definition. +#' +#' @param variable An array Variable +#' @param min.prefix.length Integer: how many characters long does the common +#' string need to be in order to consider it significant enough to use? Default +#' is 20. +#' @return `variable` with edits pushed to the API. A common prefix on +#' subvariable names is extracted and set as the variable's description. +#' @export +cleanImportedArray <- function (variable, min.prefix.length=20) { + if (length(subvariables(variable)) > 1) { + prefix <- findCommonPrefix(names(subvariables(variable))) + # If length of the common stem is enough, extract it, + # remove it from the subvar names, + # remove trailing whitespace/punctuation, + # and set it as variable description. + if (nchar(prefix) >= min.prefix.length) { + # Use wildcard regexp with length just in case there are special chars in prefix. + # We already know that the prefix matches. + re <- paste0("^.{", nchar(prefix), "}") + names(subvariables(variable)) <- sub(re, "", names(subvariables(variable))) + # Now, remove whitespace and some punctuation from end of prefix, but + # don't remove a question mark or other reasonable punctuation + prefix <- sub("[[:space:]\\-\\:;|]*$", "", prefix) + description(variable) <- prefix + } + } + return(variable) +} + +findCommonPrefix <- function (x) { + # Find the shortest one and start with that + step_size <- prefix_length <- min(nchar(x)) + out <- "" + while (step_size > 0 && prefix_length > 0) { + # Bisect to find the common stem + step_size <- round(step_size / 2) + stems <- unique(substr(x, 1, prefix_length)) + if (length(stems) == 1) { + # Keep this one + out <- stems + # Try longer + prefix_length <- prefix_length + step_size + } else { + # Try shorter + prefix_length <- prefix_length - step_size + } + } + return(out) +} diff --git a/inst/WORDLIST b/inst/WORDLIST index b631ce493..1a374b1c3 100644 --- a/inst/WORDLIST +++ b/inst/WORDLIST @@ -112,6 +112,7 @@ JSON JSONing Jupyter libcurl +lossy magrittr makeWeight MemberCatalog @@ -136,6 +137,7 @@ POSIXt POSTed POSTing POSTs +powerpoint PPA pre programmatically diff --git a/man/cleanImportedArray.Rd b/man/cleanImportedArray.Rd new file mode 100644 index 000000000..4b9b4aacb --- /dev/null +++ b/man/cleanImportedArray.Rd @@ -0,0 +1,31 @@ +% Generated by roxygen2: do not edit by hand +% Please edit documentation in R/clean-array.R +\name{cleanImportedArray} +\alias{cleanImportedArray} +\title{Clean up an array variable imported from a lossy file format} +\usage{ +cleanImportedArray(variable, min.prefix.length = 20) +} +\arguments{ +\item{variable}{An array Variable} + +\item{min.prefix.length}{Integer: how many characters long does the common +string need to be in order to consider it significant enough to use? Default +is 20.} +} +\value{ +\code{variable} with edits pushed to the API. A common prefix on +subvariable names is extracted and set as the variable's description. +} +\description{ +Array and multiple-response variables coming in from SPSS or other file +formats generally need some work to reconstruct the "right" metadata because +they have to shove both parent and subvariable metadata into the "varlabels" +of the subvariables. This often follows a pattern of having varlabels with a +prefix containing the parent question wording (description) and a suffix that +is the actual response label. +} +\details{ +This function detects this prefix and reconstructs what may have been the +original array definition. +} diff --git a/tests/testthat/test-clean-array.R b/tests/testthat/test-clean-array.R new file mode 100644 index 000000000..717ce3c32 --- /dev/null +++ b/tests/testthat/test-clean-array.R @@ -0,0 +1,9 @@ +context("Cleaning array variables") + +test_that("findCommonPrefix", { + expect_identical(findCommonPrefix(c("abc def", "ab cd ef")), "ab") + expect_identical(findCommonPrefix(c("XX select all. A", "XX select all. BB")), "XX select all. ") + expect_identical(findCommonPrefix(c("A", "B")), "") + expect_identical(findCommonPrefix(c("abc defg", "abc defg")), "abc defg") + expect_identical(findCommonPrefix(c("abc defg", "gfed cbaooo")), "") +})