diff --git a/NAMESPACE b/NAMESPACE index eb93aa741..60d0b00bf 100644 --- a/NAMESPACE +++ b/NAMESPACE @@ -181,6 +181,7 @@ export(mergeFork) export(mkdir) export(modifyWeightVariables) export(moveToGroup) +export(makeMRFromText) export(mv) export(newDataset) export(newDatasetByColumn) diff --git a/NEWS.md b/NEWS.md index 2dc39bdc3..61e5d272c 100644 --- a/NEWS.md +++ b/NEWS.md @@ -1,5 +1,6 @@ ## crunch 1.20.1 (under development) * Variables can now be converted from one type to another with server-side derivations. Have a text input that is only numbers, and want to have a variables that is a true numeric? Simple, just use `ds$id_var_numeric <- as.Numeric(ds$id_var)`. There Are `as.*` methods for all Crunch data types except for array-like variables. +* `makeMRFromText()` to take a variable imported as delimited strings, parse the multiple-response options, and return a (derived) multiple_response variable. * Added support for setting population sizes on datasets with `setPopulation(ds, size = 24.13e6, magnitude = 3)` and for getting population sizes (or magnitudes) with `popSize(ds)` and `popMagnitude(ds)` respectively. * Add `options(crunch.show.progress)` to govern whether to report progress of long-running requests. Default is `TRUE`, but set it to `FALSE` to run quietly. * Export `pollProgress()` and recommend using that when a long-running request fails to complete within the local timeout. diff --git a/R/make-array.R b/R/make-array.R index a2ba874d5..a05db5b33 100644 --- a/R/make-array.R +++ b/R/make-array.R @@ -80,6 +80,137 @@ makeMR <- function (subvariables, name, selections, ...) { return(vardef) } + +#' Create Multiple Response Variable from Delimited lists +#' +#' Surveys often record multiple response questions in delimited lists where +#' each respondent's selections are separated by a delimiter like `;` or `|`. +#' This function breaks the delimited responses into subvariables, uploads those +#' subvariables to Crunch, and finally creates a multiple response variable from +#' them. +#' +#' @param var The variable containing the delimited responses +#' @param delim The delimiter separating the responses +#' @param name The name of the resulting MR variable +#' @param selected A character string used to indicate a selection, defaults to +#' "selected" +#' @param not_selected Character string identifying non-selection, defaults to +#' "not_selected" +#' @param unanswered Character string indicating non-response, defaults to NA. +#' @param ... Other arguments to be passed on to [makeMR()] +#' +#' @return a Multiple response variable definition +#' @export +makeMRFromText <- function (var, + delim, + name, + selected = "selected", + not_selected = "not_selected", + unanswered = NA, + ...) { + if (missing(name)) { + halt("Must supply a name for the new variable") + } + if (is.Categorical(var) || is.Text(var)) { + uniques <- names(table(var)) + } else { + halt(dQuote(substitute(var)), + " must be a Categorical or Text Crunch Variable.") + } + items <- unique(unlist(strsplit(uniques, delim))) + # make a derivation expression for each unique item + subvarderivs <- lapply(items, function(x) createSubvarDeriv(var, x, delim, + selected, not_selected, unanswered)) + names(subvarderivs) <- gsub("\\.", "_", items) # mongo errors if there are dots in the names + + # generate the ZCL to make an array from the subvariable derivations, and + # then do selection magic to make an MR + derivation <- zfunc("select_categories", + zfunc("array", + zfunc("select", list(map=subvarderivs), + list(value=I(c(1, 2, 3, 4, 5))))), + list(value=I("selected"))) + + # hide the original variable + var <- hide(var) + return(VariableDefinition(derivation=derivation, name=name, ...)) +} + +#' Create subvariable derivation expressions +#' +#' This function creates a single subvariable definition based on a character string +#' to search for and an originating variable. It uses regex to determine whether +#' a string is present in a delimited list, then substitutes the user supplied values +#' to indicate selection, non-selection, and missingness. +#' +#' +#' @inheritParams makeMRFromText +#' @param str A string whose presence indicates a selection +#' @param missing A logical vector indicating which variable entries are missing +#' @keywords internal +#' +#' @return A VariableDefinition +createSubvarDeriv <- function (var, str, delim, selected, not_selected, + unanswered) { + if (is.na(unanswered)) { + unanswered <- "No Data" + } + new_cat_type <- list( + value = list( + class = "categorical", + categories = list( + list("id" = 1, + "name" = unanswered, + "numeric_value" = NA, + "missing" = TRUE), + list("id" = 2, + "name" = selected, + "numeric_value" = NA, + "missing" = FALSE), + list("id" = 3, + "name" = not_selected, + "numeric_value" = NA, + "missing" = FALSE) + ) + ) + ) + new_cat <- list(column = I(1:3), type = new_cat_type) + deriv <- zfunc("case", new_cat) + deriv$args[[2]] <- zfunc("is_missing", var) + deriv$args[[3]] <- zfunc("~=", var, buildDelimRegex(str, delim)) + new_alias <- paste0(alias(var), "_", gsub("\\.", "_", str)) # Mongo doesn't allow aliases with dots + deriv$references <- list(name = str, alias = new_alias) + return(deriv) +} + +#' Build Regex to find delimited items. +#' +#' A delimited item `maple` can appear in a list in four ways +#' 1. At the start of a list `maple; oak` +#' 1. In the middle of a list `oak; maple; birch` +#' 1. At the end of a list `oak; maple` +#' 1. Alone with no delimiters `maple` +#' +#' This function builds a regex expression which captures those four cases It +#' is mostly broken out of [createSubvarDeriv()] for testing purposes. +#' +#' @inheritParams createSubvarDeriv +#' +#' @return A character string +#' @keywords internal +buildDelimRegex <- function (str, delim){ + # the delimeter needs to be escaped in case it's a regex character + delim <- escapeRegex(delim) + str <- escapeRegex(str) + regex <- paste0( + "^", str, delim, "|", + delim, str, delim, "|", + delim, str, "$", "|", + "^", str, "$") + return(regex) +} + + #' @rdname makeArray #' @export deriveArray <- function (subvariables, name, selections, ...) { diff --git a/R/misc.R b/R/misc.R index c8e9bf19f..f00434710 100644 --- a/R/misc.R +++ b/R/misc.R @@ -266,3 +266,18 @@ has.function <- function (query, funcs) { return(FALSE) } + +#' escape Regex +#' +#' This function takes a string and escapes all of the special characters in the string. +#' So VB.NET becomes VB\.NET. Note that R will print this as VB\\.NET, but `cat` reveals +#' that there's only one `\`. +#' @param string +#' +#' @kerwords internal +#' escapeRegex("Tom&Jerry") +#' escapeRegex(".Net) +escapeRegex <- function(string) { + out <- gsub("([.|()\\^{}+$*?])", "\\\\\\1", string) + return(gsub("(\\[|\\])", "\\\\\\1", out)) +} diff --git a/man/buildDelimRegex.Rd b/man/buildDelimRegex.Rd new file mode 100644 index 000000000..540bf2455 --- /dev/null +++ b/man/buildDelimRegex.Rd @@ -0,0 +1,30 @@ +% Generated by roxygen2: do not edit by hand +% Please edit documentation in R/make-array.R +\name{buildDelimRegex} +\alias{buildDelimRegex} +\title{Build Regex to find delimited items.} +\usage{ +buildDelimRegex(str, delim) +} +\arguments{ +\item{str}{A string whose presence indicates a selection} + +\item{delim}{The delimiter separating the responses} +} +\value{ +A character string +} +\description{ +A delimited item can appear in a list in four ways +\enumerate{ +\item At the start of a list \code{maple; oak} +\item In the middle of a list \code{oak; maple; birch} +\item At the end of a list \code{oak; maple} +\item Alone with no delimiters \code{maple} +} +} +\details{ +This function builds a regex expression which captures those four values. It +is mostly broken out of \code{\link[=createSubvarDeriv]{createSubvarDeriv()}} for testing purposes. +} +\keyword{internal} diff --git a/man/createSubvarDeriv.Rd b/man/createSubvarDeriv.Rd new file mode 100644 index 000000000..9865af5a8 --- /dev/null +++ b/man/createSubvarDeriv.Rd @@ -0,0 +1,35 @@ +% Generated by roxygen2: do not edit by hand +% Please edit documentation in R/make-array.R +\name{createSubvarDeriv} +\alias{createSubvarDeriv} +\title{Create subvariable derivation expressions} +\usage{ +createSubvarDeriv(var, str, delim, selected, not_selected, unanswered) +} +\arguments{ +\item{var}{The variable containing the delimited responses} + +\item{str}{A string whose presence indicates a selection} + +\item{delim}{The delimiter separating the responses} + +\item{selected}{A character string used to indicate a selection, defaults to +"selected"} + +\item{not_selected}{Character string identifying non-selection, defaults to +"not_selected"} + +\item{unanswered}{Character string indicating non-response, defaults to NA.} + +\item{missing}{A logical vector indicating which variable entries are missing} +} +\value{ +A VariableDefinition +} +\description{ +This function creates a single subvariable definition based on a character string +to search for and an originating variable. It uses regex to determine whether +a string is present in a delimited list, then substitutes the user supplied values +to indicate selection, non-selection, and missingness. +} +\keyword{internal} diff --git a/man/mrFromDelim.Rd b/man/mrFromDelim.Rd new file mode 100644 index 000000000..79c6f7e1a --- /dev/null +++ b/man/mrFromDelim.Rd @@ -0,0 +1,36 @@ +% Generated by roxygen2: do not edit by hand +% Please edit documentation in R/make-array.R +\name{makeMRFromText} +\alias{makeMRFromText} +\title{Create Multiple Response Variable from Delimited lists} +\usage{ +makeMRFromText(var, delim, name, selected = "selected", + not_selected = "not_selected", unanswered = NA, ...) +} +\arguments{ +\item{var}{The variable containing the delimited responses} + +\item{delim}{The delimiter separating the responses} + +\item{name}{The name of the resulting MR variable} + +\item{selected}{A character string used to indicate a selection, defaults to +"selected"} + +\item{not_selected}{Character string identifying non-selection, defaults to +"not_selected"} + +\item{unanswered}{Character string indicating non-response, defaults to NA.} + +\item{...}{Other arguments to be passed on to \code{\link[=makeMR]{makeMR()}}} +} +\value{ +a Multiple response variable definition +} +\description{ +Surveys often record multiple response questions in delimited lists where +each respondent's selections are separated by a delimiter like \code{;} or \code{|}. +This function breaks the delimited responses into subvariables, uploads those +subvariables to Crunch, and finally creates a multiple response variable from +them. +} diff --git a/tests/testthat/app.crunch.io/api/datasets/mr_from_delim.json b/tests/testthat/app.crunch.io/api/datasets/mr_from_delim.json new file mode 100644 index 000000000..de7eb696f --- /dev/null +++ b/tests/testthat/app.crunch.io/api/datasets/mr_from_delim.json @@ -0,0 +1,70 @@ +{ + "element": "shoji:entity", + "self": "https://app.crunch.io/api/datasets/mr_from_delim/", + "catalogs": { + "batches": "https://app.crunch.io/api/datasets/mr_from_delim/batches/", + "users": "https://app.crunch.io/api/datasets/mr_from_delim/users/", + "variables": "https://app.crunch.io/api/datasets/mr_from_delim/variables/", + "actions": "https://app.crunch.io/api/datasets/mr_from_delim/actions/", + "savepoints": "https://app.crunch.io/api/datasets/mr_from_delim/savepoints/", + "boxdata": "https://app.crunch.io/api/datasets/mr_from_delim/boxdata/", + "filters": "https://app.crunch.io/api/datasets/mr_from_delim/filters/", + "multitables": "https://app.crunch.io/api/datasets/mr_from_delim/multitables/", + "comparisons": "https://app.crunch.io/api/datasets/mr_from_delim/comparisons/", + "forks": "https://app.crunch.io/api/datasets/mr_from_delim/forks/", + "permissions": "https://app.crunch.io/api/datasets/mr_from_delim/permissions/", + "joins": "https://app.crunch.io/api/datasets/mr_from_delim/joins/", + "decks": "https://app.crunch.io/api/datasets/mr_from_delim/decks/", + "parent": "https://app.crunch.io/api/datasets/", + "weight_variables": "https://app.crunch.io/api/datasets/mr_from_delim/weight_variables/" + }, + "fragments": { + "preferences": "https://app.crunch.io/api/datasets/mr_from_delim/preferences/", + "stream": "https://app.crunch.io/api/datasets/mr_from_delim/stream/", + "settings": "https://app.crunch.io/api/datasets/mr_from_delim/settings/", + "visit": "https://app.crunch.io/api/datasets/mr_from_delim/visit/", + "state": "https://app.crunch.io/api/datasets/mr_from_delim/state/", + "table": "https://app.crunch.io/api/datasets/mr_from_delim/table/", + "pk": "https://app.crunch.io/api/datasets/mr_from_delim/pk/", + "exclusion": "https://app.crunch.io/api/datasets/mr_from_delim/exclusion/" + }, + "views": { + "cube": "https://app.crunch.io/api/datasets/mr_from_delim/cube/", + "export": "https://app.crunch.io/api/datasets/mr_from_delim/export/", + "summary": "https://app.crunch.io/api/datasets/mr_from_delim/summary/", + "applied_filters": "https://app.crunch.io/api/datasets/mr_from_delim/filters/applied/" + }, + "specification": "https://app.crunch.io/api/specifications/datasets/", + "description": "Detail for a given dataset", + "body": { + "size": { + "rows": 4, + "columns": 1 + }, + "current_editor_name": "Me", + "owner_name": "Me", + "name": "for testing functionality of MR from delimited text", + "end_date": null, + "access_time": "2017-04-12T19:07:06.351000", + "notes": "", + "current_editor": "https://app.crunch.io/api/users/me/", + "creation_time": "2017-04-12T14:34:00.015000", + "archived": false, + "start_date": null, + "modification_time": "2017-04-12T14:34:03.239000", + "app_settings": { + "crunch": { + "deleted_rogue_vp": true + } + }, + "owner": "https://app.crunch.io/api/users/me/", + "permissions": { + "edit": true, + "change_permissions": true, + "view": true + }, + "is_published": true, + "id": "mr_from_delim", + "description": "" + } +} diff --git a/tests/testthat/app.crunch.io/api/datasets/mr_from_delim/cube-0e8c31.json b/tests/testthat/app.crunch.io/api/datasets/mr_from_delim/cube-0e8c31.json new file mode 100644 index 000000000..fadc89da9 --- /dev/null +++ b/tests/testthat/app.crunch.io/api/datasets/mr_from_delim/cube-0e8c31.json @@ -0,0 +1,98 @@ +{ + "element": "shoji:view", + "self": "https://app.crunch.io/api/datasets/mr_from_delim/?query=%7B%22dimensions%22%3A%5B%7B%22variable%22%3A%22http%3A%2F%2Flocal.crunch.io%3A8080%2Fapi%2Fdatasets%2F85ca8c595bc94a5f8e15bd94bfb8c5e7%2Fvariables%2F000001%2F%22%7D%5D%2C%22measures%22%3A%7B%22count%22%3A%7B%22function%22%3A%22cube_count%22%2C%22args%22%3A%5B%5D%7D%7D%7D&filter=%7B%7D", + "value": { + "query": { + "measures": { + "count": { + "function": "cube_count", + "args": [] + } + }, + "dimensions": [ + { + "variable": "https://app.crunch.io/api/datasets/mr_from_delim/variables/000001/" + } + ], + "weight": null + }, + "query_environment": { + "filter": [] + }, + "result": { + "dimensions": [ + { + "references": { + "alias": "delim", + "name": "delim" + }, + "derived": true, + "type": { + "subtype": { + "class": "text", + "missing_reasons": { + "No Data": -1 + }, + "missing_rules": {} + }, + "elements": [ + { + "id": 0, + "value": "birch; sugar maple", + "missing": false + }, + { + "id": 1, + "value": "maple butter; oak", + "missing": false + }, + { + "id": 2, + "value": "maple; birch", + "missing": false + }, + { + "id": 3, + "value": "oak; maple; birch", + "missing": false + } + ], + "class": "enum" + } + } + ], + "missing": 0, + "measures": { + "count": { + "data": [ + 1, + 1, + 1, + 1 + ], + "n_missing": 0, + "metadata": { + "references": {}, + "derived": true, + "type": { + "integer": true, + "missing_rules": {}, + "missing_reasons": { + "No Data": -1 + }, + "class": "numeric" + } + } + } + }, + "element": "crunch:cube", + "counts": [ + 1, + 1, + 1, + 1 + ], + "n": 4 + } + } +} diff --git a/tests/testthat/app.crunch.io/api/datasets/mr_from_delim/variables-d118fa-4b6023-PATCH.json b/tests/testthat/app.crunch.io/api/datasets/mr_from_delim/variables-d118fa-4b6023-PATCH.json new file mode 100644 index 000000000..6434a5946 --- /dev/null +++ b/tests/testthat/app.crunch.io/api/datasets/mr_from_delim/variables-d118fa-4b6023-PATCH.json @@ -0,0 +1,5 @@ +{ + "https://app.crunch.io/api/datasets/mr_from_delim/variables/000003/": { + "discarded": true + } +} diff --git a/tests/testthat/app.crunch.io/api/datasets/mr_from_delim/variables-d118fa.json b/tests/testthat/app.crunch.io/api/datasets/mr_from_delim/variables-d118fa.json new file mode 100644 index 000000000..b2e2fe2c2 --- /dev/null +++ b/tests/testthat/app.crunch.io/api/datasets/mr_from_delim/variables-d118fa.json @@ -0,0 +1,26 @@ +{ + "element": "shoji:catalog", + "self": "https://app.crunch.io/api/datasets/mr_from_delim/variables/?relative=on", + "catalogs": { + "private": "https://app.crunch.io/api/datasets/mr_from_delim/variables/private/" + }, + "orders": { + "personal": "https://app.crunch.io/api/datasets/mr_from_delim/variables/personal/", + "weights": "https://app.crunch.io/api/datasets/mr_from_delim/variables/weights/", + "hier": "https://app.crunch.io/api/datasets/mr_from_delim/variables/hier/" + }, + "specification": "https://app.crunch.io/api/specifications/variables/", + "description": "List of Variables of this dataset", + "index": { + "000003/": { + "discarded": false, + "alias": "delimed_text", + "name": "delimed_text", + "type": "text", + "notes": "", + "derived": false, + "id": "000001", + "description": "" + } + } +} diff --git a/tests/testthat/app.crunch.io/api/datasets/mr_from_delim/variables/hier-d118fa.json b/tests/testthat/app.crunch.io/api/datasets/mr_from_delim/variables/hier-d118fa.json new file mode 100644 index 000000000..40336c70d --- /dev/null +++ b/tests/testthat/app.crunch.io/api/datasets/mr_from_delim/variables/hier-d118fa.json @@ -0,0 +1,6 @@ +{ + "element": "shoji:order", + "self": "https://app.crunch.io/api/datasets/four/variables/hier/?relative=on", + "description": "Hierarchical order of dataset variables", + "graph": ["../000001/", "../000010/", "../000000/", "../000006/", "../000003/", "../000008/", "../000004/", "../000007/", "../000005/", "../000002/", "../000009/"] +} diff --git a/tests/testthat/test-make-array.R b/tests/testthat/test-make-array.R index ed08a48d6..a2188e7d1 100644 --- a/tests/testthat/test-make-array.R +++ b/tests/testthat/test-make-array.R @@ -57,6 +57,107 @@ with_mock_crunch({ name="Gen"), "Undefined columns selected: NOTAVARIABLE") }) + + test_that("makeMRFromText errors correctly", { + expect_error(makeMRFromText(ds$var, "; "), + "Must supply a name for the new variable") + expect_error(makeMRFromText("string", name = "name"), + paste0(dQuote("string"), " must be a Categorical or Text Crunch Variable.")) + }) + + test_that("createSubvarDeriv generates the correct variable definition", { + expected <- list( + `function` = "case", + args = list( + list(column = I(1:3), + type = list( + value = list(class = "categorical", + categories = list( + list(id = 1, + name = "No Data", + numeric_value = NA, + missing = TRUE), + list(id = 2, + name = "Yes", + numeric_value = NA, + missing = FALSE), + list(id = 3, + name = "No", + numeric_value = NA, + missing = FALSE) + ) + ) + ) + ), + list(`function` = "is_missing", + args = list( + list(variable = "https://app.crunch.io/api/datasets/1/variables/textVar/") + ) + ), + list(`function` = "~=", + args = list( + list(variable = "https://app.crunch.io/api/datasets/1/variables/textVar/"), + list(value = "^oak; |; oak; |; oak$|^oak$")) + ) + ), + references = list(name = "oak", alias = "textVar_oak") + ) + varDef <- createSubvarDeriv(ds$textVar, str = "oak", + delim = "; ", + selected = "Yes", + not_selected = "No", + unanswered = NA) + expect_equivalent(varDef, expected) + }) + + test_that("escapeRegex escapes all metacharacters", { + metachars <- c(".", "^", "$", "*", "+", "?", "{", "}", "[", "]", "\\", "|", "(", ")") + str <- paste0("vb", metachars, "net") + expect_identical(escapeRegex(str), paste0("vb\\", metachars, "net")) + expect_identical(escapeRegex("vb.a|net"), "vb\\.a\\|net") + }) + + test_that("buildDelimRegex generates the expected regular expression", { + rx <- buildDelimRegex("maple", "; ") + expect_true(grepl(rx, "maple")) + expect_true(grepl(rx, "maple; birch")) + expect_true(grepl(rx, "oak; maple; birch")) + expect_true(grepl(rx, "birch; maple")) + expect_false(grepl(rx, "birch; sugar maple")) + expect_false(grepl(rx, "maple butter; oak")) + #test delimiters that are regex characters + expect_true(grepl(buildDelimRegex("maple", "| "), "oak| maple| birch")) + expect_false(grepl(buildDelimRegex("maple", "| "), "oak| sugar maple| birch")) + }) + + test_that("makeMRFromText sends the correct variable derivation", { + ds2 <- loadDataset("https://app.crunch.io/api/datasets/mr_from_delim/") + trees <- c("birch", "sugar maple", "maple butter", "oak", "maple") + expected <- VariableDefinition( + derivation=zfunc( + "select_categories", zfunc( + "array", zfunc( + "select", + list(map=lapply(trees, function (tree) { + return(createSubvarDeriv(ds2$delimed_text, + str = tree, + delim = "; ", + selected = "Yes", + not_selected = "No", + unanswered = NA)) + })), + list(value=I(c(1, 2, 3, 4, 5))) + ) + ), + list(value=I("selected"))), + name="New Mr") + varDef <- makeMRFromText(ds2$delimed_text, delim = "; ", + name = "New Mr", + selected = "Yes", + not_selected = "No", + unanswered = NA) + expect_equivalent(varDef, expected) + }) }) with_test_authentication({ @@ -136,4 +237,14 @@ with_test_authentication({ expect_true(setequal(names(ds), names(mrdf))) expect_identical(ncol(ds), 4L) }) + whereas("makeMRFromText functions as expected", { + ds <- newDataset(mrdf) + v <- c("ma.ple; birch", "oak; ma.ple; birch", "birch; sugar maple", "maple butter; oak") + ds$delim <- c("ma.ple; birch", "oak; ma.ple; birch", "birch; sugar maple", "maple butter; oak") + test_that("makeMRFromText creates a variable", { + ds$mr_5 <- makeMRFromText(ds$delim, delim = "; ", name = "myMR") + expect_true(is.derived(ds$mr_5)) + expect_identical(dim(as.vector(ds$mr_5)), c(nrow(ds), 5)) + }) + }) })