Skip to content
Merged
Show file tree
Hide file tree
Changes from 7 commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
179 changes: 108 additions & 71 deletions scrunch/mutable_dataset.py
Original file line number Diff line number Diff line change
Expand Up @@ -8,6 +8,7 @@
from scrunch.expressions import parse_expr, process_expr
from scrunch.helpers import shoji_entity_wrapper

# Variable types treated as arrays: compare_dataset only collects and compares
# subvariables for variables whose type is one of these.
ARRAY_TYPES = frozenset(('multiple_response', 'categorical_array'))
Comment thread
shaikh-ma marked this conversation as resolved.
Outdated

def get_mutable_dataset(dataset, connection=None, editor=False, project=None):
"""
Expand Down Expand Up @@ -120,9 +121,9 @@ def join(self, left_var, right_ds, right_var, columns=None,

def compare_dataset(self, dataset, use_crunch=False):
"""
compare the difference in structure between datasets. The
criterion is the following:
Compare the difference in structure between datasets.

The criterion is the following:
(1) variables that, when matched across datasets by alias, have different types.
(2) variables that have the same name but don't match on alias.
(3) for variables that match and have categories, any categories that have the
Expand All @@ -133,7 +134,7 @@ def compare_dataset(self, dataset, use_crunch=False):
point to subvariables that belong to other ds (Not implemented)
(6) missing rules of the variable.

:param: dataset: Daatset instance to append from
:param: dataset: Dataset instance to compare with
:param: use_crunch: Use the Crunch comparison to compare
:return: a dictionary of differences

Expand All @@ -142,81 +143,117 @@ def compare_dataset(self, dataset, use_crunch=False):
"""

if use_crunch:
resp = self.resource.batches.follow(
'compare', 'dataset={}'.format(dataset.url))
return resp
return self.resource.batches.follow('compare', 'dataset={}'.format(dataset.url))

def process_metadata(metadata):
    """Index variable metadata by alias, keeping only the compared fields.

    :param metadata: mapping whose values are variable definitions; every
        definition must provide "alias", "name" and "type". "categories",
        "subreferences" and "missing_rules" are optional.
    :return: dict keyed by alias. Each entry holds "alias", "name", "type",
        "categories" (a list, populated only for CATEGORICAL_TYPES),
        "subvariables" (a mapping, populated only for ARRAY_TYPES) and
        "missing_rules" (rule name -> the value of the rule's second arg).
    """
    processed = {}
    for var in metadata.values():
        var_type = var["type"]

        categories = []
        if var_type in CATEGORICAL_TYPES:
            categories = var.get("categories") or []

        # The comparison code reads subvariables with ``.values()``, so the
        # empty fallback has to be a mapping; a list would raise
        # AttributeError for array variables that lack "subreferences".
        subvariables = {}
        if var_type in ARRAY_TYPES:
            subvariables = var.get("subreferences") or {}

        # Tolerate a missing or null "missing_rules" entry.
        rules = var.get("missing_rules") or {}

        processed[var["alias"]] = {
            "alias": var["alias"],
            "name": var["name"],
            "type": var_type,
            "categories": categories,
            "subvariables": subvariables,
            # Distinct loop names: the original reused ``v`` here and relied
            # on comprehension scoping to avoid clobbering the outer variable.
            "missing_rules": {
                rule_name: rule["args"][1]["value"]
                for rule_name, rule in rules.items()
            },
        }
    return processed

self_meta = process_metadata(self.resource.table["metadata"])
dataset_meta = process_metadata(dataset.resource.table["metadata"])
common_aliases = frozenset(self_meta.keys()) & frozenset(dataset_meta.keys())

self_names = {}
dataset_names = {}

for n in self_meta.values():
name = n["name"]
alias = n["alias"]
if self_names.get(name):
self_names[name].append(alias)
else:
self_names[name] = [alias]

for n in dataset_meta.values():
name = n["name"]
alias = n["alias"]
if dataset_names.get(name):
dataset_names[name].append(alias)
else:
dataset_names[name] = [alias]

common_names = frozenset(self_names.keys()) & frozenset(dataset_names.keys())

diff = {
'variables': {
'by_type': [],
'by_alias': [],
'by_missing_rules': [],
},
'categories': {},
'subvariables': {}
"variables": {"by_type": [], "by_alias": [], "by_missing_rules": []},
"categories": {},
"subvariables": {},
}

array_types = ['multiple_response', 'categorical_array']
# 1. Compare types and categories by alias
for alias in common_aliases:
self_var, dataset_var = self_meta[alias], dataset_meta[alias]

vars_a = {v.alias: v.type for v in self.values()}
vars_b = {v.alias: v.type for v in dataset.values()}
if self_var["type"] != dataset_var["type"]:
diff["variables"]["by_type"].append(dataset_var["name"])

# 1. match variables by alias and compare types
common_aliases = frozenset(vars_a.keys()) & frozenset(vars_b.keys())
for alias in common_aliases:
if vars_a[alias] != vars_b[alias]:
diff['variables']['by_type'].append(dataset[alias].name)

# 3. match variable alias and distcint categories names for same id's
if vars_b[alias] == 'categorical' and vars_a[alias] == 'categorical':
a_ids = frozenset([v.id for v in self[alias].categories.values()])
b_ids = frozenset([v.id for v in dataset[alias].categories.values()])
common_ids = a_ids & b_ids

for id in common_ids:
a_name = self[alias].categories[id].name
b_name = dataset[alias].categories[id].name
if a_name != b_name:
if diff['categories'].get(dataset[alias].name):
diff['categories'][dataset[alias].name].append(id)
else:
diff['categories'][dataset[alias].name] = []
diff['categories'][dataset[alias].name].append(id)

# 2. match variables by names and compare aliases
common_names = frozenset(self.variable_names()) & frozenset(dataset.variable_names())
# 3. Compare category names for categorical variables
if self_var["type"] == dataset_var["type"] == "categorical":
a_ids = {v["id"]: v["name"] for v in self_var["categories"]}
b_ids = {v["id"]: v["name"] for v in dataset_var["categories"]}

mismatched_cats = [
cat_id
for cat_id in (a_ids.keys() & b_ids.keys())
if a_ids[cat_id] != b_ids[cat_id]
]
if mismatched_cats:
diff["categories"][alias] = mismatched_cats

# 2. Compare aliases, subvariables, and missing rules by name
for name in common_names:
if self[name].alias != dataset[name].alias:
diff['variables']['by_alias'].append(name)

# 4. array types that match, subvars with same name and != alias
if dataset[name].type == self[name].type and \
self[name].type in array_types and \
self[name].type in array_types:

a_names = frozenset(self[name].variable_names())
b_names = frozenset(dataset[name].variable_names())
common_subnames = a_names & b_names

for sv_name in common_subnames:
if self[name][sv_name].alias != dataset[name][sv_name].alias:
if diff['subvariables'].get(name):
diff['subvariables'][name].append(dataset[name][sv_name].alias)
else:
diff['subvariables'][name] = []
diff['subvariables'][name].append(dataset[name][sv_name].alias)

# 6. missing rules mismatch
if self[name].type not in CATEGORICAL_TYPES and dataset[name].type not in CATEGORICAL_TYPES:
if self[name].missing_rules != dataset[name].missing_rules:
rules1 = self[name].missing_rules
rules2 = dataset[name].missing_rules
if len(rules1.keys()) == len(rules2.keys()):
for key, value in rules1.items():
if key not in rules2 or rules2[key] != value:
diff['variables']['by_missing_rules'].append(name)
else:
diff['variables']['by_missing_rules'].append(name)
self_alias = set(self_names[name])
dataset_alias = set(dataset_names[name])

alias_diff = list(self_alias - dataset_alias)
alias_common = set(self_alias & dataset_alias)

if alias_diff:
diff["variables"]["by_alias"].append(name)

# 4. Compare subvariables for array types
for com_als in alias_common:
self_var, dataset_var = self_meta[com_als], dataset_meta[com_als]

if self_var["type"] == dataset_var["type"] and self_var["type"] in ARRAY_TYPES:
a_names = {i["name"]: i["alias"] for i in self_var["subvariables"].values()}
b_names = {
i["name"]: i["alias"] for i in dataset_var["subvariables"].values()
}

mismatched_subs = [
b_names[sv_name]
for sv_name in (frozenset(a_names.keys()) & frozenset(b_names.keys()))
if a_names[sv_name] != b_names[sv_name]
]
if mismatched_subs:
diff["subvariables"][name] = mismatched_subs

# 6. Compare missing rules for non-categorical types
if (
self_var["type"] not in CATEGORICAL_TYPES
and dataset_var["type"] not in CATEGORICAL_TYPES
):
if self_var["missing_rules"] != dataset_var["missing_rules"]:
diff["variables"]["by_missing_rules"].append(name)

return diff

def append_dataset(self, dataset, filter=None, variables=None,
Expand Down
2 changes: 1 addition & 1 deletion setup.py
Original file line number Diff line number Diff line change
Expand Up @@ -25,7 +25,7 @@
pkg_resources "managed" namespace packages?
"""

crunch_cube = "cr.cube"
crunch_cube = "cr.cube==3.2.0"
requests = "requests"

if PY2:
Expand Down
Loading