From a59ef38b60405f42775d036bc518dd89f5e24000 Mon Sep 17 00:00:00 2001 From: shaikh-ma <88078876+shaikh-ma@users.noreply.github.com> Date: Thu, 2 Apr 2026 17:45:02 +0530 Subject: [PATCH 1/8] Updating compare_dataset function --- scrunch/mutable_dataset.py | 175 ++++++++++++++++++++----------------- 1 file changed, 94 insertions(+), 81 deletions(-) diff --git a/scrunch/mutable_dataset.py b/scrunch/mutable_dataset.py index 69f9dca3..ebcbac69 100644 --- a/scrunch/mutable_dataset.py +++ b/scrunch/mutable_dataset.py @@ -8,6 +8,7 @@ from scrunch.expressions import parse_expr, process_expr from scrunch.helpers import shoji_entity_wrapper +ARRAY_TYPES = frozenset(('multiple_response', 'categorical_array')) def get_mutable_dataset(dataset, connection=None, editor=False, project=None): """ @@ -120,103 +121,115 @@ def join(self, left_var, right_ds, right_var, columns=None, def compare_dataset(self, dataset, use_crunch=False): """ - compare the difference in structure between datasets. The - criterion is the following: + Compare the difference in structure between datasets. - (1) variables that, when matched across datasets by alias, have different types. - (2) variables that have the same name but don't match on alias. - (3) for variables that match and have categories, any categories that have the + The criterion is the following: + (1) Variables that, when matched across datasets by alias, have different types. + (2) Variables that have the same name but don't match on alias. + (3) For variables that match and have categories, any categories that have the same id but don't match on name. - (4) for array variables that match, any subvariables that have the same name but + (4) For array variables that match, any subvariables that have the same name but don't match on alias. - (5) array variables that, after assembling the union of their subvariables, + (5) Array variables that, after assembling the union of their subvariables, point to subvariables that belong to other ds (Not implemented) - (6) missing rules of the variable. + (6) Missing rules of the variable. - :param: dataset: Daatset instance to append from + :param: dataset: Dataset instance to compare with :param: use_crunch: Use the Crunch comparison to compare - :return: a dictionary of differences + :return: A dictionary of differences - NOTE: this sould be done via: http://docs.crunch.io/#post217 + NOTE: This sould be done via: http://docs.crunch.io/#post217 but doesn't seem to be a working feature of Crunch """ if use_crunch: - resp = self.resource.batches.follow( - 'compare', 'dataset={}'.format(dataset.url)) - return resp + return self.resource.batches.follow('compare', 'dataset={}'.format(dataset.url)) + + def process_metadata(metadata): + """ Extract & format metadata with required information. """ + return { + v["alias"]: { + "alias": v["alias"], + "name": v["name"], + "type": v["type"], + "categories": v.get("categories", []) + if v["type"] in CATEGORICAL_TYPES + else [], + "subvariables": v.get("subreferences", []) + if v["type"] in ARRAY_TYPES + else [], + "missing_rules": v.get("missing_rules", []), + } + for v in metadata.values() + } + + self_meta = process_metadata(self.resource.table["metadata"]) + dataset_meta = process_metadata(dataset.resource.table["metadata"]) + + self_names = {n["name"]: n["alias"] for n in self_meta.values()} + dataset_names = {n["name"]: n["alias"] for n in dataset_meta.values()} + common_names = frozenset(self_names.keys()) & frozenset(dataset_names.keys()) + common_aliases = frozenset(self_meta.keys()) & frozenset(dataset_meta.keys()) + diff = { - 'variables': { - 'by_type': [], - 'by_alias': [], - 'by_missing_rules': [], - }, - 'categories': {}, - 'subvariables': {} + "variables": {"by_type": [], "by_alias": [], "by_missing_rules": []}, + "categories": {}, + "subvariables": {}, } - - array_types = ['multiple_response', 'categorical_array'] - - vars_a = {v.alias: v.type for v in self.values()} - vars_b = {v.alias: v.type for v in dataset.values()} - - # 1. match variables by alias and compare types - common_aliases = frozenset(vars_a.keys()) & frozenset(vars_b.keys()) + + # 1. Compare types and categories by alias for alias in common_aliases: - if vars_a[alias] != vars_b[alias]: - diff['variables']['by_type'].append(dataset[alias].name) - - # 3. match variable alias and distcint categories names for same id's - if vars_b[alias] == 'categorical' and vars_a[alias] == 'categorical': - a_ids = frozenset([v.id for v in self[alias].categories.values()]) - b_ids = frozenset([v.id for v in dataset[alias].categories.values()]) - common_ids = a_ids & b_ids - - for id in common_ids: - a_name = self[alias].categories[id].name - b_name = dataset[alias].categories[id].name - if a_name != b_name: - if diff['categories'].get(dataset[alias].name): - diff['categories'][dataset[alias].name].append(id) - else: - diff['categories'][dataset[alias].name] = [] - diff['categories'][dataset[alias].name].append(id) - - # 2. match variables by names and compare aliases - common_names = frozenset(self.variable_names()) & frozenset(dataset.variable_names()) + self_var, dataset_var = self_meta[alias], dataset_meta[alias] + + if self_var["type"] != dataset_var["type"]: + diff["variables"]["by_type"].append(dataset_var["name"]) + + # 3. Compare category names for categorical variables + if self_var["type"] == dataset_var["type"] == "categorical": + a_ids = {v["id"]: v["name"] for v in self_var["categories"]} + b_ids = {v["id"]: v["name"] for v in dataset_var["categories"]} + + mismatched_cats = [ + cat_id + for cat_id in (a_ids.keys() & b_ids.keys()) + if a_ids[cat_id] != b_ids[cat_id] + ] + if mismatched_cats: + diff["categories"][alias] = mismatched_cats + + # 2. Compare aliases, subvariables, and missing rules by name for name in common_names: - if self[name].alias != dataset[name].alias: - diff['variables']['by_alias'].append(name) - - # 4. array types that match, subvars with same name and != alias - if dataset[name].type == self[name].type and \ - self[name].type in array_types and \ - self[name].type in array_types: - - a_names = frozenset(self[name].variable_names()) - b_names = frozenset(dataset[name].variable_names()) - common_subnames = a_names & b_names - - for sv_name in common_subnames: - if self[name][sv_name].alias != dataset[name][sv_name].alias: - if diff['subvariables'].get(name): - diff['subvariables'][name].append(dataset[name][sv_name].alias) - else: - diff['subvariables'][name] = [] - diff['subvariables'][name].append(dataset[name][sv_name].alias) - - # 6. missing rules mismatch - if self[name].type not in CATEGORICAL_TYPES and dataset[name].type not in CATEGORICAL_TYPES: - if self[name].missing_rules != dataset[name].missing_rules: - rules1 = self[name].missing_rules - rules2 = dataset[name].missing_rules - if len(rules1.keys()) == len(rules2.keys()): - for key, value in rules1.items(): - if key not in rules2 or rules2[key] != value: - diff['variables']['by_missing_rules'].append(name) - else: - diff['variables']['by_missing_rules'].append(name) + self_alias, dataset_alias = self_names[name], dataset_names[name] + + if self_alias != dataset_alias: + diff["variables"]["by_alias"].append(name) + + # 4. Compare subvariables for array types + self_var, dataset_var = self_meta[self_alias], dataset_meta[dataset_alias] + + if self_var["type"] == dataset_var["type"] and self_var["type"] in ARRAY_TYPES: + a_names = {i["name"]: i["alias"] for i in self_var["subvariables"].values()} + b_names = { + i["name"]: i["alias"] for i in dataset_var["subvariables"].values() + } + + mismatched_subs = [ + b_names[sv_name] + for sv_name in (frozenset(a_names.keys()) & frozenset(b_names.keys())) + if a_names[sv_name] != b_names[sv_name] + ] + if mismatched_subs: + diff["subvariables"][name] = mismatched_subs + + # 6. Compare missing rules for non-categorical types + if ( + self_var["type"] not in CATEGORICAL_TYPES + and dataset_var["type"] not in CATEGORICAL_TYPES + ): + if self_var["missing_rules"] != dataset_var["missing_rules"]: + diff["variables"]["by_missing_rules"].append(name) + return diff def append_dataset(self, dataset, filter=None, variables=None, From 22587b8efbc79c557324e3b931a295cd367042da Mon Sep 17 00:00:00 2001 From: shaikh-ma <88078876+shaikh-ma@users.noreply.github.com> Date: Mon, 6 Apr 2026 12:28:33 +0530 Subject: [PATCH 2/8] Update docstring --- scrunch/mutable_dataset.py | 16 ++++++++-------- 1 file changed, 8 insertions(+), 8 deletions(-) diff --git a/scrunch/mutable_dataset.py b/scrunch/mutable_dataset.py index ebcbac69..ba29e463 100644 --- a/scrunch/mutable_dataset.py +++ b/scrunch/mutable_dataset.py @@ -124,21 +124,21 @@ def compare_dataset(self, dataset, use_crunch=False): Compare the difference in structure between datasets. The criterion is the following: - (1) Variables that, when matched across datasets by alias, have different types. - (2) Variables that have the same name but don't match on alias. - (3) For variables that match and have categories, any categories that have the + (1) variables that, when matched across datasets by alias, have different types. + (2) variables that have the same name but don't match on alias. + (3) for variables that match and have categories, any categories that have the same id but don't match on name. - (4) For array variables that match, any subvariables that have the same name but + (4) for array variables that match, any subvariables that have the same name but don't match on alias. - (5) Array variables that, after assembling the union of their subvariables, + (5) array variables that, after assembling the union of their subvariables, point to subvariables that belong to other ds (Not implemented) - (6) Missing rules of the variable. + (6) missing rules of the variable. :param: dataset: Dataset instance to compare with :param: use_crunch: Use the Crunch comparison to compare - :return: A dictionary of differences + :return: a dictionary of differences - NOTE: This sould be done via: http://docs.crunch.io/#post217 + NOTE: this sould be done via: http://docs.crunch.io/#post217 but doesn't seem to be a working feature of Crunch """ From 3d83ae2b6549ab022f2a0820eadbbe9ab2215ae8 Mon Sep 17 00:00:00 2001 From: shaikh-ma <88078876+shaikh-ma@users.noreply.github.com> Date: Mon, 6 Apr 2026 13:14:51 +0530 Subject: [PATCH 3/8] pin cr.cube version for fixing failing tests coz of syntax errors --- setup.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/setup.py b/setup.py index 31ffe8d5..a168f792 100644 --- a/setup.py +++ b/setup.py @@ -25,7 +25,7 @@ pkg_resources "managed" namespace packages? """ -crunch_cube = "cr.cube" +crunch_cube = "cr.cube==3.3.1" requests = "requests" if PY2: From 9f30654d40a0e81c1637118f871c895cb6b6e502 Mon Sep 17 00:00:00 2001 From: shaikh-ma <88078876+shaikh-ma@users.noreply.github.com> Date: Mon, 6 Apr 2026 13:24:39 +0530 Subject: [PATCH 4/8] fixing version for crcube --- setup.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/setup.py b/setup.py index a168f792..d04ed8c7 100644 --- a/setup.py +++ b/setup.py @@ -25,7 +25,7 @@ pkg_resources "managed" namespace packages? """ -crunch_cube = "cr.cube==3.3.1" +crunch_cube = "cr.cube==3.2.0" requests = "requests" if PY2: From 29d1e368a035eb62a8090baf7f5761a7f153e443 Mon Sep 17 00:00:00 2001 From: shaikh-ma <88078876+shaikh-ma@users.noreply.github.com> Date: Mon, 6 Apr 2026 19:47:41 +0530 Subject: [PATCH 5/8] Fixing for missing_rules --- scrunch/mutable_dataset.py | 37 ++++++++++++++++++++----------------- 1 file changed, 20 insertions(+), 17 deletions(-) diff --git a/scrunch/mutable_dataset.py b/scrunch/mutable_dataset.py index ba29e463..309349b1 100644 --- a/scrunch/mutable_dataset.py +++ b/scrunch/mutable_dataset.py @@ -146,8 +146,8 @@ def compare_dataset(self, dataset, use_crunch=False): return self.resource.batches.follow('compare', 'dataset={}'.format(dataset.url)) def process_metadata(metadata): - """ Extract & format metadata with required information. """ - return { + """ Extract & format metadata with required information. """ + return { v["alias"]: { "alias": v["alias"], "name": v["name"], @@ -158,38 +158,41 @@ def process_metadata(metadata): "subvariables": v.get("subreferences", []) if v["type"] in ARRAY_TYPES else [], - "missing_rules": v.get("missing_rules", []), + "missing_rules": { + k: v['args'][1]['value'] + for k, v in v.get('missing_rules', {}).items() + } } for v in metadata.values() } - + self_meta = process_metadata(self.resource.table["metadata"]) dataset_meta = process_metadata(dataset.resource.table["metadata"]) - + common_aliases = frozenset(self_meta.keys()) & frozenset(dataset_meta.keys()) + self_names = {n["name"]: n["alias"] for n in self_meta.values()} dataset_names = {n["name"]: n["alias"] for n in dataset_meta.values()} - common_names = frozenset(self_names.keys()) & frozenset(dataset_names.keys()) - common_aliases = frozenset(self_meta.keys()) & frozenset(dataset_meta.keys()) - + + diff = { "variables": {"by_type": [], "by_alias": [], "by_missing_rules": []}, "categories": {}, "subvariables": {}, } - + # 1. Compare types and categories by alias for alias in common_aliases: self_var, dataset_var = self_meta[alias], dataset_meta[alias] - + if self_var["type"] != dataset_var["type"]: diff["variables"]["by_type"].append(dataset_var["name"]) - + # 3. Compare category names for categorical variables if self_var["type"] == dataset_var["type"] == "categorical": a_ids = {v["id"]: v["name"] for v in self_var["categories"]} b_ids = {v["id"]: v["name"] for v in dataset_var["categories"]} - + mismatched_cats = [ cat_id for cat_id in (a_ids.keys() & b_ids.keys()) @@ -201,19 +204,19 @@ def process_metadata(metadata): # 2. Compare aliases, subvariables, and missing rules by name for name in common_names: self_alias, dataset_alias = self_names[name], dataset_names[name] - + if self_alias != dataset_alias: diff["variables"]["by_alias"].append(name) - + # 4. Compare subvariables for array types self_var, dataset_var = self_meta[self_alias], dataset_meta[dataset_alias] - + if self_var["type"] == dataset_var["type"] and self_var["type"] in ARRAY_TYPES: a_names = {i["name"]: i["alias"] for i in self_var["subvariables"].values()} b_names = { i["name"]: i["alias"] for i in dataset_var["subvariables"].values() } - + mismatched_subs = [ b_names[sv_name] for sv_name in (frozenset(a_names.keys()) & frozenset(b_names.keys())) @@ -221,7 +224,7 @@ def process_metadata(metadata): ] if mismatched_subs: diff["subvariables"][name] = mismatched_subs - + # 6. Compare missing rules for non-categorical types if ( self_var["type"] not in CATEGORICAL_TYPES From 3630a2f2dd1b4e8c18b22dedabbcebb759808967 Mon Sep 17 00:00:00 2001 From: shaikh-ma <88078876+shaikh-ma@users.noreply.github.com> Date: Wed, 15 Apr 2026 13:37:05 +0530 Subject: [PATCH 6/8] Fixing issue of variables with same nme within different folders --- scrunch/mutable_dataset.py | 77 ++++++++++++++++++++++++-------------- 1 file changed, 49 insertions(+), 28 deletions(-) diff --git a/scrunch/mutable_dataset.py b/scrunch/mutable_dataset.py index 309349b1..6342cf3f 100644 --- a/scrunch/mutable_dataset.py +++ b/scrunch/mutable_dataset.py @@ -169,11 +169,27 @@ def process_metadata(metadata): self_meta = process_metadata(self.resource.table["metadata"]) dataset_meta = process_metadata(dataset.resource.table["metadata"]) common_aliases = frozenset(self_meta.keys()) & frozenset(dataset_meta.keys()) - - self_names = {n["name"]: n["alias"] for n in self_meta.values()} - dataset_names = {n["name"]: n["alias"] for n in dataset_meta.values()} - common_names = frozenset(self_names.keys()) & frozenset(dataset_names.keys()) + self_names = {} + dataset_names = {} + + for n in self_meta.values(): + name = n["name"] + alias = n["alias"] + if self_names.get(name): + self_names[name].append(alias) + else: + self_names[name] = [alias] + + for n in dataset_meta.values(): + name = n["name"] + alias = n["alias"] + if dataset_names.get(name): + dataset_names[name].append(alias) + else: + dataset_names[name] = [alias] + + common_names = frozenset(self_names.keys()) & frozenset(dataset_names.keys()) diff = { "variables": {"by_type": [], "by_alias": [], "by_missing_rules": []}, @@ -203,35 +219,40 @@ def process_metadata(metadata): # 2. Compare aliases, subvariables, and missing rules by name for name in common_names: - self_alias, dataset_alias = self_names[name], dataset_names[name] + self_alias = set(self_names[name]) + dataset_alias = set(dataset_names[name]) + + alias_diff = list(self_alias - dataset_alias) + alias_common = set(self_alias & dataset_alias) - if self_alias != dataset_alias: - diff["variables"]["by_alias"].append(name) + if alias_diff: + diff["variables"]["by_alias"].extend(alias_diff) # 4. Compare subvariables for array types - self_var, dataset_var = self_meta[self_alias], dataset_meta[dataset_alias] + for com_als in alias_common: + self_var, dataset_var = self_meta[com_als], dataset_meta[com_als] - if self_var["type"] == dataset_var["type"] and self_var["type"] in ARRAY_TYPES: - a_names = {i["name"]: i["alias"] for i in self_var["subvariables"].values()} - b_names = { - i["name"]: i["alias"] for i in dataset_var["subvariables"].values() - } + if self_var["type"] == dataset_var["type"] and self_var["type"] in ARRAY_TYPES: + a_names = {i["name"]: i["alias"] for i in self_var["subvariables"].values()} + b_names = { + i["name"]: i["alias"] for i in dataset_var["subvariables"].values() + } - mismatched_subs = [ - b_names[sv_name] - for sv_name in (frozenset(a_names.keys()) & frozenset(b_names.keys())) - if a_names[sv_name] != b_names[sv_name] - ] - if mismatched_subs: - diff["subvariables"][name] = mismatched_subs - - # 6. Compare missing rules for non-categorical types - if ( - self_var["type"] not in CATEGORICAL_TYPES - and dataset_var["type"] not in CATEGORICAL_TYPES - ): - if self_var["missing_rules"] != dataset_var["missing_rules"]: - diff["variables"]["by_missing_rules"].append(name) + mismatched_subs = [ + b_names[sv_name] + for sv_name in (frozenset(a_names.keys()) & frozenset(b_names.keys())) + if a_names[sv_name] != b_names[sv_name] + ] + if mismatched_subs: + diff["subvariables"][name] = mismatched_subs + + # 6. Compare missing rules for non-categorical types + if ( + self_var["type"] not in CATEGORICAL_TYPES + and dataset_var["type"] not in CATEGORICAL_TYPES + ): + if self_var["missing_rules"] != dataset_var["missing_rules"]: + diff["variables"]["by_missing_rules"].append(name) return diff From bed50b690ca820567cba301ecec54b793b1ed063 Mon Sep 17 00:00:00 2001 From: shaikh-ma <88078876+shaikh-ma@users.noreply.github.com> Date: Thu, 16 Apr 2026 12:45:19 +0530 Subject: [PATCH 7/8] Fix value --- scrunch/mutable_dataset.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/scrunch/mutable_dataset.py b/scrunch/mutable_dataset.py index 6342cf3f..a9eda018 100644 --- a/scrunch/mutable_dataset.py +++ b/scrunch/mutable_dataset.py @@ -226,7 +226,7 @@ def process_metadata(metadata): alias_common = set(self_alias & dataset_alias) if alias_diff: - diff["variables"]["by_alias"].extend(alias_diff) + diff["variables"]["by_alias"].append(name) # 4. Compare subvariables for array types for com_als in alias_common: From ab39361267642212a23ca1087e6d5ffbcb5949af Mon Sep 17 00:00:00 2001 From: shaikh-ma <88078876+shaikh-ma@users.noreply.github.com> Date: Mon, 27 Apr 2026 17:42:11 +0530 Subject: [PATCH 8/8] Updates as per feedback --- scrunch/mutable_dataset.py | 323 ++++++++++++++++++++++++------------- 1 file changed, 215 insertions(+), 108 deletions(-) diff --git a/scrunch/mutable_dataset.py b/scrunch/mutable_dataset.py index a9eda018..0e2651b2 100644 --- a/scrunch/mutable_dataset.py +++ b/scrunch/mutable_dataset.py @@ -8,7 +8,146 @@ from scrunch.expressions import parse_expr, process_expr from scrunch.helpers import shoji_entity_wrapper -ARRAY_TYPES = frozenset(('multiple_response', 'categorical_array')) +from warnings import warn + +ARRAY_TYPES = frozenset(('multiple_response', 'categorical_array', 'numeric_array')) + + +def compare_datasets(left_ds, right_ds, use_crunch=False): + """ + Compare the difference in structure between datasets. + + The criterion is the following: + (1) variables that, when matched across datasets by alias, have different types. + (2) variables that have the same name but don't match on alias. + (3) for variables that match and have categories, any categories that have the + same id but don't match on name. + (4) for array variables that match, any subvariables that have the same name but + don't match on alias. + (5) array variables that, after assembling the union of their subvariables, + point to subvariables that belong to other ds (Not implemented) + (6) missing rules of the variable. + + :param: left_ds: dataset instance to compare + :param: right_ds: dataset instance to compare with + :param: use_crunch: Use the Crunch comparison to compare + :return: a dictionary of differences + """ + + if use_crunch: + return left_ds.resource.batches.follow('compare', 'dataset={}'.format(right_ds.url)) + + def process_metadata(metadata): + """ Extract & format metadata with required information. """ + return { + v["alias"]: { + "alias": v["alias"], + "name": v["name"], + "type": v["type"], + "categories": v.get("categories", []) + if v["type"] in CATEGORICAL_TYPES + else [], + "subvariables": v.get("subreferences", []) + if v["type"] in ARRAY_TYPES + else [], + "missing_rules": { + k: v['args'][1]['value'] + for k, v in v.get('missing_rules', {}).items() + } + } + for v in metadata.values() + } + + left_ds_meta = process_metadata(left_ds.resource.table["metadata"]) + dataset_meta = process_metadata(right_ds.resource.table["metadata"]) + common_aliases = frozenset(left_ds_meta.keys()) & frozenset(dataset_meta.keys()) + + left_ds_names = {} + right_ds_names = {} + + for n in left_ds_meta.values(): + name = n["name"] + alias = n["alias"] + if left_ds_names.get(name): + left_ds_names[name].append(alias) + else: + left_ds_names[name] = [alias] + + for n in dataset_meta.values(): + name = n["name"] + alias = n["alias"] + if right_ds_names.get(name): + right_ds_names[name].append(alias) + else: + right_ds_names[name] = [alias] + + common_names = frozenset(left_ds_names.keys()) & frozenset(right_ds_names.keys()) + + diff = { + "variables": {"by_type": [], "by_alias": [], "by_missing_rules": []}, + "categories": {}, + "subvariables": {}, + } + + # 1. Compare types and categories by alias + for alias in common_aliases: + left_ds_var, right_ds_var = left_ds_meta[alias], dataset_meta[alias] + + if left_ds_var["type"] != right_ds_var["type"]: + diff["variables"]["by_type"].append(right_ds_var["name"]) + + # 3. Compare category names for categorical variables + if left_ds_var["type"] == right_ds_var["type"] == "categorical": + a_ids = {v["id"]: v["name"] for v in left_ds_var["categories"]} + b_ids = {v["id"]: v["name"] for v in right_ds_var["categories"]} + + mismatched_cats = [ + cat_id + for cat_id in (a_ids.keys() & b_ids.keys()) + if a_ids[cat_id] != b_ids[cat_id] + ] + if mismatched_cats: + diff["categories"][alias] = mismatched_cats + + # 2. Compare aliases, subvariables, and missing rules by name + for name in common_names: + left_ds_alias = set(left_ds_names[name]) + right_ds_alias = set(right_ds_names[name]) + + alias_diff = list(left_ds_alias - right_ds_alias) + alias_common = set(left_ds_alias & right_ds_alias) + + if alias_diff: + diff["variables"]["by_alias"].append(name) + + # 4. Compare subvariables for array types + for com_als in alias_common: + left_ds_var, right_ds_var = left_ds_meta[com_als], dataset_meta[com_als] + + if left_ds_var["type"] == right_ds_var["type"] and left_ds_var["type"] in ARRAY_TYPES: + a_names = {i["name"]: i["alias"] for i in left_ds_var["subvariables"].values()} + b_names = { + i["name"]: i["alias"] for i in right_ds_var["subvariables"].values() + } + + mismatched_subs = [ + b_names[sv_name] + for sv_name in (frozenset(a_names.keys()) & frozenset(b_names.keys())) + if a_names[sv_name] != b_names[sv_name] + ] + if mismatched_subs: + diff["subvariables"][name] = mismatched_subs + + # 6. Compare missing rules for non-categorical types + if ( + left_ds_var["type"] not in CATEGORICAL_TYPES + and right_ds_var["type"] not in CATEGORICAL_TYPES + ): + if left_ds_var["missing_rules"] != right_ds_var["missing_rules"]: + diff["variables"]["by_missing_rules"].append(name) + + return diff + def get_mutable_dataset(dataset, connection=None, editor=False, project=None): """ @@ -121,9 +260,9 @@ def join(self, left_var, right_ds, right_var, columns=None, def compare_dataset(self, dataset, use_crunch=False): """ - Compare the difference in structure between datasets. + compare the difference in structure between datasets. The + criterion is the following: - The criterion is the following: (1) variables that, when matched across datasets by alias, have different types. (2) variables that have the same name but don't match on alias. (3) for variables that match and have categories, any categories that have the @@ -134,126 +273,94 @@ def compare_dataset(self, dataset, use_crunch=False): point to subvariables that belong to other ds (Not implemented) (6) missing rules of the variable. - :param: dataset: Dataset instance to compare with + :param: dataset: Daatset instance to append from :param: use_crunch: Use the Crunch comparison to compare :return: a dictionary of differences NOTE: this sould be done via: http://docs.crunch.io/#post217 but doesn't seem to be a working feature of Crunch """ + warn( + "Deprecated: Use compare_datasets() for optimized comparison operations.", + DeprecationWarning + ) if use_crunch: - return self.resource.batches.follow('compare', 'dataset={}'.format(dataset.url)) - - def process_metadata(metadata): - """ Extract & format metadata with required information. """ - return { - v["alias"]: { - "alias": v["alias"], - "name": v["name"], - "type": v["type"], - "categories": v.get("categories", []) - if v["type"] in CATEGORICAL_TYPES - else [], - "subvariables": v.get("subreferences", []) - if v["type"] in ARRAY_TYPES - else [], - "missing_rules": { - k: v['args'][1]['value'] - for k, v in v.get('missing_rules', {}).items() - } - } - for v in metadata.values() - } - - self_meta = process_metadata(self.resource.table["metadata"]) - dataset_meta = process_metadata(dataset.resource.table["metadata"]) - common_aliases = frozenset(self_meta.keys()) & frozenset(dataset_meta.keys()) - - self_names = {} - dataset_names = {} - - for n in self_meta.values(): - name = n["name"] - alias = n["alias"] - if self_names.get(name): - self_names[name].append(alias) - else: - self_names[name] = [alias] - - for n in dataset_meta.values(): - name = n["name"] - alias = n["alias"] - if dataset_names.get(name): - dataset_names[name].append(alias) - else: - dataset_names[name] = [alias] - - common_names = frozenset(self_names.keys()) & frozenset(dataset_names.keys()) + resp = self.resource.batches.follow( + 'compare', 'dataset={}'.format(dataset.url)) + return resp diff = { - "variables": {"by_type": [], "by_alias": [], "by_missing_rules": []}, - "categories": {}, - "subvariables": {}, + 'variables': { + 'by_type': [], + 'by_alias': [], + 'by_missing_rules': [], + }, + 'categories': {}, + 'subvariables': {} } - # 1. Compare types and categories by alias - for alias in common_aliases: - self_var, dataset_var = self_meta[alias], dataset_meta[alias] - - if self_var["type"] != dataset_var["type"]: - diff["variables"]["by_type"].append(dataset_var["name"]) - - # 3. Compare category names for categorical variables - if self_var["type"] == dataset_var["type"] == "categorical": - a_ids = {v["id"]: v["name"] for v in self_var["categories"]} - b_ids = {v["id"]: v["name"] for v in dataset_var["categories"]} + array_types = ['multiple_response', 'categorical_array'] - mismatched_cats = [ - cat_id - for cat_id in (a_ids.keys() & b_ids.keys()) - if a_ids[cat_id] != b_ids[cat_id] - ] - if mismatched_cats: - diff["categories"][alias] = mismatched_cats + vars_a = {v.alias: v.type for v in self.values()} + vars_b = {v.alias: v.type for v in dataset.values()} - # 2. Compare aliases, subvariables, and missing rules by name + # 1. match variables by alias and compare types + common_aliases = frozenset(vars_a.keys()) & frozenset(vars_b.keys()) + for alias in common_aliases: + if vars_a[alias] != vars_b[alias]: + diff['variables']['by_type'].append(dataset[alias].name) + + # 3. match variable alias and distcint categories names for same id's + if vars_b[alias] == 'categorical' and vars_a[alias] == 'categorical': + a_ids = frozenset([v.id for v in self[alias].categories.values()]) + b_ids = frozenset([v.id for v in dataset[alias].categories.values()]) + common_ids = a_ids & b_ids + + for id in common_ids: + a_name = self[alias].categories[id].name + b_name = dataset[alias].categories[id].name + if a_name != b_name: + if diff['categories'].get(dataset[alias].name): + diff['categories'][dataset[alias].name].append(id) + else: + diff['categories'][dataset[alias].name] = [] + diff['categories'][dataset[alias].name].append(id) + + # 2. match variables by names and compare aliases + common_names = frozenset(self.variable_names()) & frozenset(dataset.variable_names()) for name in common_names: - self_alias = set(self_names[name]) - dataset_alias = set(dataset_names[name]) - - alias_diff = list(self_alias - dataset_alias) - alias_common = set(self_alias & dataset_alias) - - if alias_diff: - diff["variables"]["by_alias"].append(name) - - # 4. Compare subvariables for array types - for com_als in alias_common: - self_var, dataset_var = self_meta[com_als], dataset_meta[com_als] - - if self_var["type"] == dataset_var["type"] and self_var["type"] in ARRAY_TYPES: - a_names = {i["name"]: i["alias"] for i in self_var["subvariables"].values()} - b_names = { - i["name"]: i["alias"] for i in dataset_var["subvariables"].values() - } - - mismatched_subs = [ - b_names[sv_name] - for sv_name in (frozenset(a_names.keys()) & frozenset(b_names.keys())) - if a_names[sv_name] != b_names[sv_name] - ] - if mismatched_subs: - diff["subvariables"][name] = mismatched_subs - - # 6. Compare missing rules for non-categorical types - if ( - self_var["type"] not in CATEGORICAL_TYPES - and dataset_var["type"] not in CATEGORICAL_TYPES - ): - if self_var["missing_rules"] != dataset_var["missing_rules"]: - diff["variables"]["by_missing_rules"].append(name) - + if self[name].alias != dataset[name].alias: + diff['variables']['by_alias'].append(name) + + # 4. array types that match, subvars with same name and != alias + if dataset[name].type == self[name].type and \ + self[name].type in array_types and \ + self[name].type in array_types: + + a_names = frozenset(self[name].variable_names()) + b_names = frozenset(dataset[name].variable_names()) + common_subnames = a_names & b_names + + for sv_name in common_subnames: + if self[name][sv_name].alias != dataset[name][sv_name].alias: + if diff['subvariables'].get(name): + diff['subvariables'][name].append(dataset[name][sv_name].alias) + else: + diff['subvariables'][name] = [] + diff['subvariables'][name].append(dataset[name][sv_name].alias) + + # 6. missing rules mismatch + if self[name].type not in CATEGORICAL_TYPES and dataset[name].type not in CATEGORICAL_TYPES: + if self[name].missing_rules != dataset[name].missing_rules: + rules1 = self[name].missing_rules + rules2 = dataset[name].missing_rules + if len(rules1.keys()) == len(rules2.keys()): + for key, value in rules1.items(): + if key not in rules2 or rules2[key] != value: + diff['variables']['by_missing_rules'].append(name) + else: + diff['variables']['by_missing_rules'].append(name) return diff def append_dataset(self, dataset, filter=None, variables=None,