diff --git a/pandas/core/reshape/melt.py b/pandas/core/reshape/melt.py
index e8423394750e1..13936c7896f95 100644
--- a/pandas/core/reshape/melt.py
+++ b/pandas/core/reshape/melt.py
@@ -186,25 +186,42 @@ def melt(
     value_vars = ensure_list_vars(value_vars, "value_vars", frame.columns)
 
     # GH61475 - prevent AttributeError when duplicate column in id_vars
-    if len(frame.columns.get_indexer_for(id_vars)) > len(id_vars):
-        raise ValueError("id_vars cannot contain duplicate columns.")
+    # Materialize the column labels once rather than once per id_var.
+    column_labels = frame.columns.tolist()
+    duplicate_id_cols = [col for col in id_vars if column_labels.count(col) > 1]
+    if duplicate_id_cols:
+        raise ValueError(
+            "id_vars contains columns with duplicate labels in the DataFrame: "
+            f"{duplicate_id_cols}. Please rename these columns before melting."
+        )
 
     if id_vars or value_vars:
         if col_level is not None:
             level = frame.columns.get_level_values(col_level)
         else:
             level = frame.columns
         labels = id_vars + value_vars
         idx = level.get_indexer_for(labels)
-        missing = idx == -1
-        if missing.any():
-            missing_labels = [
-                lab for lab, not_found in zip(labels, missing, strict=True) if not_found
-            ]
-            raise KeyError(
-                "The following id_vars or value_vars are not present in "
-                f"the DataFrame: {missing_labels}"
-            )
+        if (idx == -1).any():
+            # Report id_vars and value_vars separately for clearer messages.
+            # The per-argument lookup runs only once a miss is known:
+            # get_indexer_for can return more entries than labels on
+            # non-unique columns, where an unconditional strict zip raises.
+            for arg_name, arg_labels in (
+                ("id_vars", id_vars),
+                ("value_vars", value_vars),
+            ):
+                mask = level.get_indexer_for(arg_labels) == -1
+                if mask.any():
+                    missing_labels = [
+                        lab
+                        for lab, not_found in zip(arg_labels, mask, strict=True)
+                        if not_found
+                    ]
+                    raise KeyError(
+                        f"The following {arg_name} are not present in the "
+                        f"DataFrame: {missing_labels}"
+                    )
         if value_vars_was_not_none:
             frame = frame.iloc[:, algos.unique(idx)]
         else:
diff --git a/pandas/tests/reshape/test_melt.py 
b/pandas/tests/reshape/test_melt.py
index d347d42ee6192..5710ea0f69a30 100644
--- a/pandas/tests/reshape/test_melt.py
+++ b/pandas/tests/reshape/test_melt.py
@@ -331,27 +331,38 @@ def test_melt_missing_columns_raises(self):
         )
 
         # Try to melt with missing `value_vars` column name
-        msg = "The following id_vars or value_vars are not present in the DataFrame:"
-        with pytest.raises(KeyError, match=msg):
+        with pytest.raises(
+            KeyError,
+            match="The following value_vars are not present in the DataFrame:",
+        ):
             df.melt(["a", "b"], ["C", "d"])
 
         # Try to melt with missing `id_vars` column name
-        with pytest.raises(KeyError, match=msg):
+        with pytest.raises(
+            KeyError,
+            match="The following id_vars are not present in the DataFrame:",
+        ):
             df.melt(["A", "b"], ["c", "d"])
 
-        # Multiple missing
+        # Multiple missing id_vars
         with pytest.raises(
             KeyError,
-            match=msg,
+            match="The following id_vars are not present in the DataFrame:",
         ):
             df.melt(["a", "b", "not_here", "or_there"], ["c", "d"])
 
         # Multiindex melt fails if column is missing from multilevel melt
         df.columns = [list("ABCD"), list("abcd")]
-        with pytest.raises(KeyError, match=msg):
+        with pytest.raises(
+            KeyError,
+            match="The following id_vars are not present in the DataFrame:",
+        ):
             df.melt([("E", "a")], [("B", "b")])
         # Multiindex fails if column is missing from single level melt
-        with pytest.raises(KeyError, match=msg):
+        with pytest.raises(
+            KeyError,
+            match="The following value_vars are not present in the DataFrame:",
+        ):
             df.melt(["A"], ["F"], col_level=0)
 
     def test_melt_mixed_int_str_id_vars(self):
@@ -558,11 +569,25 @@ def test_melt_multiindex_columns_var_name_too_many(self):
     def test_melt_duplicate_column_header_raises(self):
         # GH61475
         df = DataFrame([[1, 2, 3], [3, 4, 5]], columns=["A", "A", "B"])
-        msg = "id_vars cannot contain duplicate columns."
+        msg = (
+            r"id_vars contains columns with duplicate labels in the DataFrame: "
+            r"\['A'\]\. Please rename these columns before melting\."
+        )
 
         with pytest.raises(ValueError, match=msg):
             df.melt(id_vars=["A"], value_vars=["B"])
 
+    def test_melt_duplicate_column_header_names_in_error(self):
+        # GH61475 - error message should name the specific duplicate column(s)
+        df = DataFrame([[1, 2, 3, 4], [5, 6, 7, 8]], columns=["X", "X", "Y", "Z"])
+        msg = (
+            r"id_vars contains columns with duplicate labels in the DataFrame: "
+            r"\['X'\]"
+        )
+
+        with pytest.raises(ValueError, match=msg):
+            df.melt(id_vars=["X"], value_vars=["Y"])
+
 
 class TestLreshape:
     def test_pairs(self):