Skip to content
Closed
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
39 changes: 28 additions & 11 deletions pandas/core/reshape/melt.py
Original file line number Diff line number Diff line change
Expand Up @@ -186,25 +186,42 @@ def melt(
value_vars = ensure_list_vars(value_vars, "value_vars", frame.columns)

# GH61475 - prevent AttributeError when duplicate column in id_vars
if len(frame.columns.get_indexer_for(id_vars)) > len(id_vars):
raise ValueError("id_vars cannot contain duplicate columns.")
duplicate_id_cols = [col for col in id_vars if frame.columns.tolist().count(col) > 1]
if duplicate_id_cols:
raise ValueError(
f"id_vars contains columns with duplicate labels in the DataFrame: "
f"{duplicate_id_cols}. Please rename these columns before melting."
)

if id_vars or value_vars:
if col_level is not None:
level = frame.columns.get_level_values(col_level)
else:
level = frame.columns
labels = id_vars + value_vars
idx = level.get_indexer_for(labels)
missing = idx == -1
if missing.any():
missing_labels = [
lab for lab, not_found in zip(labels, missing, strict=True) if not_found
]

# Check id_vars and value_vars separately for clearer error messages
id_idx = level.get_indexer_for(id_vars)
missing_id = [
lab for lab, not_found in zip(id_vars, id_idx == -1, strict=True)
if not_found
]
if missing_id:
raise KeyError(
"The following id_vars or value_vars are not present in "
f"the DataFrame: {missing_labels}"
f"The following id_vars are not present in the DataFrame: {missing_id}"
)

value_idx = level.get_indexer_for(value_vars)
missing_value = [
lab for lab, not_found in zip(value_vars, value_idx == -1, strict=True)
if not_found
]
if missing_value:
raise KeyError(
"The following value_vars are not present in the DataFrame: "
f"{missing_value}"
)

idx = level.get_indexer_for(id_vars + value_vars)
if value_vars_was_not_none:
frame = frame.iloc[:, algos.unique(idx)]
else:
Expand Down
40 changes: 32 additions & 8 deletions pandas/tests/reshape/test_melt.py
Original file line number Diff line number Diff line change
Expand Up @@ -331,27 +331,38 @@ def test_melt_missing_columns_raises(self):
)

# Try to melt with missing `value_vars` column name
msg = "The following id_vars or value_vars are not present in the DataFrame:"
with pytest.raises(KeyError, match=msg):
with pytest.raises(
KeyError,
match="The following value_vars are not present in the DataFrame:",
):
df.melt(["a", "b"], ["C", "d"])

# Try to melt with missing `id_vars` column name
with pytest.raises(KeyError, match=msg):
with pytest.raises(
KeyError,
match="The following id_vars are not present in the DataFrame:",
):
df.melt(["A", "b"], ["c", "d"])

# Multiple missing
# Multiple missing id_vars
with pytest.raises(
KeyError,
match=msg,
match="The following id_vars are not present in the DataFrame:",
):
df.melt(["a", "b", "not_here", "or_there"], ["c", "d"])

# Multiindex melt fails if column is missing from multilevel melt
df.columns = [list("ABCD"), list("abcd")]
with pytest.raises(KeyError, match=msg):
with pytest.raises(
KeyError,
match="The following id_vars are not present in the DataFrame:",
):
df.melt([("E", "a")], [("B", "b")])
# Multiindex fails if column is missing from single level melt
with pytest.raises(KeyError, match=msg):
with pytest.raises(
KeyError,
match="The following value_vars are not present in the DataFrame:",
):
df.melt(["A"], ["F"], col_level=0)

def test_melt_mixed_int_str_id_vars(self):
Expand Down Expand Up @@ -558,11 +569,24 @@ def test_melt_multiindex_columns_var_name_too_many(self):
def test_melt_duplicate_column_header_raises(self):
    # GH61475 - melting on an id_var whose label appears more than once in
    # the frame's columns must raise a ValueError that names the label.
    df = DataFrame([[1, 2, 3], [3, 4, 5]], columns=["A", "A", "B"])
    # ``match`` is a regular expression, so the brackets and periods are
    # escaped to be matched literally.
    msg = (
        r"id_vars contains columns with duplicate labels in the DataFrame: "
        r"\['A'\]\. Please rename these columns before melting\."
    )

    with pytest.raises(ValueError, match=msg):
        df.melt(id_vars=["A"], value_vars=["B"])

def test_melt_duplicate_column_header_names_in_error(self):
    # GH61475 - the ValueError has to call out which label is duplicated
    rows = [[1, 2, 3, 4], [5, 6, 7, 8]]
    frame = DataFrame(rows, columns=["X", "X", "Y", "Z"])
    # Adjacent raw literals concatenate into a single regex pattern; the
    # brackets are escaped so they match literally.
    expected = (
        r"id_vars contains columns with duplicate labels in the DataFrame: "
        r"\['X'\]"
    )

    with pytest.raises(ValueError, match=expected):
        frame.melt(id_vars=["X"], value_vars=["Y"])


class TestLreshape:
def test_pairs(self):
Expand Down
Loading