Skip to content
12 changes: 12 additions & 0 deletions pandas/core/frame.py
Original file line number Diff line number Diff line change
Expand Up @@ -5477,6 +5477,18 @@ def predicate(arr: ArrayLike) -> bool:
mgr = self._mgr._get_data_subset(predicate).copy(deep=False)
return self._constructor_from_mgr(mgr, axes=mgr.axes).__finalize__(self)

def _select_dtypes_indices(self, dtype_class) -> np.ndarray:
"""
Return the indices of the columns of a given dtype.

Currently only works given a class, so mostly useful for ExtensionDtypes.
"""

def predicate(arr: ArrayLike) -> bool:
return isinstance(arr.dtype, dtype_class)

return self._mgr._get_data_subset_indices(predicate)

def insert(
self,
loc: int,
Expand Down
2 changes: 1 addition & 1 deletion pandas/core/generic.py
Original file line number Diff line number Diff line change
Expand Up @@ -7127,7 +7127,7 @@ def fillna(
if axis == 1:
# Check that all columns in result have the same dtype
# otherwise don't bother with fillna and losing accurate dtypes
unique_dtypes = algos.unique(self._mgr.get_dtypes())
unique_dtypes = self._mgr.get_unique_dtypes()
if len(unique_dtypes) > 1:
raise ValueError(
"All columns must have the same dtype, but got dtypes: "
Expand Down
8 changes: 8 additions & 0 deletions pandas/core/internals/managers.py
Original file line number Diff line number Diff line change
Expand Up @@ -336,6 +336,9 @@ def references_same_values(self, mgr: BaseBlockManager, blkno: int) -> bool:
blk = self.blocks[blkno]
return any(blk is ref() for ref in mgr.blocks[blkno].refs.referenced_blocks)

def get_unique_dtypes(self) -> npt.NDArray[np.object_]:
    """
    Return the distinct dtypes found among ``self.blocks``.

    Returns
    -------
    np.ndarray
        Object-dtype array holding each distinct block dtype once, as
        computed by ``algos.unique``.
    """
    # Build an object ndarray first, the same way get_dtypes does:
    # algos.unique expects an array-like, not a plain Python list.
    block_dtypes = np.array([blk.dtype for blk in self.blocks], dtype=object)
    return algos.unique(block_dtypes)

def get_dtypes(self) -> npt.NDArray[np.object_]:
    """
    Return the block dtypes expanded along ``self.blknos``.

    Returns
    -------
    np.ndarray
        Object-dtype array with one entry per element of ``self.blknos``;
        each entry is the dtype of the block at that position in
        ``self.blocks``.
    """
    per_block = [block.dtype for block in self.blocks]
    # dtype=object keeps the dtype instances as opaque elements.
    return np.array(per_block, dtype=object).take(self.blknos)
Expand Down Expand Up @@ -656,6 +659,11 @@ def _get_data_subset(self, predicate: Callable) -> Self:
blocks = [blk for blk in self.blocks if predicate(blk.values)]
return self._combine(blocks)

def _get_data_subset_indices(self, predicate: Callable) -> np.ndarray:
blocks = [blk for blk in self.blocks if predicate(blk.values)]
indexer = np.sort(np.concatenate([b.mgr_locs.as_array for b in blocks]))
return indexer

def get_bool_data(self) -> Self:
"""
Select blocks that are bool-dtype and columns from object-dtype blocks
Expand Down
74 changes: 73 additions & 1 deletion pandas/io/_util.py
Original file line number Diff line number Diff line change
@@ -1,5 +1,6 @@
from __future__ import annotations

import datetime as dt
from typing import (
TYPE_CHECKING,
Literal,
Expand All @@ -10,6 +11,7 @@
from pandas._config import using_string_dtype

from pandas._libs import lib
from pandas._libs.tslibs import timezones
from pandas.compat import (
pa_version_under18p0,
pa_version_under19p0,
Expand All @@ -35,6 +37,9 @@
)


# pytz is treated as optional here: the timezone normalization further down
# in this module checks ``pytz is not None`` before doing pytz-specific work.
pytz = import_optional_dependency("pytz", errors="ignore")


def _arrow_dtype_mapping() -> dict:
pa = import_optional_dependency("pyarrow")
return {
Expand Down Expand Up @@ -120,7 +125,9 @@ def arrow_table_to_pandas(
raise NotImplementedError

df = table.to_pandas(types_mapper=types_mapper, **to_pandas_kwargs)
return _post_convert_dtypes(df, dtype_backend, dtype, names)
df = _post_convert_dtypes(df, dtype_backend, dtype, names)
df = _normalize_timezone_dtypes(df)
return df


def _post_convert_dtypes(
Expand Down Expand Up @@ -189,3 +196,68 @@ def _post_convert_dtypes(
df[col] = df[col].astype(cat_dtype)

return df


def _normalize_pytz_timezone(tz: dt.tzinfo) -> dt.tzinfo:
"""
If the input tz is a pytz timezone, attempt to convert it to "default"
tzinfo object (zoneinfo or datetime.timezone).
"""
if not type(tz).__module__.startswith("pytz"):
# isinstance(col.dtype.tz, pytz.BaseTzInfo) does not included
# fixed offsets
return tz

if timezones.is_utc(tz):
return timezones.maybe_get_tz("UTC")

if timezones.is_fixed_offset(tz):
# Convert pytz fixed offset to datetime.timezone
try:
offset = tz.utcoffset(None)
if offset is not None:
return dt.timezone(offset)
except Exception:
Copy link
Copy Markdown
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

what can go wrong here?

Copy link
Copy Markdown
Member Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

I was repeating the same pattern from below which I wrote first for zones, but I suppose here there should never be an error (a pytz FixedOffset should always have an offset, which is returned from utcoffset() regardless of the value being passed). Will update

Copy link
Copy Markdown
Member Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Looking back: timezones.is_fixed_offset has some logic to detect if a timezone if a fixed offset, and so t does not only return true for FixedOffset, but also for some zones that have no transitions, like "Etc/GMT+1".
And I am not 100% sure that all those cases where timezones.is_fixed_offset returns true will work exactly the same. I mostly want to ensure this never raises an error (because that would introduce a new regression)

That said, such "fixed" zones should probably not be converted to a fixed offset with datetime.timezone, but to a zoneinfo object when possible. So will switch the order here and first try to convert to zoneinfo

pass

zone = timezones.get_timezone(tz)
if isinstance(zone, str):
try:
return timezones.maybe_get_tz(zone)
except Exception:
# some pytz timezones might not be available for zoneinfo
pass

return tz


def _normalize_timezone_index(index: pd.Index) -> pd.Index:
if isinstance(index, pd.MultiIndex):
levels = [_normalize_timezone_index(level) for level in index.levels]
return index.set_levels(levels)

if isinstance(index.dtype, pd.DatetimeTZDtype):
normalized_tz = _normalize_pytz_timezone(index.dtype.tz)
if normalized_tz is not index.dtype.tz:
return index.tz_convert(normalized_tz)

return index


def _normalize_timezone_dtypes(df: pd.DataFrame) -> pd.DataFrame:
    """
    Convert any pytz timezones in ``df`` to zoneinfo / fixed offset timezones.

    Columns with a ``DatetimeTZDtype`` are replaced in place (via
    ``isetitem``) when ``_normalize_pytz_timezone`` returns a different
    timezone object, and the row index and column labels are passed through
    ``_normalize_timezone_index``. The same ``df`` object is returned.
    """
    # NOTE(review): assumes the index/columns normalization sits under the
    # pytz guard -- confirm against the original indentation.
    if pytz is None:
        return df

    # Per-block pre-check, so column positions are only computed when at
    # least one tz-aware column exists.
    has_tz_columns = any(
        isinstance(dtype, pd.DatetimeTZDtype)
        for dtype in df._mgr.get_unique_dtypes()
    )
    if has_tz_columns:
        # _select_dtypes_indices yields positions rather than a materialized
        # sub-frame, which avoids inspecting every single column's dtype.
        for position in df._select_dtypes_indices(pd.DatetimeTZDtype):
            column = df.iloc[:, position]
            current_tz = column.dtype.tz
            target_tz = _normalize_pytz_timezone(current_tz)
            if target_tz is not current_tz:
                df.isetitem(position, column.dt.tz_convert(target_tz))

    df.index = _normalize_timezone_index(df.index)
    df.columns = _normalize_timezone_index(df.columns)
    return df
Loading