diff --git a/doc/source/whatsnew/v3.0.3.rst b/doc/source/whatsnew/v3.0.3.rst index f8987257b4858..9b1b50a4ff114 100644 --- a/doc/source/whatsnew/v3.0.3.rst +++ b/doc/source/whatsnew/v3.0.3.rst @@ -8,6 +8,20 @@ including other versions of pandas. {{ header }} +.. --------------------------------------------------------------------------- +.. _whatsnew_303.enhancements: + +Enhancements +~~~~~~~~~~~~ +- Starting with pandas 3.0.0, time zones are represented by default using the + standard library's :mod:`zoneinfo` module (or ``datetime.timezone`` for fixed + offsets) instead of using ``pytz`` (:ref:`release note <whatsnew_300.api_breaking.pytz>`). + + The IO methods using ``pyarrow`` under the hood such as :func:`read_parquet`, + :func:`read_feather` and :func:`read_orc` (or :func:`read_csv` when specifying + the engine) were still returning ``pytz`` time zones. Those have now been + updated to consistently use the default ``zoneinfo`` time zones as well (:issue:`65134`). + .. --------------------------------------------------------------------------- .. _whatsnew_303.regressions: diff --git a/pandas/core/frame.py b/pandas/core/frame.py index dc09e24437087..5459103c2ee85 100644 --- a/pandas/core/frame.py +++ b/pandas/core/frame.py @@ -5446,7 +5446,7 @@ def predicate(arr: ArrayLike) -> bool: return True - blk_dtypes = [blk.dtype for blk in self._mgr.blocks] + blk_dtypes = self._mgr.get_unique_dtypes() if ( np.object_ in include and str not in include @@ -5473,6 +5473,18 @@ def predicate(arr: ArrayLike) -> bool: mgr = self._mgr._get_data_subset(predicate).copy(deep=False) return self._constructor_from_mgr(mgr, axes=mgr.axes).__finalize__(self) + def _select_dtypes_indices(self, dtype_class) -> np.ndarray: + """ + Return the indices of the columns of a given dtype. + + Currently only works given a class, so mostly useful for ExtensionDtypes. 
+ """ + + def predicate(arr: ArrayLike) -> bool: + return isinstance(arr.dtype, dtype_class) + + return self._mgr._get_data_subset_indices(predicate) + def insert( self, loc: int, diff --git a/pandas/core/generic.py b/pandas/core/generic.py index 3dac395410399..df0e740f00f4f 100644 --- a/pandas/core/generic.py +++ b/pandas/core/generic.py @@ -7130,7 +7130,7 @@ def fillna( if axis == 1: # Check that all columns in result have the same dtype # otherwise don't bother with fillna and losing accurate dtypes - unique_dtypes = algos.unique(self._mgr.get_dtypes()) + unique_dtypes = self._mgr.get_unique_dtypes() if len(unique_dtypes) > 1: raise ValueError( "All columns must have the same dtype, but got dtypes: " diff --git a/pandas/core/internals/managers.py b/pandas/core/internals/managers.py index d520901e36812..3a447e1c64699 100644 --- a/pandas/core/internals/managers.py +++ b/pandas/core/internals/managers.py @@ -336,6 +336,9 @@ def references_same_values(self, mgr: BaseBlockManager, blkno: int) -> bool: blk = self.blocks[blkno] return any(blk is ref() for ref in mgr.blocks[blkno].refs.referenced_blocks) + def get_unique_dtypes(self) -> npt.NDArray[np.object_]: + return algos.unique(np.array([blk.dtype for blk in self.blocks], dtype=object)) + def get_dtypes(self) -> npt.NDArray[np.object_]: dtypes = np.array([blk.dtype for blk in self.blocks], dtype=object) return dtypes.take(self.blknos) @@ -656,6 +659,11 @@ def _get_data_subset(self, predicate: Callable) -> Self: blocks = [blk for blk in self.blocks if predicate(blk.values)] return self._combine(blocks) + def _get_data_subset_indices(self, predicate: Callable) -> np.ndarray: + blocks = [blk for blk in self.blocks if predicate(blk.values)] + indexer = np.sort(np.concatenate([b.mgr_locs.as_array for b in blocks])) + return indexer + def get_bool_data(self) -> Self: """ Select blocks that are bool-dtype and columns from object-dtype blocks diff --git a/pandas/io/_util.py b/pandas/io/_util.py index 
73ee1f73710b7..03247a17dab9e 100644 --- a/pandas/io/_util.py +++ b/pandas/io/_util.py @@ -1,15 +1,18 @@ from __future__ import annotations +import datetime as dt from typing import ( TYPE_CHECKING, Literal, ) +import zoneinfo import numpy as np from pandas._config import using_string_dtype from pandas._libs import lib +from pandas._libs.tslibs import timezones from pandas.compat import ( pa_version_under18p0, pa_version_under19p0, @@ -35,6 +38,9 @@ ) +pytz = import_optional_dependency("pytz", errors="ignore") + + def _arrow_dtype_mapping() -> dict: pa = import_optional_dependency("pyarrow") return { @@ -120,7 +126,9 @@ def arrow_table_to_pandas( raise NotImplementedError df = table.to_pandas(types_mapper=types_mapper, **to_pandas_kwargs) - return _post_convert_dtypes(df, dtype_backend, dtype, names) + df = _post_convert_dtypes(df, dtype_backend, dtype, names) + df = _normalize_timezone_dtypes(df) + return df def _post_convert_dtypes( @@ -203,3 +211,78 @@ def _post_convert_dtypes( df[col] = df[col].astype(cat_dtype) return df + + +def _normalize_pytz_timezone(tz: dt.tzinfo) -> dt.tzinfo: + """ + If the input tz is a pytz timezone, attempt to convert it to a "default" + tzinfo object (zoneinfo or datetime.timezone). 
+ """ + if not type(tz).__module__.startswith("pytz"): + # isinstance(col.dtype.tz, pytz.BaseTzInfo) does not include + # fixed offsets + return tz + + if timezones.is_utc(tz): + return dt.timezone.utc + + if tz.zone is not None: # type: ignore[attr-defined] + try: + return zoneinfo.ZoneInfo(tz.zone) # type: ignore[attr-defined] + except Exception: + # some pytz timezones might not be available for zoneinfo + pass + + if timezones.is_fixed_offset(tz): + # Convert pytz fixed offset to datetime.timezone + try: + offset = tz.utcoffset(None) + if offset is not None: + return dt.timezone(offset) + except Exception: + pass + + return tz + + +def _normalize_timezone_index(index: pd.Index) -> pd.Index: + if isinstance(index, pd.MultiIndex): + if any(isinstance(level.dtype, pd.DatetimeTZDtype) for level in index.levels): + levels = [_normalize_timezone_index(level) for level in index.levels] + return index.set_levels(levels) + + return index + + if isinstance(index.dtype, pd.DatetimeTZDtype): + normalized_tz = _normalize_pytz_timezone(index.dtype.tz) + if normalized_tz is not index.dtype.tz: + return index.tz_convert(normalized_tz) # type: ignore[attr-defined] + + return index + + +def _normalize_timezone_dtypes(df: pd.DataFrame) -> pd.DataFrame: + """ + PyArrow uses pytz by default for timezones, but pandas uses + zoneinfo / datetime.timezone since pandas 3.0. + + TODO: Starting with pyarrow 25, it will use zoneinfo by default, and then + this normalization can be skipped (https://github.com/apache/arrow/pull/49694). 
+ """ + if pytz is not None: + # Convert any pytz timezones to zoneinfo / fixed offset timezones + if any( + isinstance(dtype, pd.DatetimeTZDtype) + for dtype in df._mgr.get_unique_dtypes() + ): + col_indices = df._select_dtypes_indices(pd.DatetimeTZDtype) + for i in col_indices: + col = df.iloc[:, i] + normalized_tz = _normalize_pytz_timezone(col.dtype.tz) + if normalized_tz is not col.dtype.tz: + df.isetitem(i, col.dt.tz_convert(normalized_tz)) + + df.index = _normalize_timezone_index(df.index) + df.columns = _normalize_timezone_index(df.columns) + + return df diff --git a/pandas/tests/io/parser/test_parse_dates.py b/pandas/tests/io/parser/test_parse_dates.py index 7aed7acb8e50d..41effd4c2896e 100644 --- a/pandas/tests/io/parser/test_parse_dates.py +++ b/pandas/tests/io/parser/test_parse_dates.py @@ -218,13 +218,9 @@ def test_parse_tz_aware(all_parsers): {"x": [0.5]}, index=Index([Timestamp("2012-06-13 01:39:00+00:00")], name="Date") ) if parser.engine == "pyarrow": - pytz = pytest.importorskip("pytz") - expected_tz = pytz.utc expected.index = expected.index.as_unit("s") - else: - expected_tz = timezone.utc tm.assert_frame_equal(result, expected) - assert result.index.tz is expected_tz + assert result.index.tz is timezone.utc @pytest.mark.parametrize("kwargs", [{}, {"index_col": "C"}]) diff --git a/pandas/tests/io/test_parquet.py b/pandas/tests/io/test_parquet.py index c5922b6b5a9a4..aff16c58f8c28 100644 --- a/pandas/tests/io/test_parquet.py +++ b/pandas/tests/io/test_parquet.py @@ -1068,31 +1068,7 @@ def test_timestamp_nanoseconds(self, pa, temp_file): def test_timezone_aware_index(self, pa, timezone_aware_date_list, temp_file): idx = 5 * [timezone_aware_date_list] df = pd.DataFrame(index=idx, data={"index_as_col": idx}) - - # see gh-36004 - # compare time(zone) values only, skip their class: - # pyarrow always creates fixed offset timezones using pytz.FixedOffset() - # even if it was datetime.timezone() originally - # - # technically they are the same: - # they 
both implement datetime.tzinfo - # they both wrap datetime.timedelta() - # this use-case sets the resolution to 1 minute - - expected = df[:] - if timezone_aware_date_list.tzinfo != datetime.UTC: - # pyarrow returns pytz.FixedOffset while pandas constructs datetime.timezone - # https://github.com/pandas-dev/pandas/issues/37286 - try: - import pytz - except ImportError: - pass - else: - offset = df.index.tz.utcoffset(timezone_aware_date_list) - tz = pytz.FixedOffset(offset.total_seconds() / 60) - expected.index = expected.index.tz_convert(tz) - expected["index_as_col"] = expected["index_as_col"].dt.tz_convert(tz) - check_round_trip(df, temp_file, pa, check_dtype=False, expected=expected) + check_round_trip(df, temp_file, pa, check_dtype=False) def test_filter_row_groups(self, pa, temp_file): # https://github.com/pandas-dev/pandas/issues/26551 diff --git a/pandas/tests/tslibs/test_timezones.py b/pandas/tests/tslibs/test_timezones.py index c48986c597356..33b05bc34eccd 100644 --- a/pandas/tests/tslibs/test_timezones.py +++ b/pandas/tests/tslibs/test_timezones.py @@ -6,6 +6,7 @@ import subprocess import sys import textwrap +import zoneinfo import dateutil.tz import pytest @@ -191,3 +192,18 @@ def test_maybe_get_tz_offset_only(): tz = timezones.maybe_get_tz("UTC-02:45") assert tz == timezone(-timedelta(hours=2, minutes=45)) + + +def test_normalize_pytz_timezone(): + pytz = pytest.importorskip("pytz") + + from pandas.io._util import _normalize_pytz_timezone + + for tz, expected in [ + (pytz.UTC, timezone.utc), + (pytz.FixedOffset(90), timezone(timedelta(minutes=90))), + (pytz.timezone("America/New_York"), zoneinfo.ZoneInfo("America/New_York")), + (pytz.timezone("Etc/GMT+1"), zoneinfo.ZoneInfo("Etc/GMT+1")), + ]: + result = _normalize_pytz_timezone(tz) + assert result == expected