-
-
Notifications
You must be signed in to change notification settings - Fork 19.9k
Parquet IO: also use zoneinfo timezones by default even when pyarrow uses pytz #65134
New issue
Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.
By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.
Already on GitHub? Sign in to your account
base: main
Are you sure you want to change the base?
Changes from 2 commits
b22fbc8
524ff58
6cab7d7
6b1fcaf
55d4e4b
6a11335
bca10e8
01a8c45
0e5e05b
File filter
Filter by extension
Conversations
Jump to
Diff view
Diff view
There are no files selected for viewing
| Original file line number | Diff line number | Diff line change |
|---|---|---|
| @@ -1,5 +1,6 @@ | ||
| from __future__ import annotations | ||
|
|
||
| import datetime as dt | ||
| from typing import ( | ||
| TYPE_CHECKING, | ||
| Literal, | ||
|
|
@@ -10,6 +11,7 @@ | |
| from pandas._config import using_string_dtype | ||
|
|
||
| from pandas._libs import lib | ||
| from pandas._libs.tslibs import timezones | ||
| from pandas.compat import ( | ||
| pa_version_under18p0, | ||
| pa_version_under19p0, | ||
|
|
@@ -35,6 +37,9 @@ | |
| ) | ||
|
|
||
|
|
||
| pytz = import_optional_dependency("pytz", errors="ignore") | ||
|
|
||
|
|
||
| def _arrow_dtype_mapping() -> dict: | ||
| pa = import_optional_dependency("pyarrow") | ||
| return { | ||
|
|
@@ -120,7 +125,9 @@ def arrow_table_to_pandas( | |
| raise NotImplementedError | ||
|
|
||
| df = table.to_pandas(types_mapper=types_mapper, **to_pandas_kwargs) | ||
| return _post_convert_dtypes(df, dtype_backend, dtype, names) | ||
| df = _post_convert_dtypes(df, dtype_backend, dtype, names) | ||
| df = _normalize_timezone_dtypes(df) | ||
| return df | ||
|
|
||
|
|
||
| def _post_convert_dtypes( | ||
|
|
@@ -189,3 +196,68 @@ def _post_convert_dtypes( | |
| df[col] = df[col].astype(cat_dtype) | ||
|
|
||
| return df | ||
|
|
||
|
|
||
| def _normalize_pytz_timezone(tz: dt.tzinfo) -> dt.tzinfo: | ||
| """ | ||
| If the input tz is a pytz timezone, attempt to convert it to "default" | ||
| tzinfo object (zoneinfo or datetime.timezone). | ||
jorisvandenbossche marked this conversation as resolved.
Show resolved
Hide resolved
|
||
| """ | ||
| if not type(tz).__module__.startswith("pytz"): | ||
| # isinstance(col.dtype.tz, pytz.BaseTzInfo) does not included | ||
| # fixed offsets | ||
| return tz | ||
|
|
||
| if timezones.is_utc(tz): | ||
| return timezones.maybe_get_tz("UTC") | ||
|
|
||
| if timezones.is_fixed_offset(tz): | ||
| # Convert pytz fixed offset to datetime.timezone | ||
| try: | ||
| offset = tz.utcoffset(None) | ||
| if offset is not None: | ||
| return dt.timezone(offset) | ||
| except Exception: | ||
|
Member
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. what can go wrong here?
Member
Author
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. I was repeating the same pattern from below which I wrote first for zones, but I suppose here there should never be an error (a pytz FixedOffset should always have an offset, which is returned from utcoffset() regardless of the value being passed). Will update
Member
Author
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. Looking back: That said, such "fixed" zones should probably not be converted to a fixed offset with datetime.timezone, but to a zoneinfo object when possible. So will switch the order here and first try to convert to zoneinfo |
||
| pass | ||
|
|
||
| zone = timezones.get_timezone(tz) | ||
| if isinstance(zone, str): | ||
| try: | ||
| return timezones.maybe_get_tz(zone) | ||
| except Exception: | ||
| # some pytz timezones might not be available for zoneinfo | ||
| pass | ||
|
|
||
| return tz | ||
|
|
||
|
|
||
def _normalize_timezone_index(index: pd.Index) -> pd.Index:
    """
    Return ``index`` with pytz timezones replaced by default tzinfo objects.

    Parameters
    ----------
    index : pd.Index
        Index to normalize. A ``pd.MultiIndex`` is handled level by level.

    Returns
    -------
    pd.Index
        For a ``pd.MultiIndex``, the result of ``set_levels`` with every
        level normalized. For an index with a ``pd.DatetimeTZDtype`` whose
        timezone is replaced by ``_normalize_pytz_timezone``, the index
        converted to the replacement timezone. Otherwise ``index`` itself.
    """
    if isinstance(index, pd.MultiIndex):
        # Recurse into each level and rebuild the MultiIndex from them.
        return index.set_levels(
            [_normalize_timezone_index(lvl) for lvl in index.levels]
        )

    if not isinstance(index.dtype, pd.DatetimeTZDtype):
        return index

    current_tz = index.dtype.tz
    target_tz = _normalize_pytz_timezone(current_tz)
    if target_tz is current_tz:
        # The helper handed back the very same object: nothing to convert.
        return index
    return index.tz_convert(target_tz)
|
|
||
|
|
||
def _normalize_timezone_dtypes(df: pd.DataFrame) -> pd.DataFrame:
    """
    Replace pytz timezones on a DataFrame with default tzinfo objects.

    Covers the timezone-aware columns as well as the row index and the
    column labels. The frame is modified in place and also returned.

    Parameters
    ----------
    df : pd.DataFrame
        Frame whose columns, index and column labels are normalized.

    Returns
    -------
    pd.DataFrame
        The same ``df`` object that was passed in.
    """
    if pytz is None:
        # pytz could not be imported. The conversion helper only rewrites
        # tzinfo objects whose type lives in a pytz module, so there is
        # nothing to convert.
        return df

    # Convert any pytz timezones to zoneinfo / fixed offset timezones.
    # Check the unique dtypes first so frames without any tz-aware column
    # skip the per-column work.
    # NOTE(review): review feedback on this loop asked whether an existing
    # helper could replace the per-column iteration - follow up.
    has_tz_column = any(
        isinstance(unique_dtype, pd.DatetimeTZDtype)
        for unique_dtype in df._mgr.get_unique_dtypes()
    )
    if has_tz_column:
        for position in df._select_dtypes_indices(pd.DatetimeTZDtype):
            column = df.iloc[:, position]
            current_tz = column.dtype.tz
            target_tz = _normalize_pytz_timezone(current_tz)
            if target_tz is not current_tz:
                # Positional assignment; only columns whose timezone was
                # actually replaced are touched.
                df.isetitem(position, column.dt.tz_convert(target_tz))

    df.index = _normalize_timezone_index(df.index)
    df.columns = _normalize_timezone_index(df.columns)
    return df
Uh oh!
There was an error while loading. Please reload this page.