Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
2 changes: 1 addition & 1 deletion python/pyarrow/includes/libarrow_python.pxd
Original file line number Diff line number Diff line change
Expand Up @@ -213,7 +213,7 @@ cdef extern from "arrow/python/api.h" namespace "arrow::py::internal" nogil:
CTimePoint TimePoint_from_ns(int64_t val)

CResult[c_string] TzinfoToString(PyObject* pytzinfo)
CResult[PyObject*] StringToTzinfo(c_string)
CResult[PyObject*] StringToTzinfo(c_string, c_bool)


cdef extern from "arrow/python/numpy_init.h" namespace "arrow::py":
Expand Down
7 changes: 4 additions & 3 deletions python/pyarrow/pandas_compat.py
Original file line number Diff line number Diff line change
Expand Up @@ -787,7 +787,7 @@ def _reconstruct_block(item, columns=None, extension_columns=None, return_block=
def make_datetimetz(unit, tz):
if _pandas_api.is_v1():
unit = 'ns' # ARROW-3789: Coerce date/timestamp types to datetime64[ns]
tz = pa.lib.string_to_tzinfo(tz)
tz = pa.lib.string_to_tzinfo(tz, prefer_zoneinfo=_pandas_api.is_ge_v3())
return _pandas_api.datetimetz_type(unit, tz=tz)


Expand Down Expand Up @@ -1183,7 +1183,8 @@ def _reconstruct_columns_from_metadata(columns, column_indexes):
# ARROW-13756: if index is timezone aware DatetimeIndex
elif pandas_dtype == "datetimetz":
tz = pa.lib.string_to_tzinfo(
column_indexes[0]['metadata']['timezone'])
column_indexes[0]['metadata']['timezone'],
prefer_zoneinfo=_pandas_api.is_ge_v3())
level = pd.to_datetime(level, utc=True).tz_convert(tz)
if _pandas_api.is_ge_v3():
# with pandas 3+, to_datetime returns a unit depending on the string
Expand Down Expand Up @@ -1289,7 +1290,7 @@ def make_tz_aware(series, tz):
"""
Make a datetime64 Series timezone-aware for the given tz
"""
tz = pa.lib.string_to_tzinfo(tz)
tz = pa.lib.string_to_tzinfo(tz, prefer_zoneinfo=_pandas_api.is_ge_v3())
series = (series.dt.tz_localize('utc')
.dt.tz_convert(tz))
return series
9 changes: 8 additions & 1 deletion python/pyarrow/scalar.pxi
Original file line number Diff line number Diff line change
Expand Up @@ -822,7 +822,14 @@ cdef class TimestampScalar(Scalar):
return None

if not dtype.timezone().empty():
tzinfo = string_to_tzinfo(frombytes(dtype.timezone()))
prefer_zoneinfo = True
# only when this method would return a pandas.Timestamp, prefer
# zoneinfo depending on the pandas version
if _pandas_api.have_pandas and dtype.unit() == TimeUnit_NANO:
prefer_zoneinfo = _pandas_api.is_ge_v3()
Comment on lines +825 to +829
Copy link
Copy Markdown
Member Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Suggested change
prefer_zoneinfo = True
# only when this method would return a pandas.Timestamp, prefer
# zoneinfo depending on the pandas version
if _pandas_api.have_pandas and dtype.unit() == TimeUnit_NANO:
prefer_zoneinfo = _pandas_api.is_ge_v3()
# for datetime.datetime output, always prefer zoneinfo over pytz
prefer_zoneinfo = True
if _pandas_api.have_pandas and dtype.unit() == TimeUnit_NANO:
# but if this method returns a pandas.Timestamp (i.e. pandas installed
# and nano unit) -> adjust preference based on the pandas version
# (i.e. keep returning pytz for older pandas)
prefer_zoneinfo = _pandas_api.is_ge_v3()

@AlenkaF would this be clearer?

Copy link
Copy Markdown
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Yes, thanks! 🙏

tzinfo = string_to_tzinfo(
frombytes(dtype.timezone()), prefer_zoneinfo=prefer_zoneinfo
)
else:
tzinfo = None

Expand Down
26 changes: 20 additions & 6 deletions python/pyarrow/src/arrow/python/datetime.cc
Original file line number Diff line number Diff line change
Expand Up @@ -368,13 +368,14 @@ Result<std::string> PyTZInfo_utcoffset_hhmm(PyObject* pytzinfo) {

// Converted from python. See https://github.com/apache/arrow/pull/7604
// for details.
Result<PyObject*> StringToTzinfo(const std::string& tz) {
Result<PyObject*> StringToTzinfo(const std::string& tz, bool prefer_zoneinfo) {
std::string_view sign_str, hour_str, minute_str;
OwnedRef pytz;
OwnedRef zoneinfo;
OwnedRef datetime;

if (internal::ImportModule("pytz", &pytz).ok()) {
// Legacy behavior: prefer pytz objects when available
if (!prefer_zoneinfo && internal::ImportModule("pytz", &pytz).ok()) {
if (MatchFixedOffset(tz, &sign_str, &hour_str, &minute_str)) {
int sign = -1;
if (sign_str == "+") {
Expand Down Expand Up @@ -406,7 +407,7 @@ Result<PyObject*> StringToTzinfo(const std::string& tz) {
return tzinfo;
}

// catch fixed offset if pytz is not present
// Handle fixed offsets with datetime.timezone
if (MatchFixedOffset(tz, &sign_str, &hour_str, &minute_str)) {
RETURN_NOT_OK(internal::ImportModule("datetime", &datetime));
int sign = -1;
Expand Down Expand Up @@ -447,7 +448,7 @@ Result<PyObject*> StringToTzinfo(const std::string& tz) {
return tzinfo;
}

// fallback on zoneinfo if tz is string and pytz is not present
// Use zoneinfo for named timezones when available
if (internal::ImportModule("zoneinfo", &zoneinfo).ok()) {
OwnedRef class_zoneinfo;
RETURN_NOT_OK(
Expand All @@ -456,12 +457,25 @@ Result<PyObject*> StringToTzinfo(const std::string& tz) {
PyUnicode_FromStringAndSize(tz.c_str(), static_cast<Py_ssize_t>(tz.size())));
auto tzinfo =
PyObject_CallFunctionObjArgs(class_zoneinfo.obj(), py_tz_string.obj(), NULL);
if (tzinfo != nullptr) {
return tzinfo;
}

// Keep backwards compatibility for named timezones only available in pytz
PyErr_Clear();
}

if (internal::ImportModule("pytz", &pytz).ok()) {
OwnedRef timezone;
RETURN_NOT_OK(internal::ImportFromModule(pytz.obj(), "timezone", &timezone));
OwnedRef py_tz_string(
PyUnicode_FromStringAndSize(tz.c_str(), static_cast<Py_ssize_t>(tz.size())));
auto tzinfo = PyObject_CallFunctionObjArgs(timezone.obj(), py_tz_string.obj(), NULL);
RETURN_IF_PYERROR();
return tzinfo;
}

return Status::Invalid(
"Pytz package or Python>=3.8 for zoneinfo module must be installed.");
return Status::Invalid("The zoneinfo module or pytz package must be installed.");
}

Result<std::string> TzinfoToString(PyObject* tzinfo) {
Expand Down
2 changes: 1 addition & 1 deletion python/pyarrow/src/arrow/python/datetime.h
Original file line number Diff line number Diff line change
Expand Up @@ -188,7 +188,7 @@ Result<int64_t> PyDateTime_utcoffset_s(PyObject* pydatetime);
/// * An absolute time zone offset of the form +XX:XX or -XX:XX, such as +07:30
/// GIL must be held when calling this method.
ARROW_PYTHON_EXPORT
Result<PyObject*> StringToTzinfo(const std::string& tz);
Result<PyObject*> StringToTzinfo(const std::string& tz, bool prefer_zoneinfo = true);

/// \brief Convert a time zone object to a string representation.
///
Expand Down
34 changes: 26 additions & 8 deletions python/pyarrow/tests/test_pandas.py
Original file line number Diff line number Diff line change
Expand Up @@ -21,6 +21,7 @@
import multiprocessing as mp
import sys
import warnings
import zoneinfo

from collections import OrderedDict
from datetime import date, datetime, time, timedelta, timezone
Expand Down Expand Up @@ -1168,10 +1169,23 @@ def test_python_datetime(self):
def test_python_datetime_with_pytz_tzinfo(self):
pytz = pytest.importorskip("pytz")

for tz in [pytz.utc, pytz.timezone('US/Eastern'), pytz.FixedOffset(1)]:
values = [datetime(2018, 1, 1, 12, 23, 45, tzinfo=tz)]
timezones_pytz = [pytz.utc, pytz.timezone('US/Eastern'), pytz.FixedOffset(1)]
timezones_zoneinfo = [
zoneinfo.ZoneInfo('UTC'),
zoneinfo.ZoneInfo('US/Eastern'),
timezone(timedelta(minutes=1))
]

for tz, tz_zoneinfo in zip(timezones_pytz, timezones_zoneinfo):
values = [tz.localize(datetime(2018, 1, 1, 12, 23, 45))]
df = pd.DataFrame({'datetime': values})
_check_pandas_roundtrip(df)
if Version(pd.__version__) >= Version("3.0.0"):
df_expected = pd.DataFrame(
{'datetime': [datetime(2018, 1, 1, 12, 23, 45, tzinfo=tz_zoneinfo)]}
)
else:
df_expected = None
_check_pandas_roundtrip(df, expected=df_expected)

@h.given(st.none() | past.timezones)
@h.settings(deadline=None)
Expand All @@ -1183,22 +1197,26 @@ def test_python_datetime_with_pytz_timezone(self, tz):
_check_pandas_roundtrip(df, check_dtype=False)

def test_python_datetime_with_timezone_tzinfo(self):
pytz = pytest.importorskip("pytz")
from datetime import timezone

values = [datetime(2018, 1, 1, 12, 23, 45, tzinfo=timezone.utc)]
# also test with index to ensure both paths roundtrip (ARROW-9962)
df = pd.DataFrame({'datetime': values}, index=values)
_check_pandas_roundtrip(df, preserve_index=True)

# datetime.timezone is going to be pytz.FixedOffset
hours = 1
tz_timezone = timezone(timedelta(hours=hours))
tz_pytz = pytz.FixedOffset(hours * 60)
values = [datetime(2018, 1, 1, 12, 23, 45, tzinfo=tz_timezone)]
values_exp = [datetime(2018, 1, 1, 12, 23, 45, tzinfo=tz_pytz)]
df = pd.DataFrame({'datetime': values}, index=values)
df_exp = pd.DataFrame({'datetime': values_exp}, index=values_exp)
if Version(pd.__version__) < Version("3.0.0"):
# datetime.timezone is going to be pytz.FixedOffset
pytz = pytest.importorskip("pytz")
tz_pytz = pytz.FixedOffset(hours * 60)
values_exp = [datetime(2018, 1, 1, 12, 23, 45, tzinfo=tz_pytz)]
df_exp = pd.DataFrame({'datetime': values_exp}, index=values_exp)
else:
df_exp = None

_check_pandas_roundtrip(df, expected=df_exp, preserve_index=True)

def test_python_datetime_subclass(self):
Expand Down
60 changes: 33 additions & 27 deletions python/pyarrow/tests/test_types.py
Original file line number Diff line number Diff line change
Expand Up @@ -20,6 +20,7 @@
from functools import partial
import datetime
import sys
import zoneinfo

import pytest
import hypothesis as h
Expand Down Expand Up @@ -491,35 +492,40 @@ def utcoffset(self, dt):

def test_string_to_tzinfo():
string = ['UTC', 'Europe/Paris', '+03:00', '+01:30', '-02:00']
try:
import pytz
expected = [pytz.utc, pytz.timezone('Europe/Paris'),
pytz.FixedOffset(180), pytz.FixedOffset(90),
pytz.FixedOffset(-120)]
result = [pa.lib.string_to_tzinfo(i) for i in string]
assert result == expected

except ImportError:
try:
import zoneinfo
expected = [zoneinfo.ZoneInfo(key='UTC'),
zoneinfo.ZoneInfo(key='Europe/Paris'),
datetime.timezone(datetime.timedelta(hours=3)),
datetime.timezone(
datetime.timedelta(hours=1, minutes=30)),
datetime.timezone(-datetime.timedelta(hours=2))]
result = [pa.lib.string_to_tzinfo(i) for i in string]
assert result == expected

except ImportError:
pytest.skip('requires pytz or zoneinfo to be installed')


def test_timezone_string_roundtrip_pytz():
result = [pa.lib.string_to_tzinfo(i) for i in string]
expected = [
zoneinfo.ZoneInfo('UTC'),
zoneinfo.ZoneInfo('Europe/Paris'),
datetime.timezone(datetime.timedelta(hours=3)),
datetime.timezone(datetime.timedelta(hours=1, minutes=30)),
datetime.timezone(-datetime.timedelta(hours=2)),
]
assert result == expected


def test_string_to_tzinfo_prefer_zoneinfo_false():
# Checks that string_to_tzinfo(..., prefer_zoneinfo=False) yields pytz
# objects for both a named time zone and a fixed-offset string.
# The test is skipped when pytz cannot be imported.
pytz = pytest.importorskip("pytz")
# Named time zone: expected to compare equal to pytz.timezone(<same name>).
result = pa.lib.string_to_tzinfo("Europe/Brussels", prefer_zoneinfo=False)
assert result == pytz.timezone("Europe/Brussels")
# Fixed offset "+01:30": expected to compare equal to pytz.FixedOffset(90),
# i.e. an offset of 90 minutes.
result = pa.lib.string_to_tzinfo("+01:30", prefer_zoneinfo=False)
assert result == pytz.FixedOffset(90)


@pytest.mark.skipif(
sys.platform == 'darwin', reason="macOS supports those lower-case names"
)
def test_string_to_tzinfo_pytz_fallback():
# Checks that a lower-case zone name, looked up with the default
# prefer_zoneinfo setting, still resolves to the pytz time zone.
# Skipped on macOS (see the skipif reason) and when pytz cannot be imported.
# NOTE(review): presumably zoneinfo rejects "europe/brussels" on the
# remaining platforms, so the pytz fallback is what gets exercised - confirm.
pytz = pytest.importorskip("pytz")
result = pa.lib.string_to_tzinfo("europe/brussels")
expected = pytz.timezone("Europe/Brussels")
assert result == expected


tz = [pytz.FixedOffset(90), pytz.FixedOffset(-90),
pytz.utc, pytz.timezone('America/New_York')]
def test_timezone_string_roundtrip():
tz = [datetime.timezone(datetime.timedelta(hours=1, minutes=30)),
datetime.timezone(datetime.timedelta(hours=-1, minutes=-30)),
zoneinfo.ZoneInfo('UTC'),
zoneinfo.ZoneInfo('America/New_York')]
name = ['+01:30', '-01:30', 'UTC', 'America/New_York']

assert [pa.lib.tzinfo_to_string(i) for i in tz] == name
Expand Down
12 changes: 9 additions & 3 deletions python/pyarrow/types.pxi
Original file line number Diff line number Diff line change
Expand Up @@ -4166,7 +4166,7 @@ def tzinfo_to_string(tz):
return frombytes(GetResultValue(TzinfoToString(<PyObject*>tz)))


def string_to_tzinfo(name):
def string_to_tzinfo(name, *, prefer_zoneinfo=True):
"""
Convert a time zone name into a time zone object.

Expand All @@ -4177,15 +4177,21 @@ def string_to_tzinfo(name):

Parameters
----------
name: str
name: str
Time zone name.
prefer_zoneinfo : bool, default True
If True, resolve named timezones using ``zoneinfo`` first and only
fall back to ``pytz`` when needed. If False, prefer ``pytz`` when it
is available.

Returns
-------
tz : datetime.tzinfo
Time zone object
"""
cdef PyObject* tz = GetResultValue(StringToTzinfo(name.encode('utf-8')))
cdef PyObject* tz = GetResultValue(
StringToTzinfo(name.encode('utf-8'), prefer_zoneinfo)
)
return PyObject_to_object(tz)


Expand Down
Loading