Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
10 changes: 10 additions & 0 deletions pandas/_libs/tslibs/period.pyi
Original file line number Diff line number Diff line change
Expand Up @@ -31,6 +31,16 @@ def get_period_field_arr(
arr: npt.NDArray[np.int64], # const int64_t[:]
freq: int,
) -> npt.NDArray[np.int64]: ...
# Vectorized counterpart of period_ordinal: converts per-element date/time
# field arrays into period ordinals for the given frequency code.
def period_ordinals_from_fields(
years: npt.NDArray[np.int64],  # const int64_t[:]
months: npt.NDArray[np.int64],  # const int64_t[:]
days: npt.NDArray[np.int64],  # const int64_t[:]
hours: npt.NDArray[np.int64],  # const int64_t[:]
minutes: npt.NDArray[np.int64],  # const int64_t[:]
seconds: npt.NDArray[np.int64],  # const int64_t[:]
freq: int,
validate: bool = ...,  # the .pyx implementation defaults this to False
# Returns (ordinals, first_invalid): first_invalid is the index of the first
# invalid entry, or -1 if all entries are valid or validate is False.
) -> tuple[npt.NDArray[np.int64], int]: ...
def from_calendar_ordinals(
values: npt.NDArray[np.int64], # const int64_t[:]
dtype: PeriodDtypeBase,
Expand Down
62 changes: 62 additions & 0 deletions pandas/_libs/tslibs/period.pyx
Original file line number Diff line number Diff line change
Expand Up @@ -1523,6 +1523,68 @@ cdef accessor _get_accessor_func(str field):
return NULL


@cython.wraparound(False)
@cython.boundscheck(False)
@cython.cdivision(False)
def period_ordinals_from_fields(
    const int64_t[:] years,
    const int64_t[:] months,
    const int64_t[:] days,
    const int64_t[:] hours,
    const int64_t[:] minutes,
    const int64_t[:] seconds,
    int freq,
    bint validate=False,
):
    """
    Vectorized version of period_ordinal: convert arrays of date/time fields
    to an array of period ordinals for the given frequency.

    Parameters
    ----------
    years, months, days, hours, minutes, seconds : int64 arrays
        Field values, one entry per output ordinal. All six arrays must
        have the same length.
    freq : int
        Frequency code passed through to get_period_ordinal.
    validate : bool, default False
        If True, check each date for validity (month 1-12, day 1-N).
        Invalid entries get NPY_NAT. Hours, minutes and seconds are not
        range-checked.

    Returns
    -------
    (ndarray[int64], int)
        The ordinals array, and the index of the first invalid entry
        (-1 if all entries are valid or validate is False).

    Raises
    ------
    ValueError
        If the field arrays do not all have the same length.
    """
    cdef:
        Py_ssize_t i, n = years.shape[0]
        int64_t[::1] result
        npy_datetimestruct dts
        int64_t month, day
        Py_ssize_t first_invalid = -1

    # Bounds checking is disabled for this function, so a shorter array
    # would be read past its end inside the loop. Reject mismatched
    # lengths up front instead.
    if (
        months.shape[0] != n
        or days.shape[0] != n
        or hours.shape[0] != n
        or minutes.shape[0] != n
        or seconds.shape[0] != n
    ):
        raise ValueError("All field arrays must have the same length")

    result = np.empty(n, dtype="i8")

    # Zero the struct once; fields not assigned in the loop (sub-second
    # components) stay at 0 for every element.
    memset(&dts, 0, sizeof(npy_datetimestruct))

    for i in range(n):
        if validate:
            month = months[i]
            day = days[i]
            # The month range is tested before get_days_in_month is called,
            # so that helper only ever sees a month in 1..12.
            if (
                month < 1
                or month > 12
                or day < 1
                or day > get_days_in_month(<int>years[i], month)
            ):
                result[i] = NPY_NAT
                if first_invalid == -1:
                    first_invalid = i
                continue

        dts.year = years[i]
        dts.month = months[i]
        dts.day = days[i]
        dts.hour = hours[i]
        dts.min = minutes[i]
        dts.sec = seconds[i]
        result[i] = get_period_ordinal(&dts, freq)

    # .base is the ndarray backing the memoryview allocated above.
    return result.base, first_invalid


@cython.wraparound(False)
@cython.boundscheck(False)
def from_calendar_ordinals(const int64_t[:] values, PeriodDtypeBase dtype):
Expand Down
39 changes: 28 additions & 11 deletions pandas/core/arrays/period.py
Original file line number Diff line number Diff line change
Expand Up @@ -35,6 +35,7 @@
period as libperiod,
to_offset,
)
from pandas._libs.tslibs.ccalendar import MONTH_NUMBERS
from pandas._libs.tslibs.dtypes import (
FreqGroup,
PeriodDtypeBase,
Expand Down Expand Up @@ -1535,8 +1536,6 @@ def _range_from_fields(
if day is None:
day = 1

ordinals = []

if quarter is not None:
if freq is None:
freq = to_offset("Q", is_period=True)
Expand All @@ -1549,20 +1548,38 @@ def _range_from_fields(

freqstr = freq.freqstr
year, quarter = _make_field_arrays(year, quarter)
for y, q in zip(year, quarter, strict=True):
calendar_year, calendar_month = parsing.quarter_to_myear(y, q, freqstr)
val = libperiod.period_ordinal(
calendar_year, calendar_month, 1, 1, 1, 1, 0, 0, base
)
ordinals.append(val)
year = np.asarray(year, dtype=np.int64)
quarter = np.asarray(quarter, dtype=np.int64)

if (quarter < 1).any() or (quarter > 4).any():
raise ValueError("Quarter must be 1 <= q <= 4")

# Vectorized quarter_to_myear
mnum = MONTH_NUMBERS[parsing.get_rule_month(freqstr)] + 1
months = (mnum + (quarter - 1) * 3) % 12 + 1
years = np.where(months > mnum, year - 1, year)

length = len(years)
ones = np.ones(length, dtype=np.int64)
zeros = np.zeros(length, dtype=np.int64)
ordinals, _ = libperiod.period_ordinals_from_fields(
years, months, ones, zeros, zeros, zeros, base
)
else:
freq = to_offset(freq, is_period=True)
base = libperiod.freq_to_dtype_code(freq)
arrays = _make_field_arrays(year, month, day, hour, minute, second)
for y, mth, d, h, mn, s in zip(*arrays, strict=True):
ordinals.append(libperiod.period_ordinal(y, mth, d, h, mn, s, 0, 0, base))
ordinals, _ = libperiod.period_ordinals_from_fields(
arrays[0].astype(np.int64, copy=False),
arrays[1].astype(np.int64, copy=False),
arrays[2].astype(np.int64, copy=False),
arrays[3].astype(np.int64, copy=False),
arrays[4].astype(np.int64, copy=False),
arrays[5].astype(np.int64, copy=False),
base,
)

return np.array(ordinals, dtype=np.int64), freq
return ordinals, freq


def _make_field_arrays(*fields) -> list[np.ndarray]:
Expand Down
97 changes: 88 additions & 9 deletions pandas/core/tools/datetimes.py
Original file line number Diff line number Diff line change
Expand Up @@ -27,10 +27,13 @@
Timestamp,
astype_overflowsafe,
get_supported_dtype,
iNaT,
is_supported_dtype,
period as libperiod,
timezones as libtimezones,
)
from pandas._libs.tslibs.conversion import cast_from_unit_vectorized
from pandas._libs.tslibs.dtypes import FreqGroup
from pandas._libs.tslibs.parsing import (
DateParseError,
guess_datetime_format,
Expand Down Expand Up @@ -1229,17 +1232,93 @@ def coerce(values):
values = values.astype("int64")
return values

values = (
coerce(arg[unit_rev["year"]]) * 10000
+ coerce(arg[unit_rev["month"]]) * 100
+ coerce(arg[unit_rev["day"]])
# Convert field values to int64 arrays, tracking NaN positions
# (NaN arises when errors="coerce" and a value can't be parsed)
nan_mask = np.zeros(len(arg), dtype=bool)

def _to_int64(vals, default, col_name):
    """
    Convert one field column to an int64 array.

    Non-float input is cast directly. For float input, NaN and
    non-integer entries are recorded in the enclosing ``nan_mask`` and
    replaced by ``default`` before casting; non-integer entries raise
    ValueError instead when the enclosing ``errors`` is "raise".
    """
    arr = np.asarray(vals)
    if is_float_dtype(arr.dtype):
        missing = np.isnan(arr)
        # A float that is not a whole number (e.g. month=1.5) is invalid
        non_integral = np.logical_and(~missing, arr != np.floor(arr))
        if errors == "raise" and non_integral.any():
            raise ValueError(
                f"cannot assemble the datetimes: column {col_name!r} contains "
                f"fractional values"
            )
        invalid = missing | non_integral
        if invalid.any():
            # Remember the positions so the caller can blank them out later,
            # and substitute a placeholder that is safe to cast.
            nan_mask[invalid] = True
            arr = np.where(invalid, default, arr)
        return arr.astype(np.int64)
    return arr.astype(np.int64, copy=False)

field_spec = [
("year", 2000),
("month", 1),
("day", 1),
("h", 0),
("m", 0),
("s", 0),
]
field_arrs = []
for field, default in field_spec:
col_name = unit_rev.get(field)
if col_name is not None:
arr = _to_int64(coerce(arg[col_name]), default=default, col_name=col_name)
else:
arr = np.zeros(len(arg), dtype=np.int64)
field_arrs.append(arr)

# Construct datetime64[us] directly from fields, avoiding the
# object-dtype round-trip through format="%Y%m%d" string parsing.
# Replace NaN-masked entries with valid placeholders to avoid overflow
# in the Cython function; it writes iNaT for invalid dates when
# validate=True.
if nan_mask.any():
for idx, (_, default) in enumerate(field_spec):
field_arrs[idx] = np.where(nan_mask, default, field_arrs[idx])

year_arr, month_arr, day_arr, hour_arr, minute_arr, second_arr = field_arrs

ordinals, first_invalid = libperiod.period_ordinals_from_fields(
year_arr,
month_arr,
day_arr,
hour_arr,
minute_arr,
second_arr,
cast("int", FreqGroup.FR_US.value),
validate=True,
)
try:
values = to_datetime(values, format="%Y%m%d", errors=errors, utc=utc)
except (TypeError, ValueError) as err:
raise ValueError(f"cannot assemble the datetimes: {err}") from err
if first_invalid >= 0 and errors == "raise":
bad_val = (
f"{year_arr[first_invalid]}{month_arr[first_invalid]:02d}"
f"{day_arr[first_invalid]:02d}"
)
raise ValueError(
f'cannot assemble the datetimes: time data "{bad_val}" '
f'doesn\'t match format "%Y%m%d".'
)
# errors="coerce": invalid entries already have iNaT from Cython
if nan_mask.any():
ordinals[nan_mask] = iNaT

dt64_values = ordinals.view("M8[us]")

from pandas import Series

if utc:
dta = DatetimeArray._simple_new(
dt64_values, dtype=DatetimeTZDtype(tz="UTC", unit="us")
)
values = Series(dta, index=arg.index, copy=False)
else:
values = Series(dt64_values, index=arg.index, copy=False)

units: list[UnitChoices] = ["h", "m", "s", "ms", "us", "ns"]
# Add sub-second components as timedeltas
units: list[UnitChoices] = ["ms", "us", "ns"]
for u in units:
value = unit_rev.get(u)
if value is not None and value in arg:
Expand Down
35 changes: 35 additions & 0 deletions pandas/tests/indexes/period/test_constructors.py
Original file line number Diff line number Diff line change
Expand Up @@ -227,6 +227,41 @@ def test_constructor_invalid_quarters(self):
year=range(2000, 2004), quarter=list(range(4)), freq="Q-DEC"
)

def test_constructor_field_arrays_quarter_no_freq(self):
# Quarter with no explicit freq infers Q-DEC
years = np.array([2020, 2020, 2020, 2020], dtype=np.int64)
quarters = np.array([1, 2, 3, 4], dtype=np.int64)
result = PeriodIndex.from_fields(year=years, quarter=quarters)
expected = PeriodIndex(
[Period(year=2020, quarter=q, freq="Q-DEC") for q in range(1, 5)]
)
tm.assert_index_equal(result, expected)

def test_constructor_field_arrays_hourly(self):
# Test non-quarter path with all 6 fields
result = PeriodIndex.from_fields(
year=[2020, 2020],
month=[1, 6],
day=[15, 20],
hour=[10, 22],
minute=[30, 45],
second=[5, 59],
freq="s",
)
expected = PeriodIndex(
[
Period("2020-01-15 10:30:05", freq="s"),
Period("2020-06-20 22:45:59", freq="s"),
]
)
tm.assert_index_equal(result, expected)

def test_constructor_field_arrays_empty(self):
# Empty arrays should produce empty PeriodIndex
result = PeriodIndex.from_fields(year=[], month=[], freq="M")
expected = PeriodIndex([], dtype="period[M]")
tm.assert_index_equal(result, expected)

def test_period_range_fractional_period(self):
msg = "periods must be an integer, got 10.5"
with pytest.raises(TypeError, match=msg):
Expand Down
Loading
Loading