diff --git a/pandas/_libs/tslibs/period.pyi b/pandas/_libs/tslibs/period.pyi
index fdff1e48df649..e771800364fa7 100644
--- a/pandas/_libs/tslibs/period.pyi
+++ b/pandas/_libs/tslibs/period.pyi
@@ -31,6 +31,16 @@ def get_period_field_arr(
     arr: npt.NDArray[np.int64],  # const int64_t[:]
     freq: int,
 ) -> npt.NDArray[np.int64]: ...
+def period_ordinals_from_fields(
+    years: npt.NDArray[np.int64],
+    months: npt.NDArray[np.int64],
+    days: npt.NDArray[np.int64],
+    hours: npt.NDArray[np.int64],
+    minutes: npt.NDArray[np.int64],
+    seconds: npt.NDArray[np.int64],
+    freq: int,
+    validate: bool = ...,
+) -> tuple[npt.NDArray[np.int64], int]: ...
 def from_calendar_ordinals(
     values: npt.NDArray[np.int64],  # const int64_t[:]
     dtype: PeriodDtypeBase,
diff --git a/pandas/_libs/tslibs/period.pyx b/pandas/_libs/tslibs/period.pyx
index ff3dd83159d41..5af9c2efe7c72 100644
--- a/pandas/_libs/tslibs/period.pyx
+++ b/pandas/_libs/tslibs/period.pyx
@@ -1523,6 +1523,85 @@ cdef accessor _get_accessor_func(str field):
     return NULL
 
 
+@cython.wraparound(False)
+@cython.boundscheck(False)
+@cython.cdivision(False)
+def period_ordinals_from_fields(
+    const int64_t[:] years,
+    const int64_t[:] months,
+    const int64_t[:] days,
+    const int64_t[:] hours,
+    const int64_t[:] minutes,
+    const int64_t[:] seconds,
+    int freq,
+    bint validate=False,
+):
+    """
+    Vectorized version of period_ordinal: convert arrays of date/time fields
+    to an array of period ordinals for the given frequency.
+
+    Parameters
+    ----------
+    years, months, days, hours, minutes, seconds : int64 arrays
+        Must all have the same length.
+    freq : int
+    validate : bool, default False
+        If True, check each date for validity (month 1-12, day 1-N).
+        Invalid entries get NPY_NAT.
+
+    Returns
+    -------
+    (ndarray[int64], int)
+        The ordinals array, and the index of the first invalid entry
+        (-1 if all entries are valid or validate is False).
+
+    Raises
+    ------
+    ValueError
+        If the field arrays do not all have the same length.
+    """
+    cdef:
+        Py_ssize_t i, n = len(years)
+        int64_t[::1] result = np.empty(n, dtype="i8")
+        npy_datetimestruct dts
+        int64_t month, day
+        Py_ssize_t first_invalid = -1
+
+    # Bounds checking is disabled for this function, so a shorter array
+    # would otherwise be read past its end inside the loop below.
+    if not (
+        len(months) == n
+        and len(days) == n
+        and len(hours) == n
+        and len(minutes) == n
+        and len(seconds) == n
+    ):
+        raise ValueError("all field arrays must have the same length")
+
+    memset(&dts, 0, sizeof(npy_datetimestruct))
+
+    for i in range(n):
+        if validate:
+            month = months[i]
+            day = days[i]
+            if month < 1 or month > 12 or day < 1 or \
+                    day > get_days_in_month(years[i], month):
+                result[i] = NPY_NAT
+                if first_invalid == -1:
+                    first_invalid = i
+                continue
+
+        dts.year = years[i]
+        dts.month = months[i]
+        dts.day = days[i]
+        dts.hour = hours[i]
+        dts.min = minutes[i]
+        dts.sec = seconds[i]
+        result[i] = get_period_ordinal(&dts, freq)
+
+    return result.base, first_invalid
+
+
 @cython.wraparound(False)
 @cython.boundscheck(False)
 def from_calendar_ordinals(const int64_t[:] values, PeriodDtypeBase dtype):
diff --git a/pandas/core/arrays/period.py b/pandas/core/arrays/period.py
index c9e0f47bbe852..764ed53c0abe7 100644
--- a/pandas/core/arrays/period.py
+++ b/pandas/core/arrays/period.py
@@ -35,6 +35,7 @@
     period as libperiod,
     to_offset,
 )
+from pandas._libs.tslibs.ccalendar import MONTH_NUMBERS
 from pandas._libs.tslibs.dtypes import (
     FreqGroup,
     PeriodDtypeBase,
@@ -1535,8 +1536,6 @@ def _range_from_fields(
     if day is None:
         day = 1
 
-    ordinals = []
-
     if quarter is not None:
         if freq is None:
             freq = to_offset("Q", is_period=True)
@@ -1549,20 +1548,38 @@ def _range_from_fields(
 
         freqstr = freq.freqstr
         year, quarter = _make_field_arrays(year, quarter)
-        for y, q in zip(year, quarter, strict=True):
-            calendar_year, calendar_month = parsing.quarter_to_myear(y, q, freqstr)
-            val = libperiod.period_ordinal(
-                calendar_year, calendar_month, 1, 1, 1, 1, 0, 0, base
-            )
-            ordinals.append(val)
+        year = np.asarray(year, dtype=np.int64)
+        quarter = np.asarray(quarter, dtype=np.int64)
+
+        if (quarter < 1).any() or (quarter > 4).any():
+            raise ValueError("Quarter must be 1 <= q <= 4")
+
+        # Vectorized quarter_to_myear
+        mnum = MONTH_NUMBERS[parsing.get_rule_month(freqstr)] + 1
+        months = (mnum + (quarter - 1) * 3) % 12 + 1
+        years = np.where(months > mnum, year - 1, year)
+
+        length = len(years)
+        ones = np.ones(length, dtype=np.int64)
+        zeros = np.zeros(length, dtype=np.int64)
+        ordinals, _ = libperiod.period_ordinals_from_fields(
+            years, months, ones, zeros, zeros, zeros, base
+        )
     else:
         freq = to_offset(freq, is_period=True)
         base = libperiod.freq_to_dtype_code(freq)
         arrays = _make_field_arrays(year, month, day, hour, minute, second)
-        for y, mth, d, h, mn, s in zip(*arrays, strict=True):
-            ordinals.append(libperiod.period_ordinal(y, mth, d, h, mn, s, 0, 0, base))
+        ordinals, _ = libperiod.period_ordinals_from_fields(
+            arrays[0].astype(np.int64, copy=False),
+            arrays[1].astype(np.int64, copy=False),
+            arrays[2].astype(np.int64, copy=False),
+            arrays[3].astype(np.int64, copy=False),
+            arrays[4].astype(np.int64, copy=False),
+            arrays[5].astype(np.int64, copy=False),
+            base,
+        )
 
-    return np.array(ordinals, dtype=np.int64), freq
+    return ordinals, freq
 
 
 def _make_field_arrays(*fields) -> list[np.ndarray]:
diff --git a/pandas/core/tools/datetimes.py b/pandas/core/tools/datetimes.py
index 1505181a53451..4ec0eb03297c1 100644
--- a/pandas/core/tools/datetimes.py
+++ b/pandas/core/tools/datetimes.py
@@ -27,10 +27,13 @@
     Timestamp,
     astype_overflowsafe,
     get_supported_dtype,
+    iNaT,
     is_supported_dtype,
+    period as libperiod,
     timezones as libtimezones,
 )
 from pandas._libs.tslibs.conversion import cast_from_unit_vectorized
+from pandas._libs.tslibs.dtypes import FreqGroup
 from pandas._libs.tslibs.parsing import (
     DateParseError,
     guess_datetime_format,
@@ -1229,17 +1232,113 @@ def coerce(values):
             values = values.astype("int64")
         return values
 
-    values = (
-        coerce(arg[unit_rev["year"]]) * 10000
-        + coerce(arg[unit_rev["month"]]) * 100
-        + coerce(arg[unit_rev["day"]])
+    # Convert field values to int64 arrays, tracking NaN positions
+    # (NaN arises when errors="coerce" and a value can't be parsed)
+    nan_mask = np.zeros(len(arg), dtype=bool)
+
+    def _to_int64(vals, default, col_name):
+        # Cast a column to int64; NaN/fractional entries are flagged in
+        # nan_mask and replaced by ``default`` so the cast is well-defined.
+        arr = np.asarray(vals)
+        if not is_float_dtype(arr.dtype):
+            return arr.astype(np.int64, copy=False)
+        isnan = np.isnan(arr)
+        # Non-integer floats (e.g. month=1.5) are invalid
+        fractional = (~isnan) & (arr != np.floor(arr))
+        if fractional.any() and errors == "raise":
+            raise ValueError(
+                f"cannot assemble the datetimes: column {col_name!r} contains "
+                f"fractional values"
+            )
+        bad = isnan | fractional
+        if bad.any():
+            nan_mask[bad] = True
+            arr = np.where(bad, default, arr)
+        return arr.astype(np.int64)
+
+    def _has_fractional(vals) -> bool:
+        # True if a float column holds any non-NaN, non-integer value
+        arr = np.asarray(vals)
+        if not is_float_dtype(arr.dtype):
+            return False
+        present = arr[~np.isnan(arr)]
+        return bool((present != np.floor(present)).any())
+
+    field_spec = [
+        ("year", 2000),
+        ("month", 1),
+        ("day", 1),
+        ("h", 0),
+        ("m", 0),
+        ("s", 0),
+    ]
+    # Fractional hour/minute/second values are legitimate offsets (e.g. 1.5
+    # seconds), so such columns are deferred to the to_timedelta loop below
+    # instead of being rejected by the integer-only fast path.
+    deferred_units: list[UnitChoices] = []
+    field_arrs = []
+    for field, default in field_spec:
+        col_name = unit_rev.get(field)
+        if col_name is None:
+            arr = np.zeros(len(arg), dtype=np.int64)
+        else:
+            vals = coerce(arg[col_name])
+            if field in ("h", "m", "s") and _has_fractional(vals):
+                deferred_units.append(cast("UnitChoices", field))
+                arr = np.zeros(len(arg), dtype=np.int64)
+            else:
+                arr = _to_int64(vals, default=default, col_name=col_name)
+        field_arrs.append(arr)
+
+    # Construct datetime64[us] directly from fields, avoiding the
+    # object-dtype round-trip through format="%Y%m%d" string parsing.
+    # Replace NaN-masked entries with valid placeholders to avoid overflow
+    # in the Cython function; it writes iNaT for invalid dates when
+    # validate=True.
+    if nan_mask.any():
+        for idx, (_, default) in enumerate(field_spec):
+            field_arrs[idx] = np.where(nan_mask, default, field_arrs[idx])
+
+    year_arr, month_arr, day_arr, hour_arr, minute_arr, second_arr = field_arrs
+
+    ordinals, first_invalid = libperiod.period_ordinals_from_fields(
+        year_arr,
+        month_arr,
+        day_arr,
+        hour_arr,
+        minute_arr,
+        second_arr,
+        cast("int", FreqGroup.FR_US.value),
+        validate=True,
     )
-    try:
-        values = to_datetime(values, format="%Y%m%d", errors=errors, utc=utc)
-    except (TypeError, ValueError) as err:
-        raise ValueError(f"cannot assemble the datetimes: {err}") from err
+    if first_invalid >= 0 and errors == "raise":
+        bad_val = (
+            f"{year_arr[first_invalid]}{month_arr[first_invalid]:02d}"
+            f"{day_arr[first_invalid]:02d}"
+        )
+        raise ValueError(
+            f'cannot assemble the datetimes: time data "{bad_val}" '
+            f'doesn\'t match format "%Y%m%d".'
+        )
+    # errors="coerce": invalid entries already have iNaT from Cython
+    if nan_mask.any():
+        ordinals[nan_mask] = iNaT
+
+    dt64_values = ordinals.view("M8[us]")
+
+    from pandas import Series
+
+    if utc:
+        dta = DatetimeArray._simple_new(
+            dt64_values, dtype=DatetimeTZDtype(tz="UTC", unit="us")
+        )
+        values = Series(dta, index=arg.index, copy=False)
+    else:
+        values = Series(dt64_values, index=arg.index, copy=False)
 
-    units: list[UnitChoices] = ["h", "m", "s", "ms", "us", "ns"]
+    # Add sub-second components, plus any deferred fractional
+    # hour/minute/second columns, as timedeltas
+    units: list[UnitChoices] = [*deferred_units, "ms", "us", "ns"]
     for u in units:
         value = unit_rev.get(u)
         if value is not None and value in arg:
diff --git a/pandas/tests/indexes/period/test_constructors.py b/pandas/tests/indexes/period/test_constructors.py
index a50ec9ca1706d..6671be64e145a 100644
--- a/pandas/tests/indexes/period/test_constructors.py
+++ b/pandas/tests/indexes/period/test_constructors.py
@@ -227,6 +227,41 @@ def test_constructor_invalid_quarters(self):
                 year=range(2000, 2004), quarter=list(range(4)), freq="Q-DEC"
             )
 
+    def test_constructor_field_arrays_quarter_no_freq(self):
+        # Quarter with no explicit freq infers Q-DEC
+        years = np.array([2020, 2020, 2020, 2020], dtype=np.int64)
+        quarters = np.array([1, 2, 3, 4], dtype=np.int64)
+        result = PeriodIndex.from_fields(year=years, quarter=quarters)
+        expected = PeriodIndex(
+            [Period(year=2020, quarter=q, freq="Q-DEC") for q in range(1, 5)]
+        )
+        tm.assert_index_equal(result, expected)
+
+    def test_constructor_field_arrays_hourly(self):
+        # Test non-quarter path with all 6 fields
+        result = PeriodIndex.from_fields(
+            year=[2020, 2020],
+            month=[1, 6],
+            day=[15, 20],
+            hour=[10, 22],
+            minute=[30, 45],
+            second=[5, 59],
+            freq="s",
+        )
+        expected = PeriodIndex(
+            [
+                Period("2020-01-15 10:30:05", freq="s"),
+                Period("2020-06-20 22:45:59", freq="s"),
+            ]
+        )
+        tm.assert_index_equal(result, expected)
+
+    def test_constructor_field_arrays_empty(self):
+        # Empty arrays should produce empty PeriodIndex
+        result = PeriodIndex.from_fields(year=[], month=[], freq="M")
+        expected = PeriodIndex([], dtype="period[M]")
+        tm.assert_index_equal(result, expected)
+
     def test_period_range_fractional_period(self):
         msg = "periods must be an integer, got 10.5"
         with pytest.raises(TypeError, match=msg):
diff --git a/pandas/tests/tools/test_to_datetime.py b/pandas/tests/tools/test_to_datetime.py
index fb79ffa81f8f6..7367696dd8504 100644
--- a/pandas/tests/tools/test_to_datetime.py
+++ b/pandas/tests/tools/test_to_datetime.py
@@ -2297,8 +2297,8 @@ def test_dataframe_float(self, cache):
         # float
         df = DataFrame({"year": [2000, 2001], "month": [1.5, 1], "day": [1, 1]})
         msg = (
-            r"^cannot assemble the datetimes: unconverted data remains when parsing "
-            r'with format ".*": "1".'
+            r"^cannot assemble the datetimes: column 'month' contains "
+            r"fractional values$"
         )
         with pytest.raises(ValueError, match=msg):
             to_datetime(df, cache=cache)
@@ -2312,6 +2312,88 @@ def test_dataframe_utc_true(self):
         ).dt.tz_localize("UTC")
         tm.assert_series_equal(result, expected)
 
+    @pytest.mark.parametrize(
+        "data,expected_ts",
+        [
+            # Feb 29 in leap year (valid)
+            (
+                {"year": [2000], "month": [2], "day": [29]},
+                [Timestamp("2000-02-29")],
+            ),
+            # Feb 28 in non-leap year (valid)
+            (
+                {"year": [2001], "month": [2], "day": [28]},
+                [Timestamp("2001-02-28")],
+            ),
+        ],
+    )
+    def test_dataframe_leap_year_valid(self, data, expected_ts):
+        result = to_datetime(DataFrame(data))
+        expected = Series(expected_ts)
+        tm.assert_series_equal(result, expected)
+
+    @pytest.mark.parametrize(
+        "data",
+        [
+            {"year": [2001], "month": [2], "day": [29]},  # Feb 29 non-leap
+            {"year": [2020], "month": [4], "day": [31]},  # Apr 31
+            {"year": [2020], "month": [1], "day": [0]},  # day 0
+        ],
+    )
+    def test_dataframe_invalid_day_raises(self, data):
+        msg = r"cannot assemble the datetimes: time data"
+        with pytest.raises(ValueError, match=msg):
+            to_datetime(DataFrame(data))
+
+    @pytest.mark.parametrize(
+        "data",
+        [
+            {"year": [2001], "month": [2], "day": [29]},  # Feb 29 non-leap
+            {"year": [2020], "month": [4], "day": [31]},  # Apr 31
+        ],
+    )
+    def test_dataframe_invalid_day_coerce(self, data):
+        result = to_datetime(DataFrame(data), errors="coerce")
+        expected = Series([NaT], dtype="datetime64[us]")
+        tm.assert_series_equal(result, expected)
+
+    def test_dataframe_fractional_float_coerce(self):
+        # Fractional float with errors="coerce" should produce NaT
+        df = DataFrame({"year": [2000, 2001], "month": [1.5, 1], "day": [1, 1]})
+        result = to_datetime(df, errors="coerce")
+        expected = Series([NaT, Timestamp("2001-01-01")])
+        tm.assert_series_equal(result, expected)
+
+    def test_dataframe_fractional_time_fields(self):
+        # Fractional hour/minute/second values are offsets, not errors
+        df = DataFrame({"year": [2015], "month": [2], "day": [4], "second": [1.5]})
+        result = to_datetime(df)
+        assert result.iloc[0] == Timestamp("2015-02-04 00:00:01.500")
+
+    def test_dataframe_empty(self):
+        # Empty DataFrame should produce empty Series
+        df = DataFrame({"year": [], "month": [], "day": []})
+        result = to_datetime(df)
+        expected = Series([], dtype="datetime64[us]")
+        tm.assert_series_equal(result, expected)
+
+    def test_dataframe_utc_with_time_fields(self):
+        df = DataFrame(
+            {
+                "year": [2020],
+                "month": [6],
+                "day": [15],
+                "hour": [12],
+                "minute": [30],
+                "second": [45],
+            }
+        )
+        result = to_datetime(df, utc=True)
+        expected = Series(
+            [Timestamp("2020-06-15 12:30:45")], dtype="datetime64[us, UTC]"
+        )
+        tm.assert_series_equal(result, expected)
+
 
 class TestToDatetimeMisc:
     def test_to_datetime_barely_out_of_bounds(self):