diff --git a/doc/source/whatsnew/v3.1.0.rst b/doc/source/whatsnew/v3.1.0.rst
index b318369aa9df7..5513eded8c68d 100644
--- a/doc/source/whatsnew/v3.1.0.rst
+++ b/doc/source/whatsnew/v3.1.0.rst
@@ -296,6 +296,7 @@ I/O
 
 - Fixed :func:`read_json` with ``lines=True`` and ``chunksize`` to respect ``nrows`` when the requested row count is not a multiple of the chunk size (:issue:`64025`)
 - Bug in :meth:`DataFrame.__repr__` where horizontally truncated output could exceed the terminal width by up to 4 characters (:issue:`32461`)
+- Bug in :meth:`DataFrame.to_csv` where ``chunksize`` could produce inconsistent datetime and timedelta formatting across chunks (:issue:`55481`)
 - Bug in :meth:`DataFrame.to_stata` raising ``KeyError`` when column names require renaming and ``convert_dates`` is specified for a different column (:issue:`60536`)
 - Fixed :func:`read_json` with ``lines=True`` and ``nrows=0`` to return an empty DataFrame (:issue:`64025`)
 - Fixed bug in :meth:`HDFStore.select` where passing ``where`` as a list of conditions referencing caller-scope variables failed on Python 3.12+ due to :pep:`709` inlining list comprehension stack frames (:issue:`64881`)
diff --git a/pandas/io/formats/csvs.py b/pandas/io/formats/csvs.py
index 64f668b69db54..0967f97eaf9ed 100644
--- a/pandas/io/formats/csvs.py
+++ b/pandas/io/formats/csvs.py
@@ -20,7 +20,10 @@
 
 import numpy as np
 
-from pandas._libs import writers as libwriters
+from pandas._libs import (
+    tslib,
+    writers as libwriters,
+)
 from pandas.util._decorators import cache_readonly
 
 from pandas.core.dtypes.generic import (
@@ -31,6 +34,10 @@
 )
 from pandas.core.dtypes.missing import notna
 
+from pandas.core.arrays import (
+    DatetimeArray,
+    TimedeltaArray,
+)
 from pandas.core.indexes.api import Index
 
 from pandas.io.common import get_handle
@@ -47,6 +54,8 @@
         npt,
     )
 
+    from pandas import DataFrame
+
     from pandas.io.formats.format import DataFrameFormatter
 
 
@@ -307,26 +316,98 @@ def _generate_multiindex_header_rows(self) -> Iterator[list[Hashable]]:
     def _save_body(self) -> None:
         nrows = len(self.data_index)
         chunks = (nrows // self.chunksize) + 1
+
+        # GH#55481: pre-compute per-column date formats from the full data
+        # so that datetime/timedelta columns are formatted consistently
+        # across chunks.
+        col_formats = self._compute_col_date_formats()
+
+        # GH#55481: pre-format the index using the full data so that
+        # DatetimeIndex/TimedeltaIndex formatting is consistent across chunks.
+        formatted_index = self._preformat_index()
+
         for i in range(chunks):
             start_i = i * self.chunksize
             end_i = min(start_i + self.chunksize, nrows)
             if start_i >= end_i:
                 break
-            self._save_chunk(start_i, end_i)
+            self._save_chunk(start_i, end_i, col_formats, formatted_index)
+
+    def _preformat_index(self) -> npt.NDArray[np.object_] | None:
+        """Pre-format the index using the full data for consistency.
+
+        For DatetimeIndex/TimedeltaIndex, formatting depends on
+        _is_dates_only which must be determined from the full index,
+        not per-chunk slices.
+
+        Returns the fully formatted index array, or None if
+        no pre-formatting is needed.
+
+        Note that this formats the whole index up front rather than
+        one chunk at a time.
+        """
+        if self.nlevels == 0 or self.date_format is not None:
+            return None
+
+        idx_values = self.data_index._values
+        if isinstance(idx_values, (DatetimeArray, TimedeltaArray)):
+            return self.data_index._get_values_for_csv(**self._number_format)
 
-    def _save_chunk(self, start_i: int, end_i: int) -> None:
+        return None
+
+    def _compute_col_date_formats(self) -> dict[int, bool] | None:
+        """Pre-compute _is_dates_only for datetime/timedelta columns using
+        the full column data, so that per-chunk formatting is consistent.
+
+        Returns a dict mapping column index to the _is_dates_only result
+        for the full column, or None if no pre-computation is needed.
+        """
+        if self.date_format is not None:
+            # User specified a format, no auto-detection needed
+            return None
+
+        result: dict[int, bool] = {}
+        for col_idx in range(self.obj.shape[1]):
+            arr = self.obj.iloc[:, col_idx].array
+            if isinstance(arr, (DatetimeArray, TimedeltaArray)):
+                result[col_idx] = arr._is_dates_only
+
+        return result if result else None
+
+    def _save_chunk(
+        self,
+        start_i: int,
+        end_i: int,
+        col_formats: dict[int, bool] | None,
+        formatted_index: npt.NDArray[np.object_] | None,
+    ) -> None:
+        """Format rows ``start_i:end_i`` and pass them to the CSV row writer.
+
+        ``col_formats`` and ``formatted_index`` are the values computed once
+        by ``_save_body`` from the full data. If ``col_formats`` is None the
+        columns are formatted from the chunk alone; if ``formatted_index`` is
+        None the index is formatted from its own slice for this chunk.
+        """
         # create the data for a chunk
         slicer = slice(start_i, end_i)
         df = self.obj.iloc[slicer]
 
-        res = df._get_values_for_csv(**self._number_format)
-        data = list(res._iter_column_arrays())
+        if col_formats is None:
+            res = df._get_values_for_csv(**self._number_format)
+            data = list(res._iter_column_arrays())
+        else:
+            data = self._format_chunk_columns(df, col_formats)
 
         ix = (
-            self.data_index[slicer]._get_values_for_csv(**self._number_format)
-            if self.nlevels != 0
-            else np.empty(end_i - start_i)
+            formatted_index[start_i:end_i]
+            if formatted_index is not None
+            else (
+                self.data_index[slicer]._get_values_for_csv(**self._number_format)
+                if self.nlevels != 0
+                else np.empty(end_i - start_i)
+            )
         )
+
         libwriters.write_csv_rows(
             data,
             ix,
@@ -334,3 +415,81 @@ def _save_chunk(self, start_i: int, end_i: int) -> None:
             self.cols,
             self.writer,
         )
+
+    def _format_chunk_columns(
+        self, df: DataFrame, col_formats: dict[int, bool]
+    ) -> list:
+        """Format chunk columns using pre-computed date format info.
+
+        For datetime/timedelta columns, uses the _is_dates_only result
+        from the full column to ensure consistent formatting. Every other
+        column is formatted with the shared ``_number_format`` settings.
+
+        Returns a list with one formatted array per column of ``df``.
+        """
+        from pandas.core.indexes.base import get_values_for_csv
+
+        data: list = []
+        for col_idx in range(df.shape[1]):
+            # Slice the column once; both branches below use it.
+            col = df.iloc[:, col_idx]
+
+            if col_idx in col_formats:
+                formatted = self._format_dt_column(col.array, col_formats[col_idx])
+            else:
+                # Same keyword arguments as the ``_get_values_for_csv`` calls
+                # in ``_save_chunk``, so the two paths cannot drift apart.
+                formatted = get_values_for_csv(col._values, **self._number_format)
+            data.append(formatted)
+        return data
+
+    def _format_dt_column(
+        self,
+        arr: DatetimeArray | TimedeltaArray,
+        is_dates_only: bool,
+    ) -> npt.NDArray[np.object_]:
+        """Format a datetime or timedelta column using a pre-computed
+        _is_dates_only value from the full column.
+
+        ``arr`` is the chunk's slice of the column. Returns an object
+        array of the formatted values.
+        """
+        if isinstance(arr, DatetimeArray):
+            date_format = "%Y-%m-%d" if is_dates_only else None
+
+            # Call format_array_from_datetime directly to bypass the
+            # per-chunk _is_dates_only check in _format_native_types.
+            result = tslib.format_array_from_datetime(
+                arr.asi8,
+                tz=arr.tz,
+                format=date_format,
+                na_rep=self.na_rep,
+                reso=arr._creso,
+            )
+        else:
+            # TimedeltaArray
+            from pandas.io.formats.format import get_format_timedelta64
+
+            if is_dates_only:
+                # Use the default behavior (even_days format). The NaT
+                # representation is passed positionally so the call does not
+                # rely on the helper's keyword name (believed to be
+                # ``nat_rep``, not ``na_rep`` - TODO confirm).
+                formatter = get_format_timedelta64(arr, self.na_rep)
+            else:
+                # Force long format to prevent per-chunk auto-detection
+                from pandas import Timedelta
+
+                def formatter(
+                    x: object, _na_rep: str | float = self.na_rep
+                ) -> str | float:
+                    # ``x != x`` is true only for NaT/NaN-like values.
+                    if x is None or x != x:
+                        return _na_rep
+                    if not isinstance(x, Timedelta):
+                        x = Timedelta(x)
+                    return x._repr_base(format="long")  # type: ignore[attr-defined]
+
+            result = np.frompyfunc(formatter, 1, 1)(arr._ndarray)
+
+        return np.asarray(result, dtype=object)
diff --git a/pandas/tests/io/formats/test_to_csv.py b/pandas/tests/io/formats/test_to_csv.py
index badcf547e1d0d..98bea2cc6b1d4 100644
--- a/pandas/tests/io/formats/test_to_csv.py
+++ b/pandas/tests/io/formats/test_to_csv.py
@@ -860,6 +860,109 @@ def test_callable_float_format_compatibility():
     assert result == expected
 
 
+def test_to_csv_chunksize_datetime_column():
+    # GH#55481
+    dti = pd.date_range("2016-01-01", periods=3, freq="D")
+    df = DataFrame({"A": dti})
+    df.iloc[-1, -1] += pd.Timedelta(minutes=1)
+
+    result = df.to_csv(chunksize=1)
+    expected_rows = [
+        ",A",
+        "0,2016-01-01 00:00:00",
+        "1,2016-01-02 00:00:00",
+        "2,2016-01-03 00:01:00",
+    ]
+    expected = tm.convert_rows_list_to_csv_str(expected_rows)
+    assert result == expected
+
+
+def test_to_csv_chunksize_datetime_column_dates_only():
+    # GH#55481 - dates-only columns should still use compact format
+    dti = pd.date_range("2016-01-01", periods=3, freq="D")
+    df = DataFrame({"A": dti})
+
+    result = df.to_csv(chunksize=1)
+    expected_rows = [",A", "0,2016-01-01", "1,2016-01-02", "2,2016-01-03"]
+    expected = tm.convert_rows_list_to_csv_str(expected_rows)
+    assert result == expected
+
+
+def test_to_csv_chunksize_timedelta_column():
+    # GH#55481
+    tdi = pd.timedelta_range("1D", periods=3, freq="D")
+    df = DataFrame({"A": tdi})
+    df.iloc[-1, -1] += pd.Timedelta(minutes=1)
+
+    result = df.to_csv(chunksize=1)
+    expected_rows = [
+        ",A",
+        "0,1 days 00:00:00",
+        "1,2 days 00:00:00",
+        "2,3 days 00:01:00",
+    ]
+    expected = tm.convert_rows_list_to_csv_str(expected_rows)
+    assert result == expected
+
+
+def test_to_csv_chunksize_timedelta_column_even_days():
+    # GH#55481 - whole-day timedeltas should still use compact format
+    tdi = pd.timedelta_range("1D", periods=3, freq="D")
+    df = DataFrame({"A": tdi})
+
+    result = df.to_csv(chunksize=1)
+    expected_rows = [",A", "0,1 days", "1,2 days", "2,3 days"]
+    expected = tm.convert_rows_list_to_csv_str(expected_rows)
+    assert result == expected
+
+
+def test_to_csv_chunksize_datetime_index():
+    # GH#55481 - DatetimeIndex should also be consistent across chunks
+    dti = pd.date_range("2016-01-01", periods=3, freq="D")
+    df = DataFrame({"A": [1, 2, 3]}, index=dti)
+    df.index = df.index.insert(2, dti[-1] + pd.Timedelta(minutes=1)).delete(3)
+
+    result = df.to_csv(chunksize=1)
+    expected_rows = [
+        ",A",
+        "2016-01-01 00:00:00,1",
+        "2016-01-02 00:00:00,2",
+        "2016-01-03 00:01:00,3",
+    ]
+    expected = tm.convert_rows_list_to_csv_str(expected_rows)
+    assert result == expected
+
+
+def test_to_csv_chunksize_multiple_datetime_columns():
+    # GH#55481 - each column should be formatted based on its own data
+    dti_a = pd.date_range("2016-01-01", periods=3, freq="D")
+    dti_b = pd.date_range("2020-06-01", periods=3, freq="D")
+    df = DataFrame({"A": dti_a, "B": dti_b})
+    df.iloc[-1, 0] += pd.Timedelta(minutes=1)  # A is not dates-only
+    # B stays dates-only
+
+    result = df.to_csv(chunksize=1)
+    expected_rows = [
+        ",A,B",
+        "0,2016-01-01 00:00:00,2020-06-01",
+        "1,2016-01-02 00:00:00,2020-06-02",
+        "2,2016-01-03 00:01:00,2020-06-03",
+    ]
+    expected = tm.convert_rows_list_to_csv_str(expected_rows)
+    assert result == expected
+
+
+def test_to_csv_chunksize_matches_no_chunksize():
+    # GH#55481 - output should be the same regardless of chunksize
+    dti = pd.date_range("2016-01-01", periods=3, freq="D")
+    df = DataFrame({"A": dti})
+    df.iloc[-1, -1] += pd.Timedelta(minutes=1)
+
+    result_chunked = df.to_csv(chunksize=1)
+    result_default = df.to_csv()
+    assert result_chunked == result_default
+
+
 def test_no_float_format():
     df = DataFrame({"A": [1.23, 4.56]})
     result = df.to_csv(float_format=None, lineterminator="\n")