Skip to content
Draft
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
1 change: 1 addition & 0 deletions doc/source/whatsnew/v3.1.0.rst
Original file line number Diff line number Diff line change
Expand Up @@ -296,6 +296,7 @@ I/O
- Fixed :func:`read_json` with ``lines=True`` and ``chunksize`` to respect ``nrows``
when the requested row count is not a multiple of the chunk size (:issue:`64025`)
- Bug in :meth:`DataFrame.__repr__` where horizontally truncated output could exceed the terminal width by up to 4 characters (:issue:`32461`)
- Bug in :meth:`DataFrame.to_csv` where ``chunksize`` could produce inconsistent datetime and timedelta formatting across chunks (:issue:`55481`)
- Bug in :meth:`DataFrame.to_stata` raising ``KeyError`` when column names require renaming and ``convert_dates`` is specified for a different column (:issue:`60536`)
- Fixed :func:`read_json` with ``lines=True`` and ``nrows=0`` to return an empty DataFrame (:issue:`64025`)
- Fixed bug in :meth:`HDFStore.select` where passing ``where`` as a list of conditions referencing caller-scope variables failed on Python 3.12+ due to :pep:`709` inlining list comprehension stack frames (:issue:`64881`)
Expand Down
163 changes: 155 additions & 8 deletions pandas/io/formats/csvs.py
Original file line number Diff line number Diff line change
Expand Up @@ -20,7 +20,10 @@

import numpy as np

from pandas._libs import writers as libwriters
from pandas._libs import (
tslib,
writers as libwriters,
)
from pandas.util._decorators import cache_readonly

from pandas.core.dtypes.generic import (
Expand All @@ -31,6 +34,10 @@
)
from pandas.core.dtypes.missing import notna

from pandas.core.arrays import (
DatetimeArray,
TimedeltaArray,
)
from pandas.core.indexes.api import Index

from pandas.io.common import get_handle
Expand All @@ -47,6 +54,8 @@
npt,
)

from pandas import DataFrame

from pandas.io.formats.format import DataFrameFormatter


Expand Down Expand Up @@ -307,30 +316,168 @@ def _generate_multiindex_header_rows(self) -> Iterator[list[Hashable]]:
def _save_body(self) -> None:
# Write the data rows in slices of ``self.chunksize`` rows, handing each
# slice to ``_save_chunk``.
nrows = len(self.data_index)
# Upper bound on the number of slices; the loop below stops early as soon
# as a slice would be empty.
chunks = (nrows // self.chunksize) + 1

# GH#55481: pre-compute per-column date formats from the full data
# so that datetime/timedelta columns are formatted consistently
# across chunks.
col_formats = self._compute_col_date_formats()

# GH#55481: pre-format the index using the full data so that
# DatetimeIndex/TimedeltaIndex formatting is consistent across chunks.
formatted_index = self._preformat_index()

for i in range(chunks):
start_i = i * self.chunksize
# Clamp the slice end so the final chunk does not run past the data.
end_i = min(start_i + self.chunksize, nrows)
if start_i >= end_i:
break
# NOTE(review): the two-argument call below looks like the pre-change
# line of this diff hunk, superseded by the four-argument call that
# follows it -- confirm it is not present in the resulting file.
self._save_chunk(start_i, end_i)
self._save_chunk(start_i, end_i, col_formats, formatted_index)

def _preformat_index(self) -> npt.NDArray[np.object_] | None:
"""Pre-format the index using the full data for consistency.

For DatetimeIndex/TimedeltaIndex, formatting depends on
_is_dates_only which must be determined from the full index,
not per-chunk slices.

Returns the fully formatted index array, or None if
no pre-formatting is needed.
"""
# Nothing to pre-format when no index levels are written or when the
# caller supplied an explicit ``date_format``.
if self.nlevels == 0 or self.date_format is not None:
return None

idx_values = self.data_index._values
# Only datetime-like index values are formatted up front; every other
# index type falls through to the final ``return None``.
if isinstance(idx_values, (DatetimeArray, TimedeltaArray)):
return self.data_index._get_values_for_csv(**self._number_format)

# NOTE(review): the ``def _save_chunk`` header below looks like the removed
# pre-change signature from this diff hunk rather than part of this
# method -- confirm.
def _save_chunk(self, start_i: int, end_i: int) -> None:
return None

def _compute_col_date_formats(self) -> dict[int, bool] | None:
"""Pre-compute _is_dates_only for datetime/timedelta columns using
the full column data, so that per-chunk formatting is consistent.

Returns a dict mapping column index to the _is_dates_only result
for the full column, or None if no pre-computation is needed.
"""
if self.date_format is not None:
# User specified a format, no auto-detection needed
return None

result: dict[int, bool] = {}
for col_idx in range(self.obj.shape[1]):
arr = self.obj.iloc[:, col_idx].array
if isinstance(arr, (DatetimeArray, TimedeltaArray)):
result[col_idx] = arr._is_dates_only

return result if result else None

def _save_chunk(
self,
start_i: int,
end_i: int,
col_formats: dict[int, bool] | None,
formatted_index: npt.NDArray[np.object_] | None,
) -> None:
# Format rows ``start_i:end_i`` of ``self.obj`` and write them through
# ``libwriters.write_csv_rows``.
#
# col_formats: per-column ``_is_dates_only`` flags, or None to format the
#     chunk with ``df._get_values_for_csv``.
# formatted_index: index values already formatted for the full data, or
#     None to format the index slice here.
# create the data for a chunk
slicer = slice(start_i, end_i)
df = self.obj.iloc[slicer]

# NOTE(review): the next two statements duplicate the ``col_formats is None``
# branch below and look like the pre-change lines of this diff hunk --
# confirm.
res = df._get_values_for_csv(**self._number_format)
data = list(res._iter_column_arrays())
if col_formats is None:
res = df._get_values_for_csv(**self._number_format)
data = list(res._iter_column_arrays())
else:
data = list(self._format_chunk_columns(df, col_formats))

# NOTE(review): the parenthesised expression below contains both the old
# three-line conditional and its replacement, again apparently a diff
# artifact -- confirm. In the replacement, a pre-formatted index is sliced
# when available; otherwise the index slice is formatted here, or an
# uninitialised placeholder array is used when ``self.nlevels`` is 0.
ix = (
self.data_index[slicer]._get_values_for_csv(**self._number_format)
if self.nlevels != 0
else np.empty(end_i - start_i)
formatted_index[start_i:end_i]
if formatted_index is not None
else (
self.data_index[slicer]._get_values_for_csv(**self._number_format)
if self.nlevels != 0
else np.empty(end_i - start_i)
)
)

libwriters.write_csv_rows(
data,
ix,
self.nlevels,
self.cols,
self.writer,
)

def _format_chunk_columns(
self, df: DataFrame, col_formats: dict[int, bool]
) -> list:
"""Format chunk columns using pre-computed date format info.

For datetime/timedelta columns, uses the _is_dates_only result
from the full column to ensure consistent formatting.
"""
from pandas.core.indexes.base import get_values_for_csv

data: list = []
for col_idx in range(df.shape[1]):
col_values = df.iloc[:, col_idx]._values

if col_idx in col_formats:
is_dates_only = col_formats[col_idx]
arr = df.iloc[:, col_idx].array
formatted = self._format_dt_column(arr, is_dates_only)
else:
formatted = get_values_for_csv(
col_values,
na_rep=self.na_rep,
float_format=self.float_format, # type: ignore[arg-type]
date_format=self.date_format,
decimal=self.decimal,
quoting=self.quoting,
)
data.append(formatted)
return data

def _format_dt_column(
self,
arr: DatetimeArray | TimedeltaArray,
is_dates_only: bool,
) -> npt.NDArray[np.object_]:
"""Format a datetime or timedelta column using a pre-computed
_is_dates_only value from the full column."""
if isinstance(arr, DatetimeArray):
if is_dates_only:
date_format = "%Y-%m-%d"
else:
date_format = None

# Call format_array_from_datetime directly to bypass the
# per-chunk _is_dates_only check in _format_native_types.
result = tslib.format_array_from_datetime(
arr.asi8,
tz=arr.tz,
format=date_format,
na_rep=self.na_rep,
reso=arr._creso,
)
else:
# TimedeltaArray
from pandas.io.formats.format import get_format_timedelta64

if is_dates_only:
# Use the default behavior (even_days format)
formatter = get_format_timedelta64(arr, na_rep=self.na_rep)
else:
# Force long format to prevent per-chunk auto-detection
from pandas import Timedelta

def formatter(
x: object, _na_rep: str | float = self.na_rep
) -> str | float:
if x is None or x != x:
return _na_rep
if not isinstance(x, Timedelta):
x = Timedelta(x)
return x._repr_base(format="long") # type: ignore[attr-defined]

result = np.frompyfunc(formatter, 1, 1)(arr._ndarray)

return np.asarray(result, dtype=object)
92 changes: 92 additions & 0 deletions pandas/tests/io/formats/test_to_csv.py
Original file line number Diff line number Diff line change
Expand Up @@ -860,6 +860,98 @@ def test_callable_float_format_compatibility():
assert result == expected


def test_to_csv_chunksize_datetime_column():
    # GH#55481: only the last row carries a time component, and with
    # chunksize=1 every row is its own chunk; all rows must still be
    # written with the full timestamp format.
    frame = DataFrame({"A": pd.date_range("2016-01-01", periods=3, freq="D")})
    frame.iloc[-1, -1] += pd.Timedelta(minutes=1)

    expected = tm.convert_rows_list_to_csv_str(
        [
            ",A",
            "0,2016-01-01 00:00:00",
            "1,2016-01-02 00:00:00",
            "2,2016-01-03 00:01:00",
        ]
    )

    assert frame.to_csv(chunksize=1) == expected


def test_to_csv_chunksize_datetime_column_dates_only():
    # GH#55481: a column holding only whole dates keeps the compact
    # date-only rendering when written one row per chunk.
    frame = DataFrame({"A": pd.date_range("2016-01-01", periods=3, freq="D")})

    expected = tm.convert_rows_list_to_csv_str(
        [",A", "0,2016-01-01", "1,2016-01-02", "2,2016-01-03"]
    )

    assert frame.to_csv(chunksize=1) == expected


def test_to_csv_chunksize_timedelta_column():
    # GH#55481: a sub-day component in the last row must switch every
    # chunk of the timedelta column to the long format.
    frame = DataFrame({"A": pd.timedelta_range("1D", periods=3, freq="D")})
    frame.iloc[-1, -1] += pd.Timedelta(minutes=1)

    expected = tm.convert_rows_list_to_csv_str(
        [
            ",A",
            "0,1 days 00:00:00",
            "1,2 days 00:00:00",
            "2,3 days 00:01:00",
        ]
    )

    assert frame.to_csv(chunksize=1) == expected


def test_to_csv_chunksize_datetime_index():
    # GH#55481: the same cross-chunk consistency is expected for a
    # DatetimeIndex written as the row labels.
    days = pd.date_range("2016-01-01", periods=3, freq="D")
    frame = DataFrame({"A": [1, 2, 3]}, index=days)

    # Swap the last label for one that is a minute past midnight: insert the
    # shifted label at position 2, then drop the original last label.
    shifted_last = days[-1] + pd.Timedelta(minutes=1)
    with_shifted = frame.index.insert(2, shifted_last)
    frame.index = with_shifted.delete(3)

    expected = tm.convert_rows_list_to_csv_str(
        [
            ",A",
            "2016-01-01 00:00:00,1",
            "2016-01-02 00:00:00,2",
            "2016-01-03 00:01:00,3",
        ]
    )

    assert frame.to_csv(chunksize=1) == expected


def test_to_csv_chunksize_multiple_datetime_columns():
    # GH#55481: the format is chosen per column -- "A" gains a time
    # component in its last row while "B" stays dates-only.
    frame = DataFrame(
        {
            "A": pd.date_range("2016-01-01", periods=3, freq="D"),
            "B": pd.date_range("2020-06-01", periods=3, freq="D"),
        }
    )
    frame.iloc[-1, 0] += pd.Timedelta(minutes=1)

    expected = tm.convert_rows_list_to_csv_str(
        [
            ",A,B",
            "0,2016-01-01 00:00:00,2020-06-01",
            "1,2016-01-02 00:00:00,2020-06-02",
            "2,2016-01-03 00:01:00,2020-06-03",
        ]
    )

    assert frame.to_csv(chunksize=1) == expected


def test_to_csv_chunksize_matches_no_chunksize():
    # GH#55481: writing row by row must produce the same text as writing
    # without an explicit chunksize.
    frame = DataFrame({"A": pd.date_range("2016-01-01", periods=3, freq="D")})
    frame.iloc[-1, -1] += pd.Timedelta(minutes=1)

    chunked_output = frame.to_csv(chunksize=1)
    default_output = frame.to_csv()

    assert chunked_output == default_output


def test_no_float_format():
df = DataFrame({"A": [1.23, 4.56]})
result = df.to_csv(float_format=None, lineterminator="\n")
Expand Down
Loading