From df82105f2c042e4b4a471d665d72e266da24a198 Mon Sep 17 00:00:00 2001 From: Antoine Pitrou Date: Tue, 25 Nov 2025 15:43:13 +0100 Subject: [PATCH 1/8] GH-48251: [C++][CI] Add CSV fuzzing seed corpus generator --- cpp/src/arrow/csv/CMakeLists.txt | 6 + cpp/src/arrow/csv/fuzz.cc | 3 +- cpp/src/arrow/csv/generate_fuzz_corpus.cc | 204 ++++++++++++++++++++++ cpp/src/arrow/testing/random.cc | 20 +-- 4 files changed, 222 insertions(+), 11 deletions(-) create mode 100644 cpp/src/arrow/csv/generate_fuzz_corpus.cc diff --git a/cpp/src/arrow/csv/CMakeLists.txt b/cpp/src/arrow/csv/CMakeLists.txt index 55047ca20462..524a780b4716 100644 --- a/cpp/src/arrow/csv/CMakeLists.txt +++ b/cpp/src/arrow/csv/CMakeLists.txt @@ -30,6 +30,12 @@ add_arrow_benchmark(converter_benchmark PREFIX "arrow-csv") add_arrow_benchmark(parser_benchmark PREFIX "arrow-csv") add_arrow_benchmark(writer_benchmark PREFIX "arrow-csv") +if(ARROW_BUILD_FUZZING_UTILITIES) + add_executable(arrow-csv-generate-fuzz-corpus generate_fuzz_corpus.cc) + target_link_libraries(arrow-csv-generate-fuzz-corpus ${ARROW_UTIL_LIB} + ${ARROW_TEST_LINK_LIBS}) +endif() + add_arrow_fuzz_target(fuzz PREFIX "arrow-csv") arrow_install_all_headers("arrow/csv") diff --git a/cpp/src/arrow/csv/fuzz.cc b/cpp/src/arrow/csv/fuzz.cc index e745c2c0bd2f..fbaa676d97e3 100644 --- a/cpp/src/arrow/csv/fuzz.cc +++ b/cpp/src/arrow/csv/fuzz.cc @@ -42,10 +42,11 @@ Status FuzzCsvReader(const uint8_t* data, int64_t size) { auto read_options = ReadOptions::Defaults(); // Make chunking more likely - read_options.block_size = 4096; + read_options.block_size = 1000; auto parse_options = ParseOptions::Defaults(); auto convert_options = ConvertOptions::Defaults(); convert_options.auto_dict_encode = true; + convert_options.auto_dict_max_cardinality = 50; auto input_stream = std::make_shared<::arrow::io::BufferReader>(std::make_shared(data, size)); diff --git a/cpp/src/arrow/csv/generate_fuzz_corpus.cc b/cpp/src/arrow/csv/generate_fuzz_corpus.cc new file mode 
100644 index 000000000000..eeba49e87f81 --- /dev/null +++ b/cpp/src/arrow/csv/generate_fuzz_corpus.cc @@ -0,0 +1,204 @@ +// Licensed to the Apache Software Foundation (ASF) under one +// or more contributor license agreements. See the NOTICE file +// distributed with this work for additional information +// regarding copyright ownership. The ASF licenses this file +// to you under the Apache License, Version 2.0 (the +// "License"); you may not use this file except in compliance +// with the License. You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, +// software distributed under the License is distributed on an +// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +// KIND, either express or implied. See the License for the +// specific language governing permissions and limitations +// under the License. + +// A command line executable that generates a bunch of valid CSV files +// containing example record batches. Those are used as fuzzing seeds +// to make fuzzing more efficient. 
+ +#include +#include +#include +#include +#include +#include + +#include "arrow/array.h" +#include "arrow/array/util.h" +#include "arrow/compute/cast.h" +#include "arrow/csv/options.h" +#include "arrow/csv/writer.h" +#include "arrow/io/file.h" +#include "arrow/io/memory.h" +#include "arrow/ipc/writer.h" +#include "arrow/json/from_string.h" +#include "arrow/record_batch.h" +#include "arrow/result.h" +#include "arrow/testing/random.h" +#include "arrow/util/io_util.h" + +namespace arrow::csv { + +using ::arrow::internal::CreateDir; +using ::arrow::internal::PlatformFilename; +using ::arrow::json::ArrayFromJSONString; + +Result> WriteRecordBatch( + const std::shared_ptr& batch, const WriteOptions& options) { + ARROW_ASSIGN_OR_RAISE(auto sink, io::BufferOutputStream::Create(1024)); + ARROW_ASSIGN_OR_RAISE(auto writer, MakeCSVWriter(sink.get(), batch->schema(), options)); + RETURN_NOT_OK(writer->WriteRecordBatch(*batch)); + RETURN_NOT_OK(writer->Close()); + return sink->Finish(); +} + +Result> MakeBatch( + std::function>(int64_t length, double null_probability)> + array_factory, + int64_t length) { + ArrayVector columns; + FieldVector fields; + + struct ColumnSpec { + std::string name; + double null_probability; + }; + for (auto spec : {ColumnSpec{"with_nulls", 0.2}, ColumnSpec{"without_nulls", 0.0}}) { + ARROW_ASSIGN_OR_RAISE(std::shared_ptr column, + array_factory(length, spec.null_probability)); + columns.push_back(column); + fields.push_back(field(spec.name, column->type())); + } + return RecordBatch::Make(schema(std::move(fields)), length, std::move(columns)); +} + +Result Batches() { + ::arrow::random::RandomArrayGenerator gen(/*seed=*/42); + RecordBatchVector batches; + + auto append_batch = [&](auto array_factory, int64_t length) -> Status { + ARROW_ASSIGN_OR_RAISE(auto batch, MakeBatch(array_factory, length)); + batches.push_back(batch); + return Status::OK(); + }; + + // Ideally, we should exercise all possible inference kinds (see inference_internal.h) + auto 
make_nulls = [&](int64_t length, double null_probability) { + return MakeArrayOfNull(null(), length); + }; + auto make_ints = [&](int64_t length, double null_probability) { + return gen.Int64(length, /*min=*/-1'000'000, /*max=*/1'000'000, null_probability); + }; + auto make_floats = [&](int64_t length, double null_probability) { + return gen.Float64(length, /*min=*/-100.0, /*max=*/100.0, null_probability); + }; + auto make_booleans = [&](int64_t length, double null_probability) { + return gen.Boolean(length, /*true_probability=*/0.8, null_probability); + }; + auto make_dates = [&](int64_t length, double null_probability) { + return gen.Date64(length, /*min=*/1, /*max=*/365 * 60, null_probability); + }; + auto make_times = [&](int64_t length, double null_probability) { + return gen.Int32(length, /*min=*/0, /*max=*/86399, null_probability) + ->View(time32(TimeUnit::SECOND)); + }; + + std::string timezone; + auto make_timestamps = [&](int64_t length, double null_probability) { + return gen.Int64(length, /*min=*/1, /*max=*/1764079190, null_probability) + ->View(timestamp(TimeUnit::SECOND, timezone)); + }; + auto make_timestamps_ns = [&](int64_t length, double null_probability) { + return gen + .Int64(length, /*min=*/1, /*max=*/1764079190LL * 1'000'000'000, null_probability) + ->View(timestamp(TimeUnit::NANO, timezone)); + }; + + auto make_strings = [&](int64_t length, double null_probability) { + return gen.String(length, /*min_length=*/3, /*max_length=*/15, null_probability); + }; + auto make_string_with_repeats = [&](int64_t length, double null_probability) { + // `unique` should be less than `auto_dict_max_cardinality` in fuzz target + return gen.StringWithRepeats(length, /*unique=*/10, /*min_length=*/3, + /*max_length=*/15, null_probability); + }; + + RETURN_NOT_OK(append_batch(make_nulls, /*length=*/2000)); + RETURN_NOT_OK(append_batch(make_ints, /*length=*/500)); + RETURN_NOT_OK(append_batch(make_floats, /*length=*/150)); + 
RETURN_NOT_OK(append_batch(make_booleans, /*length=*/500)); + + RETURN_NOT_OK(append_batch(make_dates, /*length=*/200)); + RETURN_NOT_OK(append_batch(make_times, /*length=*/400)); + timezone = ""; + RETURN_NOT_OK(append_batch(make_timestamps, /*length=*/200)); + RETURN_NOT_OK(append_batch(make_timestamps_ns, /*length=*/100)); + // Will generate timestamps with a "Z" suffix + timezone = "UTC"; + RETURN_NOT_OK(append_batch(make_timestamps, /*length=*/200)); + RETURN_NOT_OK(append_batch(make_timestamps_ns, /*length=*/100)); + // Will generate timestamps with a "+0100" or "+0200" suffix + timezone = "Europe/Paris"; + RETURN_NOT_OK(append_batch(make_timestamps, /*length=*/200)); + RETURN_NOT_OK(append_batch(make_timestamps_ns, /*length=*/100)); + + RETURN_NOT_OK(append_batch(make_strings, /*length=*/300)); + RETURN_NOT_OK(append_batch(make_string_with_repeats, /*length=*/300)); + // XXX Cannot add non-UTF8 binary as the CSV writer doesn't support writing it + + return batches; +} + +Status DoMain(const std::string& out_dir) { + ARROW_ASSIGN_OR_RAISE(auto dir_fn, PlatformFilename::FromString(out_dir)); + RETURN_NOT_OK(CreateDir(dir_fn)); + + int sample_num = 1; + auto sample_name = [&]() -> std::string { + return "csv-file-" + std::to_string(sample_num++); + }; + + ARROW_ASSIGN_OR_RAISE(auto batches, Batches()); + + auto options = WriteOptions::Defaults(); + RETURN_NOT_OK(options.Validate()); + + for (const auto& batch : batches) { + RETURN_NOT_OK(batch->ValidateFull()); + ARROW_ASSIGN_OR_RAISE(auto buffer, WriteRecordBatch(batch, options)); + + ARROW_ASSIGN_OR_RAISE(auto sample_fn, dir_fn.Join(sample_name())); + std::cerr << sample_fn.ToString() << std::endl; + ARROW_ASSIGN_OR_RAISE(auto file, io::FileOutputStream::Open(sample_fn.ToString())); + RETURN_NOT_OK(file->Write(buffer)); + RETURN_NOT_OK(file->Close()); + } + return Status::OK(); +} + +ARROW_NORETURN void Usage() { + std::cerr << "Usage: arrow-csv-generate-fuzz-corpus " + << "" << std::endl; + std::exit(2); +} 
+ +int Main(int argc, char** argv) { + if (argc != 2) { + Usage(); + } + auto out_dir = std::string(argv[1]); + + Status st = DoMain(out_dir); + if (!st.ok()) { + std::cerr << st.ToString() << std::endl; + return 1; + } + return 0; +} + +} // namespace arrow::csv + +int main(int argc, char** argv) { return arrow::csv::Main(argc, argv); } diff --git a/cpp/src/arrow/testing/random.cc b/cpp/src/arrow/testing/random.cc index 5f95638b7d63..c50387e49094 100644 --- a/cpp/src/arrow/testing/random.cc +++ b/cpp/src/arrow/testing/random.cc @@ -65,7 +65,7 @@ template struct GeneratorFactory { GeneratorFactory(ValueType min, ValueType max) : min_(min), max_(max) {} - auto operator()(pcg32_fast* rng) const { + auto operator()(pcg32* rng) const { return [dist = DistributionType(min_, max_), rng]() mutable { return static_cast(dist(*rng)); }; @@ -80,7 +80,7 @@ template struct GeneratorFactory { GeneratorFactory(Float16 min, Float16 max) : min_(min.ToFloat()), max_(max.ToFloat()) {} - auto operator()(pcg32_fast* rng) const { + auto operator()(pcg32* rng) const { return [dist = DistributionType(min_, max_), rng]() mutable { return Float16(dist(*rng)).bits(); }; @@ -121,7 +121,7 @@ struct GenerateOptions { GenerateTypedDataNoNan(data, n); return; } - pcg32_fast rng(seed_++); + pcg32 rng(seed_++); auto gen = generator_factory_(&rng); ::arrow::random::bernoulli_distribution nan_dist(nan_probability_); const PhysicalType nan_value = get_nan(); @@ -130,7 +130,7 @@ struct GenerateOptions { } void GenerateTypedDataNoNan(PhysicalType* data, size_t n) { - pcg32_fast rng(seed_++); + pcg32 rng(seed_++); auto gen = generator_factory_(&rng); std::generate(data, data + n, [&] { return gen(); }); @@ -138,7 +138,7 @@ struct GenerateOptions { void GenerateBitmap(uint8_t* buffer, size_t n, int64_t* null_count) { int64_t count = 0; - pcg32_fast rng(seed_++); + pcg32 rng(seed_++); ::arrow::random::bernoulli_distribution dist(1.0 - probability_); for (size_t i = 0; i < n; i++) { @@ -749,7 +749,7 @@ void 
ShuffleListViewDataInPlace(SeedType seed, ArrayData* data) { auto* offsets = data->GetMutableValues(1); auto* sizes = data->GetMutableValues(2); - pcg32_fast rng(seed); + pcg32 rng(seed); using UniformDist = std::uniform_int_distribution; UniformDist dist; for (int64_t i = data->length - 1; i > 0; --i) { @@ -888,7 +888,7 @@ Result> RandomListView(RAG& self, const Array& values, auto sizes = buffers[1]->mutable_data_as(); // Derive sizes from offsets taking coverage into account - pcg32_fast rng(self.seed()); + pcg32 rng(self.seed()); using NormalDist = std::normal_distribution; NormalDist size_dist; for (int64_t i = 0; i < length; ++i) { @@ -977,7 +977,7 @@ std::shared_ptr RandomArrayGenerator::Map(const std::shared_ptr& k std::shared_ptr RandomArrayGenerator::RunEndEncoded( std::shared_ptr value_type, int64_t logical_size, double null_probability) { Int32Builder run_ends_builder; - pcg32_fast rng(seed()); + pcg32 rng(seed()); DCHECK_LE(logical_size, std::numeric_limits::max()); @@ -1447,7 +1447,7 @@ std::shared_ptr GenerateBatch(const FieldVector& fields, void rand_day_millis(int64_t N, std::vector* out) { const int random_seed = 0; - arrow::random::pcg32_fast gen(random_seed); + arrow::random::pcg32 gen(random_seed); std::uniform_int_distribution d(std::numeric_limits::min(), std::numeric_limits::max()); out->resize(N, {}); @@ -1462,7 +1462,7 @@ void rand_day_millis(int64_t N, std::vector* out) { const int random_seed = 0; - arrow::random::pcg32_fast gen(random_seed); + arrow::random::pcg32 gen(random_seed); std::uniform_int_distribution d(std::numeric_limits::min(), std::numeric_limits::max()); out->resize(N, {}); From 29dcad92f6395015854c7b30be205005d4bd06d1 Mon Sep 17 00:00:00 2001 From: Antoine Pitrou Date: Tue, 25 Nov 2025 16:36:48 +0100 Subject: [PATCH 2/8] Try to fix test on Windows --- cpp/build-support/fuzzing/generate_corpuses.sh | 2 +- cpp/src/arrow/ipc/read_write_test.cc | 18 +++++++++--------- 2 files changed, 10 insertions(+), 10 deletions(-) diff 
--git a/cpp/build-support/fuzzing/generate_corpuses.sh b/cpp/build-support/fuzzing/generate_corpuses.sh index 233c9be0ebb1..6ebc86ffad82 100755 --- a/cpp/build-support/fuzzing/generate_corpuses.sh +++ b/cpp/build-support/fuzzing/generate_corpuses.sh @@ -71,7 +71,7 @@ rm -rf ${PANDAS_DIR} git clone --depth=1 https://github.com/pandas-dev/pandas ${PANDAS_DIR} rm -rf ${CORPUS_DIR} -mkdir -p ${CORPUS_DIR} +${OUT}/arrow-csv-generate-fuzz-corpus ${CORPUS_DIR} # Add examples from arrow-testing repo cp ${ARROW_ROOT}/testing/data/csv/*.csv ${CORPUS_DIR} # Add examples from Pandas test suite diff --git a/cpp/src/arrow/ipc/read_write_test.cc b/cpp/src/arrow/ipc/read_write_test.cc index 84ec923ce803..315d8bd07d9b 100644 --- a/cpp/src/arrow/ipc/read_write_test.cc +++ b/cpp/src/arrow/ipc/read_write_test.cc @@ -952,23 +952,23 @@ TEST_F(TestWriteRecordBatch, SliceTruncatesBuffers) { } TEST_F(TestWriteRecordBatch, RoundtripPreservesBufferSizes) { - // ARROW-7975 + // ARROW-7975: deserialized buffers should have logically exact size (no padding) random::RandomArrayGenerator rg(/*seed=*/0); + constexpr int64_t kLength = 30; - int64_t length = 15; - auto arr = rg.String(length, 0, 10, 0.1); - auto batch = RecordBatch::Make(::arrow::schema({field("f0", utf8())}), length, {arr}); + auto arr = + rg.String(kLength, /*min_length=*/0, /*max_length=*/10, /*null_probability=*/0.3); + ASSERT_NE(arr->null_count(), 0); // required for validity bitmap size assertion below + + auto batch = RecordBatch::Make(::arrow::schema({field("f0", utf8())}), kLength, {arr}); - ASSERT_OK_AND_ASSIGN( - mmap_, io::MemoryMapFixture::InitMemoryMap( - /*buffer_size=*/1 << 20, TempFile("test-roundtrip-buffer-sizes"))); DictionaryMemo dictionary_memo; ASSERT_OK_AND_ASSIGN( auto result, DoStandardRoundTrip(*batch, IpcWriteOptions::Defaults(), &dictionary_memo)); - // Make sure that the validity bitmap is size 2 as expected - ASSERT_EQ(2, arr->data()->buffers[0]->size()); + // Make sure that the validity bitmap has 
expected size + ASSERT_EQ(bit_util::BytesForBits(kLength), arr->data()->buffers[0]->size()); for (size_t i = 0; i < arr->data()->buffers.size(); ++i) { ASSERT_EQ(arr->data()->buffers[i]->size(), From ac4401d379a9cf2a1352c24b6bd0867481069ad5 Mon Sep 17 00:00:00 2001 From: Antoine Pitrou Date: Tue, 25 Nov 2025 17:43:30 +0100 Subject: [PATCH 3/8] Add debug for Windows failures --- .github/workflows/cpp_windows.yml | 8 ++++---- cpp/src/arrow/util/rle_encoding_test.cc | 19 +++++++++++++++++-- 2 files changed, 21 insertions(+), 6 deletions(-) diff --git a/.github/workflows/cpp_windows.yml b/.github/workflows/cpp_windows.yml index 0940beef6966..3b596c5656b1 100644 --- a/.github/workflows/cpp_windows.yml +++ b/.github/workflows/cpp_windows.yml @@ -43,16 +43,16 @@ jobs: ARROW_BUILD_SHARED: ON ARROW_BUILD_STATIC: OFF ARROW_BUILD_TESTS: ON - ARROW_DATASET: ON + ARROW_DATASET: OFF ARROW_FLIGHT: OFF ARROW_HDFS: ON ARROW_HOME: /usr ARROW_JEMALLOC: OFF ARROW_MIMALLOC: ON - ARROW_ORC: ON - ARROW_PARQUET: ON + ARROW_ORC: OFF + ARROW_PARQUET: OFF ARROW_SIMD_LEVEL: ${{ inputs.simd-level }} - ARROW_SUBSTRAIT: ON + ARROW_SUBSTRAIT: OFF ARROW_USE_GLOG: OFF ARROW_VERBOSE_THIRDPARTY_BUILD: OFF ARROW_WITH_BROTLI: OFF diff --git a/cpp/src/arrow/util/rle_encoding_test.cc b/cpp/src/arrow/util/rle_encoding_test.cc index c709095a6c7e..7136b0ad8dee 100644 --- a/cpp/src/arrow/util/rle_encoding_test.cc +++ b/cpp/src/arrow/util/rle_encoding_test.cc @@ -993,11 +993,24 @@ void CheckRoundTrip(const Array& data, int bit_width, bool spaced, int32_t parts const int data_size = static_cast(data.length()); const int data_values_count = static_cast(data.length() - spaced * data.null_count()); + // Note: because of the way RleEncoder::CheckBufferFull() + // is called, we have to reserve an extra "RleEncoder::MinBufferSize" + // bytes. These extra bytes won't be used but not reserving them + // would cause the encoder to fail. 
const int buffer_size = - static_cast(RleBitPackedEncoder::MaxBufferSize(bit_width, data_size)); + static_cast( + ::arrow::util::RleBitPackedEncoder::MaxBufferSize(bit_width, data_values_count) + + ::arrow::util::RleBitPackedEncoder::MinBufferSize(bit_width)); + ASSERT_GE(parts, 1); ASSERT_LE(parts, data_size); + ARROW_SCOPED_TRACE("bit_width = ", bit_width, ", spaced = ", spaced, ", data_size = ", data_size, + ", buffer_size = ", buffer_size); + ARROW_LOG(INFO) << "bit_width = " < 1, the worst case is the repetition of "literal run of length 8 // and then a repeated run of length 8". diff --git a/cpp/src/arrow/util/rle_encoding_test.cc b/cpp/src/arrow/util/rle_encoding_test.cc index 7136b0ad8dee..b2d4f7df6f1b 100644 --- a/cpp/src/arrow/util/rle_encoding_test.cc +++ b/cpp/src/arrow/util/rle_encoding_test.cc @@ -993,24 +993,15 @@ void CheckRoundTrip(const Array& data, int bit_width, bool spaced, int32_t parts const int data_size = static_cast(data.length()); const int data_values_count = static_cast(data.length() - spaced * data.null_count()); - // Note: because of the way RleEncoder::CheckBufferFull() - // is called, we have to reserve an extra "RleEncoder::MinBufferSize" - // bytes. These extra bytes won't be used but not reserving them - // would cause the encoder to fail. - const int buffer_size = - static_cast( - ::arrow::util::RleBitPackedEncoder::MaxBufferSize(bit_width, data_values_count) + - ::arrow::util::RleBitPackedEncoder::MinBufferSize(bit_width)); + const int buffer_size = static_cast( + ::arrow::util::RleBitPackedEncoder::MaxBufferSize(bit_width, data_values_count) + + ::arrow::util::RleBitPackedEncoder::MinBufferSize(bit_width)); ASSERT_GE(parts, 1); ASSERT_LE(parts, data_size); - ARROW_SCOPED_TRACE("bit_width = ", bit_width, ", spaced = ", spaced, ", data_size = ", data_size, - ", buffer_size = ", buffer_size); - ARROW_LOG(INFO) << "bit_width = " <