diff --git a/include/glaze/core/buffer_traits.hpp b/include/glaze/core/buffer_traits.hpp index 7a00e92802..5784694fb9 100644 --- a/include/glaze/core/buffer_traits.hpp +++ b/include/glaze/core/buffer_traits.hpp @@ -70,6 +70,13 @@ namespace glz template concept is_output_streaming = buffer_traits>::is_output_streaming; + // Concept to check if a buffer type is both bounded and supports streaming. + // These buffers cannot grow but can flush to handle data larger than their capacity. + template + concept is_bounded_output_streaming = + buffer_traits>::is_output_streaming && + buffer_traits>::has_bounded_capacity; + // Flush helper for streaming output buffers template GLZ_ALWAYS_INLINE void flush_buffer(B&& b, size_t written) noexcept diff --git a/include/glaze/core/ostream_buffer.hpp b/include/glaze/core/ostream_buffer.hpp index 75a461a43f..3844e9a1d0 100644 --- a/include/glaze/core/ostream_buffer.hpp +++ b/include/glaze/core/ostream_buffer.hpp @@ -3,6 +3,7 @@ #pragma once +#include #include #include #include @@ -191,4 +192,193 @@ namespace glz template using ostream_buffer = basic_ostream_buffer; + // A bounded streaming buffer that has fixed capacity but can flush incrementally. + // Unlike basic_ostream_buffer, this buffer will NOT grow beyond its initial capacity. + // Instead, it flushes to the underlying stream when capacity is approached. + // + // This is useful for: + // - Memory-constrained environments where buffer growth is not acceptable + // - Serializing data larger than available memory by streaming to disk/network + // + // Usage: + // std::ofstream file("output.json"); + // glz::bounded_ostream_buffer buffer(file); + // auto ec = glz::write_json(obj, buffer); + // + // Template parameters: + // Stream - Byte-oriented output stream (must satisfy byte_output_stream concept) + // Capacity - Fixed buffer size in bytes. Must be at least min_ostream_buffer_size. + + template + requires(Capacity >= min_ostream_buffer_size) + class bounded_ostream_buffer + { + static_assert(Capacity >= min_ostream_buffer_size, + "Buffer capacity must be at least 512 bytes to handle all JSON value types"); + + Stream* stream_; + std::array buffer_; + size_t flush_offset_ = 0; // Logical position that maps to buffer_[0] + size_t logical_size_ = Capacity; // Reported size + + public: + using value_type = char; + using reference = char&; + using const_reference = const char&; + using size_type = size_t; + using iterator = char*; + using const_iterator = const char*; + using stream_type = Stream; + + static constexpr size_t buffer_capacity = Capacity; + + explicit bounded_ostream_buffer(Stream& stream) : stream_(&stream) {} + + // Move-only to prevent accidental copies + bounded_ostream_buffer(const bounded_ostream_buffer&) = delete; + bounded_ostream_buffer& operator=(const bounded_ostream_buffer&) = delete; + bounded_ostream_buffer(bounded_ostream_buffer&&) noexcept = default; + bounded_ostream_buffer& operator=(bounded_ostream_buffer&&) noexcept = default; + + ~bounded_ostream_buffer() = default; + + // Element access - maps logical position to physical buffer + reference operator[](size_t ix) noexcept + { + assert(ix >= flush_offset_ && "Index before flush offset"); + assert(ix - flush_offset_ < Capacity && "Index exceeds buffer capacity"); + return buffer_[ix - flush_offset_]; + } + const_reference operator[](size_t ix) const noexcept + { + assert(ix >= flush_offset_ && "Index before flush offset"); + assert(ix - flush_offset_ < Capacity && "Index exceeds buffer capacity"); + return buffer_[ix - flush_offset_]; + } + + // Current logical size (capacity available from position 0) + size_t size() const noexcept { return logical_size_; } + + // Resize - for bounded buffers, we update logical size but cannot grow physical buffer + void resize(size_t new_size) noexcept { logical_size_ = new_size; } + + // Final flush - called by buffer_traits::finalize() + // Returns true on success, false if stream write failed + bool finalize(size_t total_written) + { + if (total_written > flush_offset_ && stream_) { + const size_t to_flush = total_written - flush_offset_; + stream_->write(buffer_.data(), static_cast(to_flush)); + if (stream_->fail()) [[unlikely]] { + return false; + } + flush_offset_ = total_written; + } + return true; + } + + // Flush all pending data and reset buffer position + // After flush, capacity increases by the amount flushed + // Returns true on success, false if stream write failed + bool flush(size_t written_so_far) + { + if (written_so_far > flush_offset_ && stream_) { + const size_t to_flush = written_so_far - flush_offset_; + stream_->write(buffer_.data(), static_cast(to_flush)); + if (stream_->fail()) [[unlikely]] { + return false; + } + flush_offset_ = written_so_far; + // Update logical size to reflect new capacity from current position + logical_size_ = flush_offset_ + Capacity; + } + return true; + } + + // Reset for reuse + void reset() noexcept + { + flush_offset_ = 0; + logical_size_ = Capacity; + } + + // Effective capacity from position 0 (includes already-flushed space) + size_t effective_capacity() const noexcept { return flush_offset_ + Capacity; } + + // Physical buffer capacity + static constexpr size_t physical_capacity() noexcept { return Capacity; } + + // Check if underlying stream is in good state + bool good() const noexcept { return stream_ && stream_->good(); } + + // Check if underlying stream has failed + bool fail() const noexcept { return !stream_ || stream_->fail(); } + + // Access underlying stream + Stream* stream() const noexcept { return stream_; } + + // Bytes flushed so far + size_t bytes_flushed() const noexcept { return flush_offset_; } + + // Iterator support (for physical buffer) + iterator begin() noexcept { return buffer_.data(); } + iterator end() noexcept { return buffer_.data() + Capacity; } + const_iterator begin() const noexcept { return buffer_.data(); } + const_iterator end() const noexcept { return buffer_.data() + Capacity; } + + // Data access + char* data() noexcept { return buffer_.data(); } + const char* data() const noexcept { return buffer_.data(); } + }; + + // buffer_traits specialization for bounded_ostream_buffer + template + struct buffer_traits> + { + static constexpr bool is_resizable = false; + static constexpr bool has_bounded_capacity = true; + static constexpr bool is_output_streaming = true; + + // Capacity grows as data is flushed + GLZ_ALWAYS_INLINE static size_t capacity(const bounded_ostream_buffer& b) noexcept + { + return b.effective_capacity(); + } + + GLZ_ALWAYS_INLINE static bool ensure_capacity(bounded_ostream_buffer& b, size_t needed) + { + if (needed > capacity(b)) { + // Cannot grow beyond capacity. Callers must flush at safe points + // (between array elements, object fields) to make room. + return false; + } + return true; + } + + // Basic finalize without error reporting (for backward compatibility) + GLZ_ALWAYS_INLINE static void finalize(bounded_ostream_buffer& b, size_t written) + { + b.finalize(written); + } + + // Context-aware finalize that reports stream errors through ctx.error + GLZ_ALWAYS_INLINE static void finalize(bounded_ostream_buffer& b, size_t written, is_context auto& ctx) + { + if (!b.finalize(written)) [[unlikely]] { + ctx.error = error_code::send_error; + } + } + + // Basic flush without error reporting (for backward compatibility) + GLZ_ALWAYS_INLINE static void flush(bounded_ostream_buffer& b, size_t written) { b.flush(written); } + + // Context-aware flush that reports stream errors through ctx.error + GLZ_ALWAYS_INLINE static void flush(bounded_ostream_buffer& b, size_t written, is_context auto& ctx) + { + if (!b.flush(written)) [[unlikely]] { + ctx.error = error_code::send_error; + } + } + }; + } // namespace glz diff --git a/include/glaze/json/read.hpp b/include/glaze/json/read.hpp index 8c84b1c1fa..75fc41d2a9 100644 --- a/include/glaze/json/read.hpp +++ b/include/glaze/json/read.hpp @@ -922,6 +922,366 @@ namespace glz } }; + // ============================================================================ + // Streaming String Parsing Support + // ============================================================================ + + // Minimum bytes remaining before triggering a buffer refill. + // Must be large enough for the longest escape sequence: surrogate pair \uXXXX\uXXXX (12 bytes) + // plus some margin for lookahead. 16 is a convenient power of 2. + inline constexpr size_t streaming_refill_threshold = 16; + + // State for tracking partial escape sequences across buffer refills + struct string_streaming_state + { + uint8_t escape_bytes_pending = 0; // 0-12 bytes buffered + char escape_buffer[12]{}; // Max: \uD800\uDC00 (12 bytes) + bool in_escape = false; + }; + + // Refill the streaming buffer, updating iterators + // Returns true if data is available after refill + template + GLZ_ALWAYS_INLINE bool refill_streaming_buffer(Ctx& ctx, const char*& it, const char*& end) noexcept + { + if constexpr (has_streaming_state>) { + if (ctx.stream.enabled()) { + const size_t consumed = static_cast(it - ctx.stream.data()); + const char* new_it; + const char* new_end; + ctx.stream.consume_and_refill(consumed, new_it, new_end); + it = new_it; + end = new_end; + return (it < end) || !ctx.stream.at_eof(); + } + } + return false; + } + + // Complete a partial escape sequence after buffer refill + // Returns: 0 = need more data, >0 = bytes written to output, -1 = error + template + GLZ_ALWAYS_INLINE int complete_escape_sequence(auto& value, is_context auto& ctx, const char*& it, const char* end, + string_streaming_state& state) noexcept + { + // Continue filling escape buffer until we have a complete sequence + while (state.escape_bytes_pending < 12 && it < end) { + const char c = *it; + + // Check for different escape types based on what we have + if (state.escape_bytes_pending == 1) { + // We have just '\', need the escape character + state.escape_buffer[1] = c; + state.escape_bytes_pending = 2; + ++it; + + if (c != 'u') { + // Simple escape - complete + const char unescaped = char_unescape_table[uint8_t(c)]; + if (unescaped == 0) [[unlikely]] { + ctx.error = error_code::invalid_escape; + return -1; + } + value.push_back(unescaped); + state.escape_bytes_pending = 0; + state.in_escape = false; + return 1; + } + // Unicode escape - need 4 hex digits + continue; + } + + if (state.escape_bytes_pending >= 2 && state.escape_bytes_pending < 6) { + // Collecting hex digits for first \uXXXX + state.escape_buffer[state.escape_bytes_pending++] = c; + ++it; + + if (state.escape_bytes_pending == 6) { + // Have full \uXXXX - check if it's a high surrogate + const uint32_t code = hex_to_u32(state.escape_buffer + 2); + if (code == 0xFFFFFFFFu) [[unlikely]] { + ctx.error = error_code::u_requires_hex_digits; + return -1; + } + + // Check for surrogate (using constants from glaze/util/parse.hpp) + using namespace unicode; + if ((code & generic_surrogate_mask) == generic_surrogate_value) { + // It's a surrogate - check if high + if ((code & surrogate_mask) != high_surrogate_value) [[unlikely]] { + ctx.error = error_code::unicode_escape_conversion_failure; + return -1; + } + // High surrogate - need low surrogate, continue collecting + continue; + } + + // Not a surrogate - decode and output + char utf8_buf[4]; + const uint32_t len = code_point_to_utf8(code, utf8_buf); + if (len == 0) [[unlikely]] { + ctx.error = error_code::unicode_escape_conversion_failure; + return -1; + } + value.append(utf8_buf, len); + state.escape_bytes_pending = 0; + state.in_escape = false; + return static_cast(len); + } + continue; + } + + if (state.escape_bytes_pending >= 6 && state.escape_bytes_pending < 12) { + // Collecting second \uXXXX for surrogate pair + state.escape_buffer[state.escape_bytes_pending++] = c; + ++it; + + if (state.escape_bytes_pending == 12) { + // Have full surrogate pair \uXXXX\uXXXX + // Verify format: positions 6-7 should be '\u' + if (state.escape_buffer[6] != '\\' || state.escape_buffer[7] != 'u') [[unlikely]] { + ctx.error = error_code::unicode_escape_conversion_failure; + return -1; + } + + const uint32_t high = hex_to_u32(state.escape_buffer + 2); + const uint32_t low = hex_to_u32(state.escape_buffer + 8); + + if (high == 0xFFFFFFFFu || low == 0xFFFFFFFFu) [[unlikely]] { + ctx.error = error_code::u_requires_hex_digits; + return -1; + } + + // Verify low surrogate (using constants from glaze/util/parse.hpp) + using namespace unicode; + if ((low & surrogate_mask) != low_surrogate_value) [[unlikely]] { + ctx.error = error_code::unicode_escape_conversion_failure; + return -1; + } + + // Decode surrogate pair + uint32_t code_point = (high & surrogate_codepoint_mask) << surrogate_codepoint_bits; + code_point |= (low & surrogate_codepoint_mask); + code_point += surrogate_codepoint_offset; + + char utf8_buf[4]; + const uint32_t len = code_point_to_utf8(code_point, utf8_buf); + if (len == 0) [[unlikely]] { + ctx.error = error_code::unicode_escape_conversion_failure; + return -1; + } + value.append(utf8_buf, len); + state.escape_bytes_pending = 0; + state.in_escape = false; + return static_cast(len); + } + continue; + } + } + + // Need more data + return 0; + } + + // Decode string content byte-by-byte (used near buffer boundaries) + // Returns: true = continue parsing, false = end of string or error + template + GLZ_ALWAYS_INLINE bool decode_byte_by_byte_streaming(auto& value, is_context auto& ctx, const char*& it, + const char* end, string_streaming_state& state) noexcept + { + while (it < end) { + // First check if we have a partial escape to complete + if (state.in_escape) { + const int result = complete_escape_sequence(value, ctx, it, end, state); + if (result < 0) { + return false; // Error + } + if (result == 0) { + return true; // Need refill + } + continue; + } + + const char c = *it; + + if (c == '"') { + // End of string + return false; + } + + if ((static_cast(c) < 0x20)) [[unlikely]] { + // Control character - not allowed unescaped + ctx.error = error_code::syntax_error; + return false; + } + + if (c != '\\') { + // Regular character + value.push_back(c); + ++it; + continue; + } + + // Start of escape sequence + ++it; + if (it >= end) { + // Backslash at buffer boundary - save state + state.escape_buffer[0] = '\\'; + state.escape_bytes_pending = 1; + state.in_escape = true; + return true; // Need refill + } + + const char escape_char = *it; + ++it; + + if (escape_char == 'u') { + // Unicode escape - check if we have enough bytes + if (static_cast(end - it) < 4) { + // Not enough bytes - save state + state.escape_buffer[0] = '\\'; + state.escape_buffer[1] = 'u'; + state.escape_bytes_pending = 2; + // Copy available hex digits + while (it < end && state.escape_bytes_pending < 6) { + state.escape_buffer[state.escape_bytes_pending++] = *it++; + } + state.in_escape = true; + return true; // Need refill + } + + // Have 4 hex digits available + const uint32_t code = hex_to_u32(it); + if (code == 0xFFFFFFFFu) [[unlikely]] { + ctx.error = error_code::u_requires_hex_digits; + return false; + } + it += 4; + + // Check for surrogate (using constants from glaze/util/parse.hpp) + using namespace unicode; + if ((code & generic_surrogate_mask) == generic_surrogate_value) { + // Surrogate detected + if ((code & surrogate_mask) != high_surrogate_value) [[unlikely]] { + ctx.error = error_code::unicode_escape_conversion_failure; + return false; + } + + // High surrogate - need low surrogate + if (static_cast(end - it) < 6) { + // Not enough for \uXXXX - save state + state.escape_buffer[0] = '\\'; + state.escape_buffer[1] = 'u'; + // Encode high surrogate back to hex + constexpr char hex_digits[] = "0123456789ABCDEF"; + state.escape_buffer[2] = hex_digits[(code >> 12) & 0xF]; + state.escape_buffer[3] = hex_digits[(code >> 8) & 0xF]; + state.escape_buffer[4] = hex_digits[(code >> 4) & 0xF]; + state.escape_buffer[5] = hex_digits[code & 0xF]; + state.escape_bytes_pending = 6; + // Copy available bytes of \uXXXX + while (it < end && state.escape_bytes_pending < 12) { + state.escape_buffer[state.escape_bytes_pending++] = *it++; + } + state.in_escape = true; + return true; // Need refill + } + + // Have enough bytes - check for \u + if (it[0] != '\\' || it[1] != 'u') [[unlikely]] { + ctx.error = error_code::unicode_escape_conversion_failure; + return false; + } + it += 2; + + const uint32_t low = hex_to_u32(it); + if (low == 0xFFFFFFFFu) [[unlikely]] { + ctx.error = error_code::u_requires_hex_digits; + return false; + } + it += 4; + + if ((low & surrogate_mask) != low_surrogate_value) [[unlikely]] { + ctx.error = error_code::unicode_escape_conversion_failure; + return false; + } + + // Decode surrogate pair + uint32_t code_point = (code & surrogate_codepoint_mask) << surrogate_codepoint_bits; + code_point |= (low & surrogate_codepoint_mask); + code_point += surrogate_codepoint_offset; + + char utf8_buf[4]; + const uint32_t len = code_point_to_utf8(code_point, utf8_buf); + value.append(utf8_buf, len); + } + else { + // Not a surrogate - simple BMP character + char utf8_buf[4]; + const uint32_t len = code_point_to_utf8(code, utf8_buf); + value.append(utf8_buf, len); + } + } + else { + // Simple escape + const char unescaped = char_unescape_table[uint8_t(escape_char)]; + if (unescaped == 0) [[unlikely]] { + ctx.error = error_code::invalid_escape; + return false; + } + value.push_back(unescaped); + } + } + + return true; // Buffer exhausted, need refill + } + + // Main streaming string parser + // Parses strings that may span multiple buffer refills + template + void parse_string_streaming(T& value, Ctx& ctx, const char*& it, const char*& end) noexcept + { + value.clear(); + string_streaming_state state{}; + + // Main parsing loop + while (true) { + const size_t remaining = static_cast(end - it); + + // Refill when buffer is getting low + if (remaining < streaming_refill_threshold) { + if (!refill_streaming_buffer(ctx, it, end)) { + // No more data available + if (state.in_escape) { + ctx.error = error_code::unexpected_end; + return; + } + // Check if we're at end of string + if (it < end && *it == '"') { + ++it; + return; + } + ctx.error = error_code::unexpected_end; + return; + } + } + + // Use byte-by-byte parsing (simpler, handles all edge cases) + // Could add SWAR fast path here for large chunks, but byte-by-byte + // is sufficient for streaming where I/O is the bottleneck + if (!decode_byte_by_byte_streaming(value, ctx, it, end, state)) { + if (bool(ctx.error)) { + return; + } + // End of string found + ++it; // Skip closing quote + return; + } + + // decode_byte_by_byte_streaming returned true = buffer exhausted, continue loop to refill + } + } + template requires(string_t && !u8str_t) struct from @@ -1096,6 +1456,14 @@ namespace glz } } + // Streaming string path - handles strings larger than buffer + if constexpr (has_streaming_state>) { + if (ctx.stream.enabled()) { + parse_string_streaming(value, ctx, it, end); + return; + } + } + if constexpr (not check_raw_string(Opts)) { static constexpr auto string_padding_bytes = 8; diff --git a/include/glaze/json/write.hpp b/include/glaze/json/write.hpp index cb4c833451..a7c35bba96 100644 --- a/include/glaze/json/write.hpp +++ b/include/glaze/json/write.hpp @@ -711,6 +711,79 @@ namespace glz } }; + // Helper function for writing strings to bounded streaming buffers + // Extracted to avoid deep nesting in the main string writer + template + GLZ_ALWAYS_INLINE void write_string_bounded_streaming(const sv str, is_context auto&& ctx, B&& b, auto& ix) + { + using buffer_t = std::remove_cvref_t; + constexpr size_t max_escape_size = check_escape_control_characters(Opts) ? 6 : 2; + constexpr size_t min_chunk_space = 2 + max_escape_size; + + auto ensure_and_flush = [&](size_t needed) -> bool { + if (ix + needed > buffer_traits::capacity(b)) { + buffer_traits::flush(b, ix, ctx); + if (bool(ctx.error)) [[unlikely]] { + return false; + } + if (ix + needed > buffer_traits::capacity(b)) { + ctx.error = error_code::buffer_overflow; + return false; + } + } + return true; + }; + + if (!ensure_and_flush(min_chunk_space)) [[unlikely]] { + return; + } + + if constexpr (not check_raw_string(Opts)) { + b[ix] = '"'; + ++ix; + } + + const auto* c = str.data(); + const auto* const e = c + str.size(); + + for (; c < e; ++c) { + if (!ensure_and_flush(max_escape_size + 1)) [[unlikely]] { + return; + } + + if (const auto escaped = char_escape_table[uint8_t(*c)]; escaped) { + std::memcpy(&b[ix], &escaped, 2); + ix += 2; + } + else if constexpr (check_escape_control_characters(Opts)) { + if (uint8_t(*c) < 0x20) { + char unicode_escape[6] = {'\\', 'u', '0', '0', '0', '0'}; + constexpr char hex_digits[] = "0123456789ABCDEF"; + unicode_escape[4] = hex_digits[(uint8_t(*c) >> 4) & 0xF]; + unicode_escape[5] = hex_digits[uint8_t(*c) & 0xF]; + std::memcpy(&b[ix], unicode_escape, 6); + ix += 6; + } + else { + b[ix] = *c; + ++ix; + } + } + else { + b[ix] = *c; + ++ix; + } + } + + if constexpr (not check_raw_string(Opts)) { + if (!ensure_and_flush(1)) [[unlikely]] { + return; + } + b[ix] = '"'; + ++ix; + } + } + template requires str_t || char_t struct to @@ -829,6 +902,12 @@ namespace glz }(); const auto n = str.size(); + // For bounded streaming buffers, write incrementally with flushing + if constexpr (is_bounded_output_streaming) { + write_string_bounded_streaming(str, ctx, b, ix); + return; + } + // In the case n == 0 we need two characters for quotes. // For each individual character we need room for two characters to handle escapes. // When using Unicode escapes, we might need up to 6 characters (\uXXXX) per character diff --git a/tests/istream_buffer_test/istream_buffer_test.cpp b/tests/istream_buffer_test/istream_buffer_test.cpp index 926fac7986..c2987a8ca8 100644 --- a/tests/istream_buffer_test/istream_buffer_test.cpp +++ b/tests/istream_buffer_test/istream_buffer_test.cpp @@ -3961,4 +3961,247 @@ suite additional_edge_cases = [] { }; }; +// ============================================================================ +// STREAMING STRING PARSING TESTS +// ============================================================================ + +suite streaming_string_tests = [] { + "string larger than buffer"_test = [] { + // 2KB string in 512-byte buffer + std::string large_string(2000, 'x'); + std::string json = "\"" + large_string + "\""; + + slow_stringbuf sbuf(json, 64); // 64 bytes per read + std::istream slow_stream(&sbuf); + glz::basic_istream_buffer buffer(slow_stream); + + std::string result; + auto ec = glz::read_json(result, buffer); + + expect(!ec) << "Should parse large string successfully"; + expect(result == large_string) << "String content should match"; + }; + + "string with simple escapes spanning buffer"_test = [] { + // Create string with many escape sequences + std::string content; + for (int i = 0; i < 200; ++i) { + content += "a\\nb\\tc\\\"d\\\\e"; + } + std::string json = "\"" + content + "\""; + + slow_stringbuf sbuf(json, 32); // Small chunks + std::istream slow_stream(&sbuf); + glz::basic_istream_buffer buffer(slow_stream); + + std::string result; + auto ec = glz::read_json(result, buffer); + + expect(!ec) << "Should parse escaped string successfully"; + + // Verify escapes were decoded + std::string expected; + for (int i = 0; i < 200; ++i) { + expected += "a\nb\tc\"d\\e"; + } + expect(result == expected) << "Escapes should be decoded correctly"; + }; + + "unicode escape at buffer boundary"_test = [] { + // Test \uXXXX escapes at various positions relative to buffer boundary + for (size_t chunk_size = 3; chunk_size <= 16; ++chunk_size) { + // Create string with unicode escapes + std::string json = R"("Hello \u0041\u0042\u0043 World")"; // \u0041 = 'A', etc. + + slow_stringbuf sbuf(json, chunk_size); + std::istream slow_stream(&sbuf); + glz::basic_istream_buffer buffer(slow_stream); + + std::string result; + auto ec = glz::read_json(result, buffer); + + expect(!ec) << "Chunk size " << chunk_size << " should work"; + expect(result == "Hello ABC World") << "Unicode should decode correctly"; + } + }; + + "surrogate pair spanning buffer"_test = [] { + // Surrogate pair for emoji: U+1F600 (grinning face) = \uD83D\uDE00 + for (size_t chunk_size = 4; chunk_size <= 20; ++chunk_size) { + std::string json = R"("Test \uD83D\uDE00 emoji")"; + + slow_stringbuf sbuf(json, chunk_size); + std::istream slow_stream(&sbuf); + glz::basic_istream_buffer buffer(slow_stream); + + std::string result; + auto ec = glz::read_json(result, buffer); + + expect(!ec) << "Chunk size " << chunk_size << " should work"; + // U+1F600 in UTF-8 is F0 9F 98 80 + expect(result == "Test \xF0\x9F\x98\x80 emoji") << "Surrogate pair should decode correctly"; + } + }; + + "multiple surrogate pairs in large string"_test = [] { + // Multiple emojis in a string larger than buffer + std::string json = "\""; + std::string expected; + for (int i = 0; i < 100; ++i) { + json += "text\\uD83D\\uDE00"; // Add escaped emoji + expected += "text\xF0\x9F\x98\x80"; // Expected UTF-8 + } + json += "\""; + + slow_stringbuf sbuf(json, 48); + std::istream slow_stream(&sbuf); + glz::basic_istream_buffer buffer(slow_stream); + + std::string result; + auto ec = glz::read_json(result, buffer); + + expect(!ec) << "Should parse multiple surrogates"; + expect(result == expected) << "All surrogates should decode correctly"; + }; + + "backslash at exact buffer boundary"_test = [] { + // Test backslash at every position relative to buffer size + for (size_t padding = 0; padding < 20; ++padding) { + std::string filler(padding, 'a'); + std::string json = "\"" + filler + "\\n" + "rest\""; + + slow_stringbuf sbuf(json, padding + 2); // Force split at backslash + std::istream slow_stream(&sbuf); + glz::basic_istream_buffer buffer(slow_stream); + + std::string result; + auto ec = glz::read_json(result, buffer); + + expect(!ec) << "Padding " << padding << " should work"; + expect(result == filler + "\n" + "rest") << "Newline should decode correctly"; + } + }; + + "object with large string field"_test = [] { + std::string large_value(3000, 'y'); + std::string json = R"({"id":42,"name":")" + large_value + R"("})"; + + slow_stringbuf sbuf(json, 100); + std::istream slow_stream(&sbuf); + glz::basic_istream_buffer buffer(slow_stream); + + Record r; + auto ec = glz::read_json(r, buffer); + + expect(!ec) << "Should parse object with large string"; + expect(r.id == 42); + expect(r.name == large_value); + }; + + "array of large strings"_test = [] { + std::vector original; + std::string json = "["; + for (int i = 0; i < 10; ++i) { + if (i > 0) json += ","; + std::string s(500 + i * 100, 'a' + i); + json += "\"" + s + "\""; + original.push_back(s); + } + json += "]"; + + slow_stringbuf sbuf(json, 128); + std::istream slow_stream(&sbuf); + glz::basic_istream_buffer buffer(slow_stream); + + std::vector result; + auto ec = glz::read_json(result, buffer); + + expect(!ec) << "Should parse array of large strings"; + expect(result.size() == 10u); + expect(result == original); + }; + + "empty string with slow streaming"_test = [] { + std::string json = R"("")"; + + slow_stringbuf sbuf(json, 1); // Byte by byte + std::istream slow_stream(&sbuf); + glz::basic_istream_buffer buffer(slow_stream); + + std::string result = "initial"; + auto ec = glz::read_json(result, buffer); + + expect(!ec) << "Should parse empty string"; + expect(result.empty()) << "Result should be empty"; + }; + + "string with all escape types"_test = [] { + // All JSON escape sequences + std::string json = R"("quote:\" backslash:\\ slash:\/ backspace:\b formfeed:\f newline:\n return:\r tab:\t unicode:\u0048\u0069")"; + + slow_stringbuf sbuf(json, 10); + std::istream slow_stream(&sbuf); + glz::basic_istream_buffer buffer(slow_stream); + + std::string result; + auto ec = glz::read_json(result, buffer); + + expect(!ec) << "Should parse all escape types"; + expect(result == "quote:\" backslash:\\ slash:/ backspace:\b formfeed:\f newline:\n return:\r tab:\t unicode:Hi"); + }; + + "byte-by-byte streaming large string"_test = [] { + std::string large(1500, 'z'); + std::string json = "\"" + large + "\""; + + slow_stringbuf sbuf(json, 1); // Extreme: 1 byte at a time + std::istream slow_stream(&sbuf); + glz::basic_istream_buffer buffer(slow_stream); + + std::string result; + auto ec = glz::read_json(result, buffer); + + expect(!ec) << "Should handle byte-by-byte streaming"; + expect(result == large); + }; + + "unicode BMP characters"_test = [] { + // Various BMP (Basic Multilingual Plane) characters + std::string json = R"("\u00E9\u00F1\u00FC\u4E2D\u6587")"; // é ñ ü 中 文 + + slow_stringbuf sbuf(json, 8); + std::istream slow_stream(&sbuf); + glz::basic_istream_buffer buffer(slow_stream); + + std::string result; + auto ec = glz::read_json(result, buffer); + + expect(!ec); + // UTF-8: é=C3A9, ñ=C3B1, ü=C3BC, 中=E4B8AD, 文=E69687 + expect(result == "\xC3\xA9\xC3\xB1\xC3\xBC\xE4\xB8\xAD\xE6\x96\x87"); + }; + + "round trip large string"_test = [] { + // Write a large string, then read it back with streaming + std::string original(5000, 'q'); + for (size_t i = 0; i < original.size(); i += 100) { + original[i] = '\n'; // Add some escapes + } + + std::string json; + auto wec = glz::write_json(original, json); + expect(!wec); + + slow_stringbuf sbuf(json, 200); + std::istream slow_stream(&sbuf); + glz::basic_istream_buffer buffer(slow_stream); + + std::string result; + auto rec = glz::read_json(result, buffer); + + expect(!rec) << "Should read back large string"; + expect(result == original) << "Round trip should preserve content"; + }; +}; + int main() { return 0; } diff --git a/tests/ostream_buffer_test/ostream_buffer_test.cpp b/tests/ostream_buffer_test/ostream_buffer_test.cpp index 81a0e988db..0f49023625 100644 --- a/tests/ostream_buffer_test/ostream_buffer_test.cpp +++ b/tests/ostream_buffer_test/ostream_buffer_test.cpp @@ -916,6 +916,221 @@ suite streaming_special_types_output_tests = [] { }; }; +// ============================================================================ +// BOUNDED OSTREAM BUFFER TESTS +// ============================================================================ + +// Test struct for bounded buffer tests +struct LargeStringObject +{ + std::string large_string{}; +}; + +template <> +struct glz::meta +{ + using T = LargeStringObject; + static constexpr auto value = object("large_string", &T::large_string); +}; + +suite bounded_ostream_buffer_tests = [] { + "bounded_ostream_buffer traits"_test = [] { + using buffer_t = glz::bounded_ostream_buffer; + using traits = glz::buffer_traits; + + static_assert(!traits::is_resizable); + static_assert(traits::has_bounded_capacity); + static_assert(traits::is_output_streaming); + static_assert(glz::is_bounded_output_streaming); + }; + + "bounded_ostream_buffer basic operation"_test = [] { + std::ostringstream oss; + glz::bounded_ostream_buffer buf{oss}; + + expect(buf.physical_capacity() == 512u); + expect(buf.effective_capacity() == 512u); + expect(buf.bytes_flushed() == 0u); + }; + + "bounded_ostream_buffer simple write"_test = [] { + std::ostringstream oss; + glz::bounded_ostream_buffer buf{oss}; + + TestObject obj{42, "test", 3.14}; + auto ec = glz::write_json(obj, buf); + + expect(!ec); + expect(oss.str() == R"({"id":42,"name":"test","value":3.14})"); + }; + + "bounded_ostream_buffer large string exceeds buffer"_test = [] { + std::ostringstream oss; + // Use 512 byte buffer for a 2000 character string + glz::bounded_ostream_buffer buf{oss}; + + // Create a string larger than the buffer + LargeStringObject obj{.large_string = std::string(2000, 'a')}; + + auto ec = glz::write_json(obj, buf); + expect(!ec) << "Write should succeed with incremental flushing"; + + // Verify round-trip + LargeStringObject parsed; + auto parse_ec = glz::read_json(parsed, oss.str()); + expect(!parse_ec) << "Parse should succeed"; + expect(parsed.large_string == obj.large_string) << "Round-trip should preserve data"; + + // Verify flushing occurred + expect(buf.bytes_flushed() > 512u) << "Buffer should have flushed data"; + }; + + "bounded_ostream_buffer string with escapes exceeds buffer"_test = [] { + std::ostringstream oss; + glz::bounded_ostream_buffer buf{oss}; + + // Create string with escape characters that exceeds buffer + std::string s; + for (int i = 0; i < 200; ++i) { + s += "a\"b\\c\n"; // Contains characters that need escaping + } + LargeStringObject obj{.large_string = s}; + + auto ec = glz::write_json(obj, buf); + expect(!ec) << "Write with escapes should succeed"; + + // Verify round-trip + LargeStringObject parsed; + auto parse_ec = glz::read_json(parsed, oss.str()); + expect(!parse_ec) << "Parse should succeed"; + expect(parsed.large_string == obj.large_string) << "Round-trip should preserve escaped data"; + }; + + "bounded_ostream_buffer capacity grows with flush"_test = [] { + std::ostringstream oss; + glz::bounded_ostream_buffer buf{oss}; + + LargeStringObject obj{.large_string = std::string(2000, 'x')}; + + size_t initial_capacity = buf.effective_capacity(); + expect(initial_capacity == 512u); + + auto ec = glz::write_json(obj, buf); + expect(!ec); + + // After flushing, effective capacity should have grown + size_t final_capacity = buf.effective_capacity(); + expect(final_capacity > initial_capacity) << "Capacity should grow with flushing"; + expect(final_capacity == buf.bytes_flushed() + 512u); + }; + + "bounded_ostream_buffer reset"_test = [] { + std::ostringstream oss; + glz::bounded_ostream_buffer buf{oss}; + + LargeStringObject obj{.large_string = std::string(1000, 'y')}; + auto ec = glz::write_json(obj, buf); + expect(!ec); + + expect(buf.bytes_flushed() > 0u); + + buf.reset(); + + expect(buf.bytes_flushed() == 0u); + expect(buf.effective_capacity() == 512u); + }; + + "bounded_ostream_buffer good and fail"_test = [] { + std::ostringstream oss; + glz::bounded_ostream_buffer buf{oss}; + + expect(buf.good()); + expect(!buf.fail()); + expect(buf.stream() == &oss); + }; + + "bounded_ostream_buffer large array"_test = [] { + std::ostringstream oss; + glz::bounded_ostream_buffer buf{oss}; + + // Large array that exceeds buffer + std::vector arr(500); + for (int i = 0; i < 500; ++i) { + arr[i] = i; + } + + auto ec = glz::write_json(arr, buf); + expect(!ec); + + // Verify round-trip + std::vector parsed; + auto parse_ec = glz::read_json(parsed, oss.str()); + expect(!parse_ec); + expect(parsed == arr); + }; + + "bounded_ostream_buffer nested structure"_test = [] { + std::ostringstream oss; + glz::bounded_ostream_buffer buf{oss}; + + std::map> nested; + for (int i = 0; i < 20; ++i) { + std::vector values; + for (int j = 0; j < 10; ++j) { + values.push_back(std::string(50, 'a' + j)); + } + nested["key" + std::to_string(i)] = values; + } + + auto ec = glz::write_json(nested, buf); + expect(!ec); + + // Verify round-trip + std::map> parsed; + auto parse_ec = glz::read_json(parsed, oss.str()); + expect(!parse_ec); + expect(parsed == nested); + }; + + "bounded_ostream_buffer stream error detection"_test = [] { + // Create a stream that will fail after some writes + class failing_streambuf : public std::streambuf + { + size_t bytes_written_ = 0; + size_t fail_after_; + + public: + failing_streambuf(size_t fail_after) : fail_after_(fail_after) {} + + protected: + std::streamsize xsputn(const char*, std::streamsize n) override + { + if (bytes_written_ + static_cast(n) > fail_after_) { + return 0; // Simulate write failure + } + bytes_written_ += static_cast(n); + return n; + } + + int overflow(int) override { return EOF; } + }; + + // Create stream that fails after 100 bytes + failing_streambuf sbuf(100); + std::ostream failing_stream(&sbuf); + + glz::bounded_ostream_buffer buf{failing_stream}; + + // Try to write a large string that will trigger flush and fail + LargeStringObject obj{.large_string = std::string(2000, 'x')}; + + auto ec = glz::write_json(obj, buf); + + // Should detect the stream write error + expect(ec.ec == glz::error_code::send_error) << "Expected send_error, got: " << static_cast(ec.ec); + }; +}; + // ============================================================================ // DOCUMENTATION EXAMPLES AS TESTS // ============================================================================