diff --git a/CLAUDE.md b/CLAUDE.md
index 1370347..06c8179 100644
--- a/CLAUDE.md
+++ b/CLAUDE.md
@@ -53,7 +53,7 @@ FFT-based extreme time-stretching library (Paulstretch algorithm). Single transl
 ### Public classes
 
 - **`StreamingStretcher`** — Block-based push/pull primitive for realtime use (AudioWorklet / Web Worker). The host calls `next_input_size()` to learn how many input frames `step()` wants next, gathers exactly that many (zero-padding if the source ran out), calls `step()` to produce `bufsize()` output frames, then advances its input cursor by an additional `skip_after_step()` frames. The first `step()` call expects `max_input_chunk()` (= 3 × bufsize) frames for the initial fill; subsequent requests alternate between 0 and `bufsize()` depending on stretch factor and onset detection. `step_without_onset_feedback()` + `apply_onset()` exists for hosts that need to coordinate onsets across channels.
-- **`OfflineRenderer`** — Convenience wrapper around `StreamingStretcher` for whole-buffer rendering (`render_mono`, `render_stereo`). Stereo rendering runs two independent `StreamingStretcher`s but synchronizes onset detection (`max` of both channels) so the channels stay aligned.
+- **`OfflineRenderer`** — Convenience wrapper around `StreamingStretcher` for whole-buffer rendering (`render_mono`, `render_stereo`). Stereo rendering steps two independent `StreamingStretcher`s together from a shared input cursor; onset detection is per channel (each `step()` applies its own onset — the earlier `max`-of-both-channels coordination was dropped). `render_mono_chunked` / `render_stereo_chunked` run the same DSP loop but deliver the output one `bufsize()`-frame chunk at a time via a `ChunkSink` callback, so peak memory stays bounded for very long outputs (the buffered path holds the whole result — and on WASM a second JS-side copy of it — in linear memory at once, which can exceed the heap and abort). The buffered and chunked paths share a common `stream_channel` / `stream_stereo` core in `src/paulstretch.cpp`.
 - **`BinauralBeatsProcessor`** — Independent post-processor that mixes the stretched signal toward mono and adds a sub-audio LFO-style beat between L/R (`set_options`, optional frequency envelope, `process(left, right, nframes, position_pct)`).
 
 ### Key data flow
diff --git a/CMakeLists.txt b/CMakeLists.txt
index 6c83c82..dc1c6ab 100644
--- a/CMakeLists.txt
+++ b/CMakeLists.txt
@@ -101,6 +101,10 @@ if (PAULSTRETCH_BUILD_TESTS AND NOT EMSCRIPTEN)
 		COMMAND paulstretch_streaming_test
 			${CMAKE_CURRENT_SOURCE_DIR}/tests/plenty_of_unknown.wav
 	)
+
+	add_executable(paulstretch_chunked_test tests/chunked_test.cpp)
+	target_link_libraries(paulstretch_chunked_test PRIVATE paulstretch_core)
+	add_test(NAME paulstretch_chunked_test COMMAND paulstretch_chunked_test)
 endif()
 
 if (PAULSTRETCH_BUILD_EXAMPLES AND NOT EMSCRIPTEN)
@@ -125,6 +129,11 @@ if (EMSCRIPTEN AND PAULSTRETCH_BUILD_WASM)
 		-O3
 		--bind
 		-sALLOW_MEMORY_GROWTH=1
+		# Raise the growth ceiling to the wasm32 maximum (4 GiB). Emscripten
+		# defaults to 2 GiB, which an hour-plus offline render can exceed and
+		# abort. For unbounded output lengths, prefer renderMonoChunked /
+		# renderStereoChunked, which keep linear memory bounded regardless.
+		-sMAXIMUM_MEMORY=4294967296
 		# Default ENVIRONMENT (web,webview,worker,node) — works in browsers,
 		# bundlers (Vite/Webpack), Node, and Web Workers.
 		-sEXPORT_ES6=1
diff --git a/README.md b/README.md
index 7271989..05a9329 100644
--- a/README.md
+++ b/README.md
@@ -66,6 +66,19 @@ auto [left, right] = renderer.render_stereo(left_in, right_in);
 
 Stereo rendering runs two independent stretchers but synchronizes onset detection across channels so they stay phase-aligned.
 
+#### Very long outputs (chunked rendering)
+
+`render_mono` returns the whole result in one `std::vector`. For an extreme stretch — a few seconds blown up several hundred times into an hour-plus of audio — that buffer can be enormous, and on the WebAssembly build it has to live in linear memory twice over (the C++ vector plus the returned `Float32Array`), which can exceed the WASM heap and abort. `render_mono_chunked` / `render_stereo_chunked` run the identical algorithm but hand each `bufsize()`-frame chunk to a sink as it is produced, so peak memory stays bounded regardless of output length:
+
+```cpp
+renderer.render_mono_chunked(input, [&](const float *data, int frames) {
+    // Consume the chunk — append to a buffer, write to disk, feed an encoder.
+});
+
+renderer.render_stereo_chunked(left_in, right_in,
+    [&](const float *left, const float *right, int frames) { /* ... */ });
+```
+
 ### Streaming (realtime) rendering
 
 `StreamingStretcher` is a block-based push/pull primitive for realtime hosts (audio callback, AudioWorklet, Web Worker). The host gathers exactly the number of input frames the stretcher asks for, calls `step()` to produce one output chunk, then advances its input cursor by the additional skip distance:
@@ -159,77 +172,9 @@ int width = paulstretch::fft_simd_size();              // 4
 
 ## Node.js / WASM usage
 
-```js
-import createPaulstretchModule from "@olilarkin/paulstretch-wasm";
-
-const Module = await createPaulstretchModule();
-const renderer = new Module.OfflineRenderer(8.0, 4096, 48000, Module.Window.Hann, 0.0);
-
-const output = renderer.renderMono(input);
-const { left, right } = renderer.renderStereo(leftChannel, rightChannel);
-
-renderer.delete(); // embind objects are not GC'd
-```
-
-### Streaming
-
-```js
-const s = new Module.StreamingStretcher(8.0, 4096, 48000, Module.Window.Hann, 0.0);
-
-while (rendering) {
-    const want = s.nextInputSize();
-    const input = gatherFrames(want); // Float32Array, zero-pad if needed
-    const { output, onset } = s.step(input, positionPct);
-    writeFrames(output);
-    inputCursor += want + s.skipAfterStep();
-}
-s.delete();
-```
-
-For multichannel hosts that need synchronized onsets across channels, use `stepWithoutOnsetFeedback()` on every channel, take the max onset, then call `applyOnset()` on every channel before the next iteration.
-
-### Stretch envelope
-
-Pass parallel arrays of positions (0–1) and multiplier values:
-
-```js
-renderer.setStretchEnvelope(
-    new Float32Array([0, 0.5, 1.0]),
-    new Float32Array([1.0, 4.0, 1.0]),
-);
-const output = renderer.renderMono(input);
-renderer.clearStretchEnvelope();
-```
-
-### Spectral processing
-
-`setProcessOptions` accepts a plain JS object with camelCase keys (e.g. `pitchShiftEnabled`, `pitchShiftCents`, `filterEnabled`, `filterLowHz`); unspecified fields keep their defaults:
+The Emscripten build is published as [`@olilarkin/paulstretch-wasm`](https://github.com/olilarkin/libpaulstretch/pkgs/npm/paulstretch-wasm). See [`npm/README.md`](npm/README.md) for installation, the full JS API (offline, streaming, envelope, spectral processing, binaural beats), and bundler notes. Type definitions live in [`npm/index.d.ts`](npm/index.d.ts).
 
-```js
-renderer.setProcessOptions({
-    pitchShiftEnabled: true,
-    pitchShiftCents: 700,
-    filterEnabled: true,
-    filterLowHz: 200,
-    filterHighHz: 4000,
-});
-```
-
-See `npm/index.d.ts` for the full `ProcessOptions` shape.
-
-### Binaural beats
-
-```js
-const bb = new Module.BinauralBeatsProcessor(48000);
-bb.setOptions({
-    enabled: true,
-    stereoMode: Module.BinauralStereoMode.LeftRight,
-    mono: 0.5,
-    beatFrequencyHz: 8,
-});
-const { left, right } = bb.process(leftIn, rightIn, positionPct);
-bb.delete();
-```
+The C++ and JS APIs are 1:1 — JS methods use camelCase versions of the C++ names, and `setProcessOptions` accepts a plain object instead of a `ProcessOptions` struct.
 
 ## Notes
 
diff --git a/include/paulstretch/paulstretch.h b/include/paulstretch/paulstretch.h
index cf716cf..ae553d5 100644
--- a/include/paulstretch/paulstretch.h
+++ b/include/paulstretch/paulstretch.h
@@ -1,6 +1,7 @@
 #pragma once
 
 #include <cstddef>
+#include <functional>
 #include <memory>
 #include <string>
 #include <vector>
@@ -190,6 +191,22 @@ class OfflineRenderer {
     StereoBuffer render_stereo(const std::vector<float> &left,
                                const std::vector<float> &right) const;
 
+    // Chunked offline rendering. Same algorithm as render_mono/render_stereo, but
+    // the output is handed to `sink` one chunk at a time (`bufsize()` frames each)
+    // rather than materialised in one buffer, so peak memory stays bounded for any
+    // output length — essential for the WASM build, whose linear memory is capped
+    // well below what an hour-plus render would need. Pointers passed to `sink` are
+    // only valid during the call; copy out anything you keep. Empty input makes no
+    // sink calls; render_stereo_chunked throws std::invalid_argument on unequal lengths.
+    using ChunkSink = std::function<void(const float *data, int frames)>;
+    using StereoChunkSink =
+        std::function<void(const float *left, const float *right, int frames)>;
+    void render_mono_chunked(const std::vector<float> &input,
+                             const ChunkSink &sink) const;
+    void render_stereo_chunked(const std::vector<float> &left,
+                               const std::vector<float> &right,
+                               const StereoChunkSink &sink) const;
+
     std::size_t estimate_output_frames(std::size_t input_frames) const;
 
 private:
diff --git a/npm/README.md b/npm/README.md
index cdb7c8b..f86b402 100644
--- a/npm/README.md
+++ b/npm/README.md
@@ -54,6 +54,29 @@ renderer.delete();
 const { left, right } = renderer.renderStereo(leftIn, rightIn);
 ```
 
+### Very long outputs (chunked rendering)
+
+`renderMono` returns the whole result in one `Float32Array`, which has to live in
+WebAssembly memory twice over (the internal buffer plus the returned copy). A large
+stretch — e.g. a few seconds stretched several hundred times into an hour-plus of
+audio — can exceed the WASM heap and abort. For those cases use `renderMonoChunked`
+(or `renderStereoChunked`): same algorithm, but the output is delivered one chunk at
+a time so peak WASM memory stays bounded regardless of length. Accumulate the chunks
+on the JS heap, or stream them straight to disk / an encoder.
+
+```js
+const chunks = [];
+const totalFrames = renderer.renderMonoChunked(input, (chunk) => {
+  // `chunk` is a fresh Float32Array you may keep (~fftSize frames).
+  chunks.push(chunk);
+});
+
+// Stereo: callback receives (left, right) per chunk.
+// const totalFrames = renderer.renderStereoChunked(leftIn, rightIn, (l, r) => { ... });
+
+renderer.delete();
+```
+
 ### Time-varying stretch (breakpoint envelope)
 
 Positions are normalized `0..1` over the input. Values multiply the `stretch` you passed to the constructor.
diff --git a/npm/index.d.ts b/npm/index.d.ts
index a2b7d8e..4813f62 100644
--- a/npm/index.d.ts
+++ b/npm/index.d.ts
@@ -68,6 +68,23 @@ export interface OfflineRenderer {
   renderMono(input: Float32Array): Float32Array;
   renderStereo(left: Float32Array, right: Float32Array): StereoBuffer;
 
+  /**
+   * Chunked offline render — use this instead of `renderMono` for very long
+   * outputs (large stretch factors). The whole result of `renderMono` must live
+   * in WASM linear memory twice over (the C++ buffer plus the returned copy),
+   * so an hour-plus render can exceed the heap cap and abort. `renderMonoChunked`
+   * instead invokes `onChunk` with each ~`bufsize` slice; copy/accumulate it on
+   * the JS heap (or stream it to disk / an encoder) as it arrives. Peak WASM
+   * memory stays bounded regardless of output length. The Float32Array passed to
+   * `onChunk` is a fresh JS-heap copy you may keep. Returns the total frame count.
+   */
+  renderMonoChunked(input: Float32Array, onChunk: (chunk: Float32Array) => void): number;
+  renderStereoChunked(
+    left: Float32Array,
+    right: Float32Array,
+    onChunk: (left: Float32Array, right: Float32Array) => void,
+  ): number;
+
   /**
    * Set a time-varying stretch multiplier. Positions are normalized 0..1
    * over the input duration; values are multipliers on the constructor's
diff --git a/src/paulstretch.cpp b/src/paulstretch.cpp
index 0b22b0a..5b5cc5f 100644
--- a/src/paulstretch.cpp
+++ b/src/paulstretch.cpp
@@ -991,13 +991,17 @@ namespace {
 
 // ── Offline render glue (built on StreamingStretcher) ──────────────────────
 
-std::vector<float> render_channel(
+// Core streaming loop: drives a StreamingStretcher over `input` and delivers
+// each `bufsize` output chunk to `sink` (never materialising the full output).
+// Both render_channel (whole-buffer) and render_mono_chunked build on this.
+void stream_channel(
     const std::vector<float> &input,
     const RenderOptions &options,
     const std::vector<Breakpoint> &envelope,
     const ProcessOptions &process_options,
-    const std::vector<Breakpoint> &arbitrary_filter) {
-    if (input.empty()) return {};
+    const std::vector<Breakpoint> &arbitrary_filter,
+    const OfflineRenderer::ChunkSink &sink) {
+    if (input.empty()) return;
 
     StreamingStretcher stretch(options);
     if (!envelope.empty()) stretch.set_stretch_envelope(envelope);
@@ -1007,8 +1011,6 @@ std::vector<float> render_channel(
     const int bufsize = stretch.bufsize();
     std::vector<float> out_chunk(bufsize, 0.0f);
     std::vector<float> in_chunk(stretch.max_input_chunk(), 0.0f);
-    std::vector<float> output;
-    output.reserve(static_cast<std::size_t>(std::ceil(input.size() * options.stretch)) + bufsize);
 
     std::size_t cursor = 0;
     bool first = true;
@@ -1030,14 +1032,98 @@ std::vector<float> render_channel(
         }
 
         stretch.step(want > 0 ? in_chunk.data() : nullptr, pos_pct, out_chunk.data());
-        output.insert(output.end(), out_chunk.begin(), out_chunk.end());
+        sink(out_chunk.data(), bufsize);
         cursor = clamp_advance(cursor, stretch.skip_after_step(), input.size());
         first = false;
     }
+}
+
+std::vector<float> render_channel(
+    const std::vector<float> &input,
+    const RenderOptions &options,
+    const std::vector<Breakpoint> &envelope,
+    const ProcessOptions &process_options,
+    const std::vector<Breakpoint> &arbitrary_filter) {
+    if (input.empty()) return {};
+
+    std::vector<float> output;
+    output.reserve(static_cast<std::size_t>(std::ceil(input.size() * options.stretch)) + options.fft_size);
+
+    stream_channel(input, options, envelope, process_options, arbitrary_filter,
+                   [&output](const float *data, int frames) {
+                       output.insert(output.end(), data, data + frames);
+                   });
 
     return output;
 }
 
+// Stereo core loop shared by render_stereo and render_stereo_chunked: steps two
+// StreamingStretchers together and passes each `bufsize` L/R chunk pair to `sink`.
+// Expects non-empty, equal-length input: pos_pct divides by left.size().
+void stream_stereo(
+    const std::vector<float> &left,
+    const std::vector<float> &right,
+    const RenderOptions &options,
+    const std::vector<Breakpoint> &envelope,
+    const ProcessOptions &process_options,
+    const std::vector<Breakpoint> &arbitrary_filter,
+    const OfflineRenderer::StereoChunkSink &sink) {
+    StreamingStretcher stretch_left(options);
+    StreamingStretcher stretch_right(options);
+    if (!envelope.empty()) {
+        stretch_left.set_stretch_envelope(envelope);
+        stretch_right.set_stretch_envelope(envelope);
+    }
+    stretch_left.set_process_options(process_options);
+    stretch_right.set_process_options(process_options);
+    if (!arbitrary_filter.empty()) {
+        stretch_left.set_arbitrary_filter(arbitrary_filter);
+        stretch_right.set_arbitrary_filter(arbitrary_filter);
+    }
+
+    const int bufsize = stretch_left.bufsize();
+    std::vector<float> in_l(stretch_left.max_input_chunk(), 0.0f);
+    std::vector<float> in_r(stretch_right.max_input_chunk(), 0.0f);
+    std::vector<float> out_l(bufsize, 0.0f);
+    std::vector<float> out_r(bufsize, 0.0f);
+
+    bool first = true;
+    std::size_t cursor = 0;
+
+    while (true) {
+        const float pos_pct = 100.0f * static_cast<float>(cursor) / static_cast<float>(left.size());
+        const int want = first ? stretch_left.max_input_chunk() : stretch_left.next_input_size();
+
+        if (want > 0 && cursor >= left.size() && !first) break;  // more input wanted but none left: done
+
+        if (want > 0) {
+            const std::size_t avail = left.size() - cursor;
+            const std::size_t take = std::min<std::size_t>(avail, static_cast<std::size_t>(want));
+            std::copy_n(left.data() + cursor, take, in_l.data());
+            std::copy_n(right.data() + cursor, take, in_r.data());
+            if (take < static_cast<std::size_t>(want)) {  // source ran short: zero-pad the tail
+                std::fill_n(in_l.data() + take, want - take, 0.0f);
+                std::fill_n(in_r.data() + take, want - take, 0.0f);
+            }
+            cursor += take;
+        }
+
+        // NB: onsets are NOT coordinated across channels here (the previous
+        // offline path fed std::max(onset_l, onset_r) to both sides): step()
+        // applies each stretcher's own onset internally, so each side reacts
+        // to its own transients.
+        // NOTE(review): `want` and the skip come from stretch_left only; if the
+        // channels' onsets diverge, stretch_right may expect a different input size — confirm.
+        stretch_left.step(want > 0 ? in_l.data() : nullptr, pos_pct, out_l.data());
+        stretch_right.step(want > 0 ? in_r.data() : nullptr, pos_pct, out_r.data());
+
+        sink(out_l.data(), out_r.data(), bufsize);
+
+        cursor = clamp_advance(cursor, stretch_left.skip_after_step(), left.size());
+        first = false;
+    }
+}
+
 } // anonymous namespace
 
 // ── Public API ─────────────────────────────────────────────────────────────
@@ -1298,68 +1384,30 @@ StereoBuffer OfflineRenderer::render_stereo(const std::vector<float> &left, cons
     if (left.size() != right.size()) throw std::invalid_argument("left and right channel lengths must match");
     if (left.empty()) return {};
 
-    StreamingStretcher stretch_left(options_);
-    StreamingStretcher stretch_right(options_);
-    if (!envelope_.empty()) {
-        stretch_left.set_stretch_envelope(envelope_);
-        stretch_right.set_stretch_envelope(envelope_);
-    }
-    stretch_left.set_process_options(process_options_);
-    stretch_right.set_process_options(process_options_);
-    if (!arbitrary_filter_.empty()) {
-        stretch_left.set_arbitrary_filter(arbitrary_filter_);
-        stretch_right.set_arbitrary_filter(arbitrary_filter_);
-    }
-
-    const int bufsize = stretch_left.bufsize();
-    std::vector<float> in_l(stretch_left.max_input_chunk(), 0.0f);
-    std::vector<float> in_r(stretch_right.max_input_chunk(), 0.0f);
-    std::vector<float> out_l(bufsize, 0.0f);
-    std::vector<float> out_r(bufsize, 0.0f);
-
     StereoBuffer output;
     const std::size_t reserve = estimate_output_frames(left.size());
     output.left.reserve(reserve);
     output.right.reserve(reserve);
 
-    bool first = true;
-    std::size_t cursor = 0;
-
-    while (true) {
-        const float pos_pct = 100.0f * static_cast<float>(cursor) / static_cast<float>(left.size());
-        const int want = first ? stretch_left.max_input_chunk() : stretch_left.next_input_size();
+    stream_stereo(left, right, options_, envelope_, process_options_, arbitrary_filter_,
+                  [&output](const float *l, const float *r, int frames) {
+                      output.left.insert(output.left.end(), l, l + frames);
+                      output.right.insert(output.right.end(), r, r + frames);
+                  });
 
-        if (want > 0 && cursor >= left.size() && !first) break;
-
-        if (want > 0) {
-            const std::size_t avail = left.size() - cursor;
-            const std::size_t take = std::min<std::size_t>(avail, static_cast<std::size_t>(want));
-            std::copy_n(left.data() + cursor, take, in_l.data());
-            std::copy_n(right.data() + cursor, take, in_r.data());
-            if (take < static_cast<std::size_t>(want)) {
-                std::fill_n(in_l.data() + take, want - take, 0.0f);
-                std::fill_n(in_r.data() + take, want - take, 0.0f);
-            }
-            cursor += take;
-        }
-
-        // NB: we drop the per-channel onset coordination the previous offline
-        // path did (combining onset_l/onset_r with std::max and feeding both
-        // sides) because the StreamingStretcher's step() applies its own onset
-        // internally. For independent stereo channels with onset detection on,
-        // this means each side reacts to its own transients — usually the more
-        // correct behavior anyway.
-        stretch_left.step(want > 0 ? in_l.data() : nullptr, pos_pct, out_l.data());
-        stretch_right.step(want > 0 ? in_r.data() : nullptr, pos_pct, out_r.data());
-
-        output.left.insert(output.left.end(), out_l.begin(), out_l.end());
-        output.right.insert(output.right.end(), out_r.begin(), out_r.end());
+    return output;
+}
 
-        cursor = clamp_advance(cursor, stretch_left.skip_after_step(), left.size());
-        first = false;
-    }
+void OfflineRenderer::render_mono_chunked(const std::vector<float> &input, const ChunkSink &sink) const {
+    stream_channel(input, options_, envelope_, process_options_, arbitrary_filter_, sink);
+}
 
-    return output;
+void OfflineRenderer::render_stereo_chunked(const std::vector<float> &left,
+                                            const std::vector<float> &right,
+                                            const StereoChunkSink &sink) const {
+    if (left.size() != right.size()) throw std::invalid_argument("left and right channel lengths must match");
+    if (left.empty()) return;
+    stream_stereo(left, right, options_, envelope_, process_options_, arbitrary_filter_, sink);
 }
 
 std::size_t OfflineRenderer::estimate_output_frames(std::size_t input_frames) const {
diff --git a/src/wasm_bindings.cpp b/src/wasm_bindings.cpp
index ee49034..0b29e8c 100644
--- a/src/wasm_bindings.cpp
+++ b/src/wasm_bindings.cpp
@@ -17,12 +17,16 @@ std::vector<float> from_js_array(const emscripten::val &input) {
 	return samples;
 }
 
-emscripten::val to_js_float32_array(const std::vector<float> &input) {
-	emscripten::val output = emscripten::val::global("Float32Array").new_(input.size());
-	output.call<void>("set", emscripten::val(emscripten::typed_memory_view(input.size(), input.data())));
+emscripten::val to_js_float32_array(const float *data, std::size_t n) {
+	emscripten::val output = emscripten::val::global("Float32Array").new_(n);
+	output.call<void>("set", emscripten::val(emscripten::typed_memory_view(n, data)));
 	return output;
 }
 
+emscripten::val to_js_float32_array(const std::vector<float> &input) {
+	return to_js_float32_array(input.data(), input.size());
+}
+
 template <typename T>
 T get_or(const emscripten::val &obj, const char *key, T fallback) {
 	emscripten::val v = obj[key];
@@ -115,6 +119,36 @@ class WasmOfflineRenderer {
 		return result;
 	}
 
+	// Chunked offline render. Instead of returning one huge Float32Array (which
+	// for an hour-plus output would need to live in WASM linear memory twice over
+	// — the C++ vector plus the JS copy — and can blow the memory cap and abort),
+	// each ~bufsize() chunk is copied out to a fresh JS-heap Float32Array and
+	// handed to `onChunk`. Peak WASM memory stays ≈ input + one chunk; the full
+	// output accumulates on the JS side (or is streamed to disk / an encoder).
+	// Returns the total number of output frames delivered.
+	std::size_t renderMonoChunked(const emscripten::val &input, emscripten::val onChunk) const {
+		const std::vector<float> in = from_js_array(input);
+		std::size_t total = 0;
+		renderer_.render_mono_chunked(in, [&onChunk, &total](const float *data, int frames) {
+			onChunk(to_js_float32_array(data, static_cast<std::size_t>(frames)));
+			total += static_cast<std::size_t>(frames);
+		});
+		return total;
+	}
+
+	std::size_t renderStereoChunked(const emscripten::val &left, const emscripten::val &right,
+	                                emscripten::val onChunk) const {
+		const std::vector<float> l = from_js_array(left);
+		const std::vector<float> r = from_js_array(right);
+		std::size_t total = 0;
+		renderer_.render_stereo_chunked(l, r, [&onChunk, &total](const float *lc, const float *rc, int frames) {
+			onChunk(to_js_float32_array(lc, static_cast<std::size_t>(frames)),
+			        to_js_float32_array(rc, static_cast<std::size_t>(frames)));
+			total += static_cast<std::size_t>(frames);
+		});
+		return total;
+	}
+
 	void setStretchEnvelope(const emscripten::val &xs, const emscripten::val &ys) {
 		const int n = std::min(xs["length"].as<int>(), ys["length"].as<int>());
 		std::vector<paulstretch::Breakpoint> envelope(n);
@@ -301,6 +335,8 @@ EMSCRIPTEN_BINDINGS(paulstretch) {
 		.function("estimateOutputFrames", &WasmOfflineRenderer::estimateOutputFrames)
 		.function("renderMono", &WasmOfflineRenderer::renderMono)
 		.function("renderStereo", &WasmOfflineRenderer::renderStereo)
+		.function("renderMonoChunked", &WasmOfflineRenderer::renderMonoChunked)
+		.function("renderStereoChunked", &WasmOfflineRenderer::renderStereoChunked)
 		.function("setStretchEnvelope", &WasmOfflineRenderer::setStretchEnvelope)
 		.function("clearStretchEnvelope", &WasmOfflineRenderer::clearStretchEnvelope)
 		.function("setProcessOptions", &WasmOfflineRenderer::setProcessOptions)
diff --git a/tests/browser/index.html b/tests/browser/index.html
index 6756e8c..3aa8335 100644
--- a/tests/browser/index.html
+++ b/tests/browser/index.html
@@ -41,6 +41,31 @@
   if (env.length === flat.length) throw new Error('envelope did not change length');
 
   r.delete();
+
+  // Chunked render: same algorithm as renderMono, delivered incrementally so
+  // huge outputs never need the whole buffer in WASM memory at once. Accumulate
+  // the chunks JS-side. Per-instance PRNG seed means the samples won't be
+  // bit-identical to `flat`, but the total length must match and the callback
+  // must fire repeatedly with finite data (proving the bounded-memory path).
+  const rc = new M.OfflineRenderer(4, 1024, 44100, M.Window.Hann, 0);
+  let chunkCount = 0;
+  let chunkFrames = 0;
+  let allFinite = true;
+  const total = rc.renderMonoChunked(input, (chunk) => {
+    chunkCount++;
+    chunkFrames += chunk.length;
+    for (let i = 0; i < chunk.length; i++) {
+      if (!Number.isFinite(chunk[i])) allFinite = false;
+    }
+  });
+  log('chunked 4x: output', total, 'in', chunkCount, 'chunks');
+  if (chunkCount < 2) throw new Error('expected multiple chunks, got ' + chunkCount);
+  if (!allFinite) throw new Error('chunked output had non-finite samples');
+  if (total !== chunkFrames || total !== flat.length) {
+    throw new Error('chunked length ' + total + ' != flat length ' + flat.length);
+  }
+  rc.delete();
+
   out.dataset.status = 'ok';
 } catch (e) {
   fail((e && e.message) + '\n' + (e && e.stack));
diff --git a/tests/chunked_test.cpp b/tests/chunked_test.cpp
new file mode 100644
index 0000000..aa76cc0
--- /dev/null
+++ b/tests/chunked_test.cpp
@@ -0,0 +1,243 @@
+// Tests for the chunked offline render API (render_mono_chunked /
+// render_stereo_chunked).
+//
+// These exist so the WebAssembly build can produce very long outputs without
+// materialising the whole result in linear memory. renderMono returns the
+// output in one buffer that must live in WASM memory twice over (the internal
+// vector plus the returned copy), so a large render can exceed the wasm32 heap
+// and abort. The chunked variants run the same DSP loop but deliver the result
+// one bufsize() chunk at a time, keeping peak memory bounded.
+//
+// Part 1 checks behavioural parity with the whole-buffer path at a modest size.
+// Part 2 checks the thing that actually matters — that a representative extreme
+// render would blow the known wasm32 memory limits if buffered, while the
+// chunked path's working set stays a tiny fraction of them. (The native host is
+// 64-bit and can't reproduce the abort, so Part 2 reasons about the sizes via
+// estimate_output_frames rather than allocating them.)
+//
+// We can't assert bit-identity against render_mono: each Stretcher instance
+// bumps a static PRNG seed, so two renders get different phase randomization
+// (same convention as streaming_test.cpp). Hence structural comparison.
+
+#include <cmath>
+#include <cstdint>
+#include <cstdlib>
+#include <iostream>
+#include <string>
+#include <vector>
+
+#include "paulstretch/paulstretch.h"
+
+namespace {
+
+int failures = 0;
+
+// Records the outcome of one assertion.
+// Success prints "  ok: <label>" to stdout; failure prints "FAIL: <label>" to
+// stderr and increments the file-level `failures` counter that main() reports.
+void check(bool cond, const std::string &label) {
+	if (!cond) ++failures;
+	std::ostream &sink = cond ? std::cout : std::cerr;
+	sink << (cond ? "  ok: " : "FAIL: ") << label << '\n';
+}
+
+// Test tone: `frames` samples where sample n = 0.5 * sin(2*pi * frequency * n / sample_rate).
+std::vector<float> make_sine(std::size_t frames, float sample_rate, float frequency) {
+	constexpr float kTwoPi = 6.28318530717958647692f;
+	std::vector<float> tone(frames);
+	for (std::size_t n = 0; n != frames; ++n) {
+		tone[n] = 0.5f * std::sin(kTwoPi * frequency * static_cast<float>(n) / sample_rate);
+	}
+	return tone;
+}
+
+struct Stats {  // per-buffer summary filled in by analyze()
+	std::size_t frames{0};   // sample count, including non-finite samples
+	float peak{0.0f};        // largest |x| over the finite samples
+	double rms{0.0};         // sqrt(sum of finite x^2 / frames); stays 0 for an empty buffer
+	bool all_finite{true};   // false once any sample fails std::isfinite
+};
+
+// One pass over `v`: non-finite samples clear all_finite and are left out of peak/energy; RMS divides by the full length.
+Stats analyze(const std::vector<float> &v) {
+	Stats out;
+	out.frames = v.size();
+	double energy = 0.0;
+	for (const float sample : v) {
+		if (!std::isfinite(sample)) { out.all_finite = false; continue; }
+		if (std::fabs(sample) > out.peak) out.peak = std::fabs(sample);
+		energy += static_cast<double>(sample) * static_cast<double>(sample);
+	}
+	if (!v.empty()) out.rms = std::sqrt(energy / static_cast<double>(v.size()));
+	return out;
+}
+
+// Same contract streaming_test uses: equal length, finite, peak within 2×, RMS within 15% (phase
+// randomness differs but averages out). Peak/RMS comparisons are skipped when either side is zero.
+void check_structural(const std::vector<float> &a, const std::vector<float> &b,
+                      const std::string &label) {
+	const auto sa = analyze(a);
+	const auto sb = analyze(b);
+	check(sa.all_finite, label + ": chunked finite");
+	check(sb.all_finite, label + ": reference finite");
+	check(a.size() == b.size(), label + ": same length");
+	if (sa.peak > 0 && sb.peak > 0) {
+		const double ratio = sa.peak / sb.peak;
+		check(ratio > 0.5 && ratio < 2.0, label + ": peaks within 2×");
+	}
+	if (sa.rms > 0 && sb.rms > 0) {
+		const double rel = std::fabs(sa.rms - sb.rms) / std::fmax(sa.rms, sb.rms);  // <cmath>; std::max would need <algorithm>
+		check(rel < 0.15, label + ": RMS within 15%");
+	}
+}
+
+double gib(std::uint64_t bytes) { constexpr double kGiB = 1024.0 * 1024.0 * 1024.0; return static_cast<double>(bytes) / kGiB; }  // bytes -> GiB
+
+// ── Part 1: behavioural parity at a modest, CI-fast size ───────────────────
+void test_parity() {  // one shared renderer produces both the buffered references and the chunked runs
+	std::cout << "\n[parity] chunked vs whole-buffer render\n";
+
+	const paulstretch::RenderOptions options{
+		.stretch = 32.0f,
+		.fft_size = 4096,
+		.sample_rate = 44100.0f,
+		.window = paulstretch::Window::Hann,
+		.onset_detection_sensitivity = 0.0f,
+	};
+
+	paulstretch::OfflineRenderer renderer(options);
+	const int bufsize = static_cast<int>(renderer.options().fft_size);  // NOTE(review): treats fft_size as the chunk length — confirm it equals bufsize()
+
+	const auto left_in = make_sine(44100, options.sample_rate, 220.0f);  // 1 s at 220 Hz
+	const auto right_in = make_sine(44100, options.sample_rate, 277.0f);  // same length, 277 Hz
+
+	// Mono: buffered reference first, then the same input through the chunked sink.
+	const auto mono_ref = renderer.render_mono(left_in);
+	std::vector<float> mono_chunked;
+	mono_chunked.reserve(mono_ref.size());
+	std::size_t mono_chunks = 0;
+	bool mono_uniform = true;
+	renderer.render_mono_chunked(left_in, [&](const float *data, int frames) {
+		mono_chunks++;
+		if (frames != bufsize) mono_uniform = false;
+		mono_chunked.insert(mono_chunked.end(), data, data + frames);  // re-assemble so it can be compared with mono_ref
+	});
+	std::cout << "  mono: " << mono_ref.size() << " frames in " << mono_chunks << " chunks\n";
+	check(!mono_ref.empty(), "render_mono produced output");
+	check(mono_uniform, "every mono chunk is exactly bufsize frames");
+	check(mono_chunks > 100, "many mono chunks (per-chunk memory is bounded)");
+	check_structural(mono_chunked, mono_ref, "mono");
+
+	// Stereo: same comparison per channel (the chunk count is not re-checked here).
+	const auto stereo_ref = renderer.render_stereo(left_in, right_in);
+	std::vector<float> stereo_l, stereo_r;
+	stereo_l.reserve(stereo_ref.left.size());
+	stereo_r.reserve(stereo_ref.right.size());
+	bool stereo_uniform = true;
+	renderer.render_stereo_chunked(left_in, right_in, [&](const float *l, const float *r, int frames) {
+		if (frames != bufsize) stereo_uniform = false;
+		stereo_l.insert(stereo_l.end(), l, l + frames);
+		stereo_r.insert(stereo_r.end(), r, r + frames);
+	});
+	check(!stereo_ref.left.empty(), "render_stereo produced output");
+	check(stereo_uniform, "every stereo chunk is exactly bufsize frames");
+	check_structural(stereo_l, stereo_ref.left, "stereo L");
+	check_structural(stereo_r, stereo_ref.right, "stereo R");
+
+	// Empty input is a no-op (sink never fires).
+	std::size_t empty_calls = 0;
+	renderer.render_mono_chunked({}, [&](const float *, int) { empty_calls++; });
+	check(empty_calls == 0, "chunked render of empty input does not call the sink");
+}
+
+// ── Part 2: the chunked path stays under the wasm32 memory limits ──────────
+void test_memory_bounds() {  // pure size arithmetic on estimate_output_frames(); nothing is rendered here
+	std::cout << "\n[limits] buffered footprint vs wasm32 heap caps\n";
+
+	// wasm32 linear-memory ceilings the WASM build runs against.
+	constexpr std::uint64_t kFloat = sizeof(float);
+	constexpr std::uint64_t kWasmDefaultCap = 2ull * 1024 * 1024 * 1024; // Emscripten default MAXIMUM_MEMORY
+	constexpr std::uint64_t kWasmMaxHeap = 4ull * 1024 * 1024 * 1024;    // wasm32 hard maximum
+
+	// A representative extreme job: a 15-second clip stretched 500× (~2 hours of
+	// output). Sizes come from estimate_output_frames — nothing is allocated.
+	const paulstretch::RenderOptions options{
+		.stretch = 500.0f,
+		.fft_size = 4096,
+		.sample_rate = 44100.0f,
+		.window = paulstretch::Window::Hann,
+		.onset_detection_sensitivity = 0.0f,
+	};
+	paulstretch::OfflineRenderer renderer(options);
+
+	const std::size_t input_frames = static_cast<std::size_t>(15 * 44100);  // 15 s at 44.1 kHz
+	const std::uint64_t out_frames = renderer.estimate_output_frames(input_frames);
+	const std::uint64_t per_channel = out_frames * kFloat;  // bytes of float output for one channel
+
+	// What renderMono / renderStereo actually need live at once:
+	//   mono   — internal vector + the returned JS copy            = 2× a channel
+	//   stereo — both channel vectors, plus one channel's JS copy  = 3× a channel
+	const std::uint64_t mono_peak = per_channel * 2;
+	const std::uint64_t stereo_peak = per_channel * 3;
+
+	// What the chunked path needs live at once: a single bufsize chunk (NOTE(review): only the delivered buffer is counted).
+	const std::uint64_t chunked_working_set =
+		static_cast<std::uint64_t>(renderer.options().fft_size) * kFloat;
+
+	std::cout << "  output ~" << out_frames << " frames (" << gib(per_channel) << " GiB/channel)\n";
+	std::cout << "  buffered peak: mono " << gib(mono_peak) << " GiB, stereo "
+	          << gib(stereo_peak) << " GiB\n";
+	std::cout << "  chunked working set: " << chunked_working_set << " bytes\n";
+
+	// Buffered rendering of this job would have aborted under Emscripten's old
+	// 2 GiB default heap cap — both mono and stereo overflow it.
+	check(mono_peak > kWasmDefaultCap,
+	      "buffered mono render exceeds the 2 GiB default heap cap");
+	check(stereo_peak > kWasmDefaultCap,
+	      "buffered stereo render exceeds the 2 GiB default heap cap");
+	// Raising the cap to the wasm32 maximum (4 GiB) is what lets this particular
+	// job through on the buffered path.
+	check(stereo_peak < kWasmMaxHeap,
+	      "...but fits within the raised 4 GiB cap");
+
+	// The chunked path's footprint is a negligible fraction of the cap, so it
+	// completes regardless of how long the output is.
+	check(chunked_working_set < (1ull << 20),
+	      "chunked working set is under 1 MiB");
+	check(chunked_working_set * 1000 < kWasmDefaultCap,
+	      "chunked working set is <0.1% of the heap cap");
+
+	// A bigger job (30 s × 500×) overflows even the 4 GiB maximum on the *mono*
+	// buffered path — so raising the cap is not enough, and only the chunked
+	// path (which is independent of output length) stays under the limit.
+	paulstretch::OfflineRenderer big({
+		.stretch = 500.0f,
+		.fft_size = 4096,
+		.sample_rate = 48000.0f,
+		.window = paulstretch::Window::Hann,
+		.onset_detection_sensitivity = 0.0f,
+	});
+	const std::uint64_t big_per_channel =
+		static_cast<std::uint64_t>(big.estimate_output_frames(30 * 48000)) * kFloat;  // 30 s at 48 kHz
+	std::cout << "  bigger job buffered mono peak: " << gib(big_per_channel * 2) << " GiB\n";
+	check(big_per_channel * 2 > kWasmMaxHeap,
+	      "a bigger job exceeds even the 4 GiB wasm32 maximum (chunking required)");
+}
+
+} // namespace
+
+// Prints the FFT backend banner, runs both test groups, and exits non-zero if any check failed.
+int main() {
+	std::cout << "backend=" << paulstretch::fft_backend_name();
+	std::cout << " simd=" << paulstretch::fft_simd_arch();
+	std::cout << " width=" << paulstretch::fft_simd_size() << '\n';
+	test_parity();
+	test_memory_bounds();
+
+	if (failures != 0) {
+		std::cerr << "\n" << failures << " check(s) failed\n";
+		return EXIT_FAILURE;
+	}
+	std::cout << "\nOK\n";
+	return EXIT_SUCCESS;
+}