
Commit fc41d39

Restore streaming download+extract pipeline
The parallel range download and the streaming extraction are now pipelined: S3 bytes flow directly through pzstd into the extractor with no intermediate temp file, so download and extraction overlap.

Logging is split into two INFO lines:

- "download complete": time from start until the last S3 byte is consumed (the dominant term; extraction runs concurrently, so this is close to the total wall time)
- "restore pipeline complete": total_duration (the full restore wall time) and extract_tail (the small gap between the last S3 byte and the last file written)

Also fix the Linux extractor to use io.CopyBuffer with a fixed 1 MiB block buffer instead of whole-file ReadFull, matching GNU tar's streaming pattern.
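To make the timing semantics above concrete, here is a minimal, self-contained sketch of the countingBody pattern this commit introduces (see the main.go diff below). The strings.Reader and the io.Copy into io.Discard stand in for the real S3 body and the pzstd/tar pipeline; they are illustration only, not code from this repository.

package main

import (
	"fmt"
	"io"
	"strings"
	"time"
)

// countingBody mirrors the type added in this commit: it counts bytes read and
// records the moment the wrapped reader returns io.EOF.
type countingBody struct {
	r       io.Reader
	n       int64
	dlStart time.Time
	eofAt   time.Time
}

func (c *countingBody) Read(p []byte) (int, error) {
	n, err := c.r.Read(p)
	c.n += int64(n)
	if err == io.EOF && c.eofAt.IsZero() {
		c.eofAt = time.Now()
	}
	return n, err
}

func main() {
	start := time.Now()
	cb := &countingBody{r: strings.NewReader("example payload"), dlStart: start}

	// Stand-in for the pzstd -> tar extraction pipeline consuming the S3 body.
	_, _ = io.Copy(io.Discard, cb)

	// "download complete" reports the span from start to the last byte consumed;
	// "restore pipeline complete" reports the full wall time plus the tail.
	fmt.Println("download complete:", cb.eofAt.Sub(start), "bytes:", cb.n)
	fmt.Println("restore pipeline complete: total", time.Since(start),
		"extract_tail", time.Since(cb.eofAt))
}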
1 parent 8b2c4c5 commit fc41d39

5 files changed

Lines changed: 235 additions & 154 deletions


cmd/gradle-cache/extract_darwin.go

Lines changed: 11 additions & 0 deletions
@@ -13,6 +13,17 @@ import (
 	"github.com/alecthomas/errors"
 )
 
+// extractBufPool is a pool of reusable byte-slice pointers used by extractTarGo
+// on macOS. Reusing slices eliminates per-file heap allocations for the parallel
+// write path. Initial capacity is 256 KiB — large enough for most Gradle cache
+// files without needing a separate allocation.
+var extractBufPool = sync.Pool{
+	New: func() interface{} {
+		b := make([]byte, 0, 256<<10)
+		return &b
+	},
+}
+
 // mmapThreshold is the minimum file size above which ftruncate+mmap+memcpy is
 // faster than write() on macOS APFS. Below this threshold, mmap setup overhead
 // exceeds the savings. 64 KB covers most Gradle .jar files.

cmd/gradle-cache/extract_default.go

Lines changed: 29 additions & 18 deletions
@@ -12,14 +12,28 @@ import (
 	"github.com/alecthomas/errors"
 )
 
-// extractTarPlatform falls back to sequential extraction on unknown platforms.
+// extractTarPlatform falls back to sequential streaming extraction on unknown platforms.
 func extractTarPlatform(r io.Reader, dir string) error {
 	return extractTarSeq(r, dir)
 }
 
 func extractTarSeq(r io.Reader, dir string) error {
+	copyBuf := make([]byte, 1<<20)
 	tr := tar.NewReader(r)
 	cleanDir := filepath.Clean(dir) + string(os.PathSeparator)
+
+	createdDirs := make(map[string]struct{})
+	ensureDir := func(d string) error {
+		if _, ok := createdDirs[d]; ok {
+			return nil
+		}
+		if err := os.MkdirAll(d, 0o750); err != nil {
+			return err
+		}
+		createdDirs[d] = struct{}{}
+		return nil
+	}
+
 	for {
 		hdr, err := tr.Next()
 		if err == io.EOF {
@@ -28,37 +42,34 @@ func extractTarSeq(r io.Reader, dir string) error {
 		if err != nil {
 			return errors.Wrap(err, "read tar entry")
 		}
+
 		target := filepath.Join(dir, hdr.Name)
 		if !strings.HasPrefix(filepath.Clean(target)+string(os.PathSeparator), cleanDir) {
 			return errors.Errorf("tar entry %q escapes destination directory", hdr.Name)
 		}
+
 		switch hdr.Typeflag {
 		case tar.TypeDir:
-			if err := os.MkdirAll(target, hdr.FileInfo().Mode()); err != nil {
+			if err := ensureDir(target); err != nil {
 				return errors.Errorf("mkdir %s: %w", hdr.Name, err)
 			}
 		case tar.TypeReg:
-			bufPtr := extractBufPool.Get().(*[]byte)
-			if int64(cap(*bufPtr)) >= hdr.Size {
-				*bufPtr = (*bufPtr)[:hdr.Size]
-			} else {
-				*bufPtr = make([]byte, hdr.Size)
-			}
-			if _, err := io.ReadFull(tr, *bufPtr); err != nil {
-				extractBufPool.Put(bufPtr)
-				return errors.Errorf("read %s: %w", hdr.Name, err)
-			}
-			if err := os.MkdirAll(filepath.Dir(target), 0o750); err != nil {
-				extractBufPool.Put(bufPtr)
+			if err := ensureDir(filepath.Dir(target)); err != nil {
 				return errors.Errorf("mkdir %s: %w", hdr.Name, err)
 			}
-			if err := os.WriteFile(target, *bufPtr, hdr.FileInfo().Mode()); err != nil {
-				extractBufPool.Put(bufPtr)
+			f, err := os.OpenFile(target, os.O_WRONLY|os.O_CREATE|os.O_TRUNC, hdr.FileInfo().Mode())
+			if err != nil {
+				return errors.Errorf("open %s: %w", hdr.Name, err)
+			}
+			if _, err := io.CopyBuffer(f, io.LimitReader(tr, hdr.Size), copyBuf); err != nil {
+				f.Close() //nolint:errcheck
 				return errors.Errorf("write %s: %w", hdr.Name, err)
 			}
-			extractBufPool.Put(bufPtr)
+			if err := f.Close(); err != nil {
+				return errors.Errorf("close %s: %w", hdr.Name, err)
+			}
 		case tar.TypeSymlink:
-			if err := os.MkdirAll(filepath.Dir(target), 0o750); err != nil {
+			if err := ensureDir(filepath.Dir(target)); err != nil {
 				return errors.Errorf("mkdir for symlink %s: %w", hdr.Name, err)
 			}
 			if err := os.Symlink(hdr.Linkname, target); err != nil {

cmd/gradle-cache/extract_linux.go

Lines changed: 40 additions & 20 deletions
@@ -19,12 +19,33 @@ func extractTarPlatform(r io.Reader, dir string) error {
 	return extractTarSeq(r, dir)
 }
 
-// extractTarSeq extracts a tar stream sequentially using pooled buffers.
-// One goroutine reads and writes, avoiding goroutine-scheduling overhead and
-// VFS writeback fragmentation — the same pattern GNU tar uses on Linux.
+// extractTarSeq extracts a tar stream sequentially using a fixed-size copy
+// buffer. Files are streamed directly from the tar reader to disk one 1 MiB
+// block at a time — the same block-streaming pattern GNU tar uses — so the
+// decompressor pipe keeps flowing without large per-file allocations.
 func extractTarSeq(r io.Reader, dir string) error {
+	// Single fixed-size copy buffer for all file writes in this call.
+	// 1 MiB is large enough to amortise write syscall overhead without
+	// creating memory pressure for many-file archives.
+	copyBuf := make([]byte, 1<<20)
+
 	tr := tar.NewReader(r)
 	cleanDir := filepath.Clean(dir) + string(os.PathSeparator)
+
+	// createdDirs tracks parent directories we have already MkdirAll'd so
+	// each unique path is only created once (same optimisation as darwin).
+	createdDirs := make(map[string]struct{})
+	ensureDir := func(d string) error {
+		if _, ok := createdDirs[d]; ok {
+			return nil
+		}
+		if err := os.MkdirAll(d, 0o750); err != nil {
+			return err
+		}
+		createdDirs[d] = struct{}{}
+		return nil
+	}
+
 	for {
 		hdr, err := tr.Next()
 		if err == io.EOF {
@@ -33,37 +54,36 @@ func extractTarSeq(r io.Reader, dir string) error {
 		if err != nil {
 			return errors.Wrap(err, "read tar entry")
 		}
+
 		target := filepath.Join(dir, hdr.Name)
 		if !strings.HasPrefix(filepath.Clean(target)+string(os.PathSeparator), cleanDir) {
 			return errors.Errorf("tar entry %q escapes destination directory", hdr.Name)
 		}
+
 		switch hdr.Typeflag {
 		case tar.TypeDir:
-			if err := os.MkdirAll(target, hdr.FileInfo().Mode()); err != nil {
+			if err := ensureDir(target); err != nil {
 				return errors.Errorf("mkdir %s: %w", hdr.Name, err)
 			}
+
 		case tar.TypeReg:
-			bufPtr := extractBufPool.Get().(*[]byte)
-			if int64(cap(*bufPtr)) >= hdr.Size {
-				*bufPtr = (*bufPtr)[:hdr.Size]
-			} else {
-				*bufPtr = make([]byte, hdr.Size)
-			}
-			if _, err := io.ReadFull(tr, *bufPtr); err != nil {
-				extractBufPool.Put(bufPtr)
-				return errors.Errorf("read %s: %w", hdr.Name, err)
-			}
-			if err := os.MkdirAll(filepath.Dir(target), 0o750); err != nil {
-				extractBufPool.Put(bufPtr)
+			if err := ensureDir(filepath.Dir(target)); err != nil {
 				return errors.Errorf("mkdir %s: %w", hdr.Name, err)
 			}
-			if err := os.WriteFile(target, *bufPtr, hdr.FileInfo().Mode()); err != nil {
-				extractBufPool.Put(bufPtr)
+			f, err := os.OpenFile(target, os.O_WRONLY|os.O_CREATE|os.O_TRUNC, hdr.FileInfo().Mode())
+			if err != nil {
+				return errors.Errorf("open %s: %w", hdr.Name, err)
+			}
+			if _, err := io.CopyBuffer(f, io.LimitReader(tr, hdr.Size), copyBuf); err != nil {
+				f.Close() //nolint:errcheck
 				return errors.Errorf("write %s: %w", hdr.Name, err)
 			}
-			extractBufPool.Put(bufPtr)
+			if err := f.Close(); err != nil {
+				return errors.Errorf("close %s: %w", hdr.Name, err)
+			}
+
 		case tar.TypeSymlink:
-			if err := os.MkdirAll(filepath.Dir(target), 0o750); err != nil {
+			if err := ensureDir(filepath.Dir(target)); err != nil {
 				return errors.Errorf("mkdir for symlink %s: %w", hdr.Name, err)
 			}
 			if err := os.Symlink(hdr.Linkname, target); err != nil {
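The heart of the Linux change is the per-file write path: the removed code read each entry into a buffer sized to the whole file, while the new code streams it in fixed 1 MiB blocks. Isolated as a standalone helper for illustration (writeEntryStreaming is a hypothetical name, not a function in this repository; it assumes archive/tar, io, and os are imported):

// writeEntryStreaming writes one regular tar entry to target using a shared,
// fixed-size copy buffer, the pattern this commit switches to.
func writeEntryStreaming(tr *tar.Reader, hdr *tar.Header, target string, copyBuf []byte) error {
	f, err := os.OpenFile(target, os.O_WRONLY|os.O_CREATE|os.O_TRUNC, hdr.FileInfo().Mode())
	if err != nil {
		return err
	}
	// CopyBuffer reuses copyBuf for every block, so memory stays flat regardless
	// of hdr.Size, avoiding the whole-file allocations of the removed ReadFull path.
	if _, err := io.CopyBuffer(f, io.LimitReader(tr, hdr.Size), copyBuf); err != nil {
		f.Close() //nolint:errcheck
		return err
	}
	return f.Close()
}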

cmd/gradle-cache/main.go

Lines changed: 50 additions & 39 deletions
@@ -24,7 +24,6 @@ import (
 	"runtime"
 	"strconv"
 	"strings"
-	"sync"
 	"time"
 
 	"github.com/alecthomas/errors"
@@ -110,9 +109,12 @@ func (c *RestoreCmd) Run(ctx context.Context) error {
 	}
 	slog.Info("cache hit", "key", hitKey)
 
-	// ── Download phase ────────────────────────────────────────────────────────
-	// Download to a temp file first so we get a clean download-speed measurement
-	// independent of decompression and file-extraction throughput.
+	// ── Download + extract phase (pipelined) ─────────────────────────────────
+	// The S3 body streams directly into pzstd → extractor with no temp file.
+	// Download and extraction run concurrently: pzstd decompresses as bytes
+	// arrive, and the extractor writes files as blocks are decompressed.
+	// This matches the Ruby aws-sdk-s3 behaviour and keeps total time close to
+	// max(download_time, extract_time) rather than their sum.
 	dlStart := time.Now()
 	slog.Info("downloading bundle", "key", hitKey)
 
@@ -121,36 +123,37 @@ func (c *RestoreCmd) Run(ctx context.Context) error {
 		return errors.Wrap(err, "create temp dir")
 	}
 
-	bundle, err := os.CreateTemp("", "gradle-cache-bundle-*")
+	body, _, err := client.get(ctx, c.Bucket, hitKey)
 	if err != nil {
-		return errors.Wrap(err, "create bundle temp file")
+		return errors.Wrap(err, "get bundle")
 	}
-	defer func() {
-		bundle.Close()           //nolint:errcheck,gosec
-		os.Remove(bundle.Name()) //nolint:errcheck,gosec
-	}()
+	defer body.Close() //nolint:errcheck,gosec
 
-	dlBytes, err := client.download(ctx, c.Bucket, hitKey, bundle)
-	if err != nil {
-		return errors.Wrap(err, "download bundle")
+	// countingBody records bytes consumed and timestamps when the S3 body is
+	// exhausted so we can log download speed independently of extraction.
+	cb := &countingBody{r: body, dlStart: dlStart}
+	if err := extractTarZstd(ctx, cb, tmpDir); err != nil {
+		return errors.Wrap(err, "extract bundle")
 	}
-	dlElapsed := time.Since(dlStart)
-	dlMBps := float64(dlBytes) / dlElapsed.Seconds() / 1e6
-	slog.Info("download complete", "duration", dlElapsed,
-		"size_mb", fmt.Sprintf("%.1f", float64(dlBytes)/1e6),
-		"speed_mbps", fmt.Sprintf("%.1f", dlMBps))
 
-	// ── Extract phase ─────────────────────────────────────────────────────────
-	if _, err := bundle.Seek(0, io.SeekStart); err != nil {
-		return errors.Wrap(err, "rewind bundle")
-	}
-	extractStart := time.Now()
-	if err := extractTarZstd(ctx, bundle, tmpDir); err != nil {
-		return errors.Wrap(err, "extract bundle")
+	totalElapsed := time.Since(dlStart)
+
+	// Log download phase: time from start until the last S3 byte was consumed
+	// by the pzstd pipeline. Because download and extraction run concurrently,
+	// this is normally the dominant term.
+	if !cb.eofAt.IsZero() {
+		dlElapsed := cb.eofAt.Sub(dlStart)
+		slog.Info("download complete", "duration", dlElapsed.Round(time.Millisecond),
+			"size_mb", fmt.Sprintf("%.1f", float64(cb.n)/1e6),
+			"speed_mbps", fmt.Sprintf("%.1f", float64(cb.n)/dlElapsed.Seconds()/1e6))
 	}
-	extractElapsed := time.Since(extractStart)
-	slog.Info("extract complete", "duration", extractElapsed,
-		"speed_mbps", fmt.Sprintf("%.1f", float64(dlBytes)/extractElapsed.Seconds()/1e6))
+
+	// Log total restore time (find + download + extraction, all pipelined).
+	// The "extract tail" is the small gap between the last byte being consumed
+	// and the last file being written; most extraction happened during download.
+	slog.Info("restore pipeline complete",
+		"total_duration", totalElapsed.Round(time.Millisecond),
+		"extract_tail", time.Since(cb.eofAt).Round(time.Millisecond))
 
 	// Symlink $GRADLE_USER_HOME/caches → tmpDir/caches.
 	cachesTarget := filepath.Join(tmpDir, "caches")
@@ -452,6 +455,25 @@ func zstdDecompressCmd(ctx context.Context) *exec.Cmd {
 // pzstd/zstd decompresses in parallel; the resulting tar stream is extracted
 // by extractTarGo (pooled-buffer parallel writer) or piped to system tar as
 // a fallback when building without CGO on platforms where tar is unavailable.
+// countingBody wraps an io.Reader, counts bytes consumed, and records the time
+// at which the underlying reader returns io.EOF (i.e. when the last S3 byte
+// was consumed by the downstream pipeline).
+type countingBody struct {
+	r       io.Reader
+	n       int64
+	dlStart time.Time
+	eofAt   time.Time
+}
+
+func (c *countingBody) Read(p []byte) (int, error) {
+	n, err := c.r.Read(p)
+	c.n += int64(n)
+	if err == io.EOF && c.eofAt.IsZero() {
+		c.eofAt = time.Now()
+	}
+	return n, err
+}
+
 func extractTarZstd(ctx context.Context, r io.Reader, dir string) error {
 	zstdCmd := zstdDecompressCmd(ctx)
 	zstdCmd.Stdin = r
@@ -481,17 +503,6 @@ func extractTarZstd(ctx context.Context, r io.Reader, dir string) error {
 	return errors.Join(errs...)
 }
 
-// extractBufPool is a pool of reusable byte-slice pointers shared by all
-// platform extractors. Reusing slices eliminates per-file heap allocations and
-// the GC pressure they cause. Initial capacity is 256 KiB — large enough for
-// most Gradle cache files without needing a separate allocation.
-var extractBufPool = sync.Pool{
-	New: func() interface{} {
-		b := make([]byte, 0, 256<<10)
-		return &b
-	},
-}
-
 // zstdCompressCmd returns the command for zstd compression.
 // Prefers pzstd (creates parallel frames, decompressable in parallel) and
 // falls back to zstd -TN -c.
