From 79134c219bac3a185f0c4453d46c92d0cb5aea2b Mon Sep 17 00:00:00 2001 From: frrist Date: Tue, 28 Apr 2026 17:08:00 -0700 Subject: [PATCH 1/3] feat: add embedded ms3t S3 listener backed by Forge MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Adds an S3-compatible HTTP listener that runs inside sprue, gated by config.MS3T.Enabled. When enabled, sprue exposes a path-style S3 API on a separate port; PUT/GET/HEAD/DELETE/LIST translate into mutations on a Merkle Search Tree whose blocks ship to piri via sprue's existing piriclient/routing/indexerclient (no UCAN-over-HTTP loopback). In ms3t.forge.no_cache mode (the smelt-deployed shape): - All block reads go through indexer queries + UCAN-authorized ranged retrieves on piri - Writes are synchronous to Forge — three round trips per S3 PUT - Local state is the registry SQLite (bucket → root CID) and a generated space keypair; ms3t is its own UCAN root authority When ms3t.forge.enabled is false, falls back to a local-disk uploader for development without Forge connectivity. See pkg/ms3t/architectural.md for prototype-level design notes, the choice points, and open questions for the team. Wired into the fx graph via internal/fx/ms3t.go; configuration lives under the new ms3t: block in config.example.yaml. 
--- config.example.yaml | 40 ++ go.mod | 42 +- go.sum | 131 +++-- internal/config/config.go | 61 +++ internal/fx/app.go | 1 + internal/fx/ms3t.go | 314 ++++++++++++ pkg/ms3t/architectural.md | 521 +++++++++++++++++++ pkg/ms3t/blockstore/buffered.go | 111 ++++ pkg/ms3t/blockstore/forge.go | 288 +++++++++++ pkg/ms3t/blockstore/sqlite.go | 69 +++ pkg/ms3t/blockstore/walk.go | 63 +++ pkg/ms3t/bucket/bucket.go | 518 +++++++++++++++++++ pkg/ms3t/bucket/cbor_gen.go | 507 +++++++++++++++++++ pkg/ms3t/bucket/chunker.go | 177 +++++++ pkg/ms3t/bucket/manifest.go | 26 + pkg/ms3t/cars/encoder.go | 164 ++++++ pkg/ms3t/gen/main.go | 16 + pkg/ms3t/mst/cbor_gen.go | 433 ++++++++++++++++ pkg/ms3t/mst/diff.go | 192 +++++++ pkg/ms3t/mst/mst.go | 866 ++++++++++++++++++++++++++++++++ pkg/ms3t/mst/mst_util.go | 212 ++++++++ pkg/ms3t/registry/registry.go | 51 ++ pkg/ms3t/registry/sqlite.go | 212 ++++++++ pkg/ms3t/server/handlers.go | 281 +++++++++++ pkg/ms3t/server/server.go | 132 +++++ pkg/ms3t/server/xml.go | 68 +++ pkg/ms3t/uploader/forgeauth.go | 92 ++++ pkg/ms3t/uploader/guppy.go | 149 ++++++ pkg/ms3t/uploader/internal.go | 393 +++++++++++++++ pkg/ms3t/uploader/uploader.go | 285 +++++++++++ 30 files changed, 6353 insertions(+), 62 deletions(-) create mode 100644 internal/fx/ms3t.go create mode 100644 pkg/ms3t/architectural.md create mode 100644 pkg/ms3t/blockstore/buffered.go create mode 100644 pkg/ms3t/blockstore/forge.go create mode 100644 pkg/ms3t/blockstore/sqlite.go create mode 100644 pkg/ms3t/blockstore/walk.go create mode 100644 pkg/ms3t/bucket/bucket.go create mode 100644 pkg/ms3t/bucket/cbor_gen.go create mode 100644 pkg/ms3t/bucket/chunker.go create mode 100644 pkg/ms3t/bucket/manifest.go create mode 100644 pkg/ms3t/cars/encoder.go create mode 100644 pkg/ms3t/gen/main.go create mode 100644 pkg/ms3t/mst/cbor_gen.go create mode 100644 pkg/ms3t/mst/diff.go create mode 100644 pkg/ms3t/mst/mst.go create mode 100644 pkg/ms3t/mst/mst_util.go create mode 100644 
pkg/ms3t/registry/registry.go create mode 100644 pkg/ms3t/registry/sqlite.go create mode 100644 pkg/ms3t/server/handlers.go create mode 100644 pkg/ms3t/server/server.go create mode 100644 pkg/ms3t/server/xml.go create mode 100644 pkg/ms3t/uploader/forgeauth.go create mode 100644 pkg/ms3t/uploader/guppy.go create mode 100644 pkg/ms3t/uploader/internal.go create mode 100644 pkg/ms3t/uploader/uploader.go diff --git a/config.example.yaml b/config.example.yaml index 7d4f3c5..c06c281 100644 --- a/config.example.yaml +++ b/config.example.yaml @@ -108,3 +108,43 @@ storage: log: # Log level: debug, info, warn, error level: "info" + +# ms3t — embedded S3-compatible HTTP server backed by an MST. When +# enabled, sprue exposes an S3 listener alongside its UCAN endpoint; +# objects are stored as content-addressed blocks in a local SQLite +# blockstore and (optionally) shipped to a Forge stack via guppy. +ms3t: + enabled: false + # host:port for the S3 listener. Path-style addressing only; clients + # must set forcePathStyle=true and disable streaming uploads + # (request_checksum_calculation=when_required for AWS CLI v2.23+). + addr: ":9000" + # ms3t persists its SQLite database and (when forge is disabled) + # CAR files under this directory. Created if missing. + data_dir: "./ms3t-data" + # Body chunk size in bytes for new objects. Default 1 MiB. + chunk_size: 1048576 + # Uploader: flush a CAR after this many buffered bytes. Default 64 MiB. + batch_bytes: 67108864 + # Uploader: flush a CAR after this idle duration. Default 5s. + batch_age: "5s" + + forge: + # When true, every batched CAR is shipped to piri through + # sprue's routing, piriclient, and indexerclient — no + # UCAN-over-HTTP loopback. ms3t reuses sprue's identity as the + # signer; no separate principal or delegation file is needed. + # When false, CARs are written to data_dir/cars only. + enabled: false + # Path to ms3t's persisted space keypair. Generated on first + # run if missing. 
ms3t is the root UCAN authority over its own + # space, which lets self-issued retrieval delegations validate + # against piri's auth on index-blob fetches. + # Defaults to data_dir/space.key. + space_key_file: "" + # When true, all block reads (MST nodes, manifests, body + # chunks) go through indexer→piri instead of a local SQLite + # cache, AND writes go synchronously to Forge (Batched is + # bypassed). Closes the read-after-write race; raises per-PUT + # latency to the Forge round trip cost. Requires enabled = true. + no_cache: false diff --git a/go.mod b/go.mod index 6da5e59..b21fef5 100644 --- a/go.mod +++ b/go.mod @@ -14,7 +14,7 @@ require ( github.com/ipfs/go-log/v2 v2.9.0 github.com/ipld/go-ipld-prime v0.21.1-0.20240917223228-6148356a4c2e github.com/jackc/pgx/v5 v5.9.1 - github.com/labstack/echo/v4 v4.14.0 + github.com/labstack/echo/v4 v4.15.0 github.com/multiformats/go-multiaddr v0.16.0 github.com/multiformats/go-multibase v0.2.0 github.com/multiformats/go-multihash v0.2.3 @@ -23,7 +23,8 @@ require ( github.com/spf13/cobra v1.10.2 github.com/spf13/viper v1.21.0 github.com/storacha/go-libstoracha v0.7.5 - github.com/storacha/go-ucanto v0.7.2 + github.com/storacha/go-ucanto v0.8.2 + github.com/storacha/guppy v0.7.0 github.com/stretchr/testify v1.11.1 github.com/testcontainers/testcontainers-go v0.42.0 github.com/testcontainers/testcontainers-go/modules/dynamodb v0.41.0 @@ -31,9 +32,14 @@ require ( github.com/testcontainers/testcontainers-go/modules/postgres v0.42.0 go.uber.org/fx v1.24.0 go.uber.org/zap v1.27.0 + go.uber.org/zap/exp v0.3.0 + modernc.org/sqlite v1.46.1 ) require ( + github.com/cenkalti/backoff/v5 v5.0.3 // indirect + github.com/dustin/go-humanize v1.0.1 // indirect + github.com/ipfs/boxo v0.30.0 // indirect github.com/jackc/pgpassfile v1.0.0 // indirect github.com/jackc/pgservicefile v0.0.0-20240606120523-5a60cdf6a761 // indirect github.com/jackc/puddle/v2 v2.2.2 // indirect @@ -41,11 +47,18 @@ require ( github.com/moby/moby/api v1.54.1 // indirect 
github.com/moby/moby/client v0.4.0 // indirect github.com/moby/sys/atomicwriter v0.1.0 // indirect + github.com/ncruces/go-strftime v1.0.0 // indirect + github.com/remyoudompheng/bigfft v0.0.0-20230129092748-24d4a6f8daec // indirect github.com/sethvargo/go-retry v0.3.0 // indirect + github.com/storacha/indexing-service v1.12.2 // indirect + github.com/stretchr/objx v0.5.3 // indirect go.opentelemetry.io/otel/exporters/otlp/otlptrace/otlptracehttp v1.43.0 // indirect go.opentelemetry.io/otel/sdk v1.43.0 // indirect go.opentelemetry.io/otel/sdk/metric v1.43.0 // indirect golang.org/x/sync v0.20.0 // indirect + modernc.org/libc v1.68.0 // indirect + modernc.org/mathutil v1.7.1 // indirect + modernc.org/memory v1.11.0 // indirect ) require ( @@ -86,41 +99,41 @@ require ( github.com/ebitengine/purego v0.10.0 // indirect github.com/felixge/httpsnoop v1.0.4 // indirect github.com/filecoin-project/go-data-segment v0.0.1 // indirect - github.com/filecoin-project/go-fil-commcid v0.2.0 // indirect + github.com/filecoin-project/go-fil-commcid v0.3.1 // indirect github.com/filecoin-project/go-fil-commp-hashhash v0.2.0 // indirect github.com/fsnotify/fsnotify v1.9.0 // indirect github.com/go-logr/logr v1.4.3 // indirect github.com/go-logr/stdr v1.2.2 // indirect - github.com/go-ole/go-ole v1.2.6 // indirect + github.com/go-ole/go-ole v1.3.0 // indirect github.com/go-viper/mapstructure/v2 v2.4.0 // indirect github.com/gobwas/glob v0.2.3 // indirect github.com/gogo/protobuf v1.3.2 // indirect github.com/hashicorp/golang-lru v1.0.2 // indirect github.com/inconshreveable/mousetrap v1.1.0 // indirect github.com/ipfs/bbloom v0.0.4 // indirect - github.com/ipfs/go-block-format v0.2.0 // indirect + github.com/ipfs/go-block-format v0.2.1 github.com/ipfs/go-blockservice v0.5.2 // indirect github.com/ipfs/go-datastore v0.9.0 // indirect github.com/ipfs/go-ipfs-blockstore v1.3.1 // indirect github.com/ipfs/go-ipfs-ds-help v1.1.1 // indirect github.com/ipfs/go-ipfs-exchange-interface v0.2.1 
// indirect github.com/ipfs/go-ipfs-util v0.0.3 // indirect - github.com/ipfs/go-ipld-cbor v0.1.0 // indirect - github.com/ipfs/go-ipld-format v0.6.0 // indirect + github.com/ipfs/go-ipld-cbor v0.2.0 + github.com/ipfs/go-ipld-format v0.6.1 // indirect github.com/ipfs/go-ipld-legacy v0.2.1 // indirect github.com/ipfs/go-log v1.0.5 // indirect github.com/ipfs/go-merkledag v0.11.0 // indirect - github.com/ipfs/go-metrics-interface v0.0.1 // indirect + github.com/ipfs/go-metrics-interface v0.3.0 // indirect github.com/ipfs/go-verifcid v0.0.3 // indirect github.com/ipld/go-car v0.6.2 // indirect - github.com/ipld/go-codec-dagpb v1.6.0 // indirect + github.com/ipld/go-codec-dagpb v1.7.0 // indirect github.com/ipni/go-libipni v0.6.18 // indirect github.com/klauspost/compress v1.18.5 // indirect github.com/klauspost/cpuid/v2 v2.2.10 // indirect github.com/labstack/gommon v0.4.2 // indirect github.com/libp2p/go-buffer-pool v0.1.0 // indirect github.com/libp2p/go-libp2p v0.41.1 // indirect - github.com/lufia/plan9stats v0.0.0-20211012122336-39d0f177ccd0 // indirect + github.com/lufia/plan9stats v0.0.0-20240513124658-fba389f38bae // indirect github.com/magiconair/properties v1.8.10 // indirect github.com/mattn/go-colorable v0.1.14 // indirect github.com/mattn/go-isatty v0.0.20 // indirect @@ -136,14 +149,13 @@ require ( github.com/mr-tron/base58 v1.2.0 // indirect github.com/multiformats/go-base32 v0.1.0 // indirect github.com/multiformats/go-base36 v0.2.0 // indirect - github.com/multiformats/go-multicodec v0.9.1 // indirect + github.com/multiformats/go-multicodec v0.9.2 github.com/multiformats/go-varint v0.1.0 // indirect github.com/opencontainers/go-digest v1.0.0 // indirect github.com/opencontainers/image-spec v1.1.1 // indirect github.com/opentracing/opentracing-go v1.2.0 // indirect github.com/pelletier/go-toml/v2 v2.2.4 // indirect github.com/pion/datachannel v1.6.0 // indirect - github.com/pion/logging v0.2.4 // indirect github.com/pion/sctp v1.9.2 // indirect 
github.com/pion/webrtc/v4 v4.2.9 // indirect github.com/pkg/errors v0.9.1 // indirect @@ -164,7 +176,7 @@ require ( github.com/ucan-wg/go-ucan v0.0.0-20240916120445-37f52863156c // indirect github.com/valyala/bytebufferpool v1.0.0 // indirect github.com/valyala/fasttemplate v1.2.2 // indirect - github.com/whyrusleeping/cbor-gen v0.3.1 // indirect + github.com/whyrusleeping/cbor-gen v0.3.1 github.com/yusufpapurcu/wmi v1.2.4 // indirect go.opentelemetry.io/auto/sdk v1.2.1 // indirect go.opentelemetry.io/contrib/instrumentation/net/http/otelhttp v0.67.0 // indirect @@ -176,12 +188,12 @@ require ( go.uber.org/multierr v1.11.0 // indirect go.yaml.in/yaml/v3 v3.0.4 // indirect golang.org/x/crypto v0.49.0 // indirect - golang.org/x/exp v0.0.0-20260218203240-3dfff04db8fa // indirect + golang.org/x/exp v0.0.0-20260312153236-7ab1446f8b90 // indirect golang.org/x/net v0.52.0 // indirect golang.org/x/sys v0.42.0 // indirect golang.org/x/text v0.35.0 // indirect golang.org/x/time v0.14.0 // indirect - golang.org/x/xerrors v0.0.0-20240903120638-7835f813f4da // indirect + golang.org/x/xerrors v0.0.0-20240903120638-7835f813f4da google.golang.org/genproto/googleapis/api v0.0.0-20260401024825-9d38bb4040a9 // indirect google.golang.org/genproto/googleapis/rpc v0.0.0-20260401024825-9d38bb4040a9 // indirect google.golang.org/protobuf v1.36.11 // indirect diff --git a/go.sum b/go.sum index 7dc4c21..8d1c36f 100644 --- a/go.sum +++ b/go.sum @@ -51,6 +51,8 @@ github.com/antihax/optional v1.0.0/go.mod h1:uupD/76wgC+ih3iEmQUL+0Ugr19nfwCT1kd github.com/armon/circbuf v0.0.0-20150827004946-bbbad097214e/go.mod h1:3U/XgcO3hCbHZ8TKRvWD2dDTCfh9M9ya+I9JpbB7O8o= github.com/armon/go-metrics v0.0.0-20180917152333-f0300d1749da/go.mod h1:Q73ZrmVTwzkszR9V5SSuryQ31EELlFMUz1kKyl939pY= github.com/armon/go-radix v0.0.0-20180808171621-7fddfc383310/go.mod h1:ufUuZ+zHj4x4TnLV4JWEpy2hxWSpsRywHrMgIH9cCH8= +github.com/asaskevich/EventBus v0.0.0-20200907212545-49d423059eef 
h1:2JGTg6JapxP9/R33ZaagQtAM4EkkSYnIAlOG5EI8gkM= +github.com/asaskevich/EventBus v0.0.0-20200907212545-49d423059eef/go.mod h1:JS7hed4L1fj0hXcyEejnW57/7LCetXggd+vwrRnYeII= github.com/aws/aws-sdk-go-v2 v1.41.3 h1:4kQ/fa22KjDt13QCy1+bYADvdgcxpfH18f0zP542kZA= github.com/aws/aws-sdk-go-v2 v1.41.3/go.mod h1:mwsPRE8ceUUpiTgF7QmQIJ7lgsKUPQOUl3o72QBrE1o= github.com/aws/aws-sdk-go-v2/aws/protocol/eventstream v1.7.6 h1:N4lRUXZpZ1KVEUn6hxtco/1d2lgYhNn1fHkkl8WhlyQ= @@ -169,10 +171,12 @@ github.com/envoyproxy/protoc-gen-validate v0.1.0/go.mod h1:iSmxcyjqTsJpI2R4NaDN7 github.com/fatih/color v1.7.0/go.mod h1:Zm6kSWBoL9eyXnKyktHP6abPY2pDugNf5KwzbycvMj4= github.com/felixge/httpsnoop v1.0.4 h1:NFTV2Zj1bL4mc9sqWACXbQFVBBg2W3GPvqp8/ESS2Wg= github.com/felixge/httpsnoop v1.0.4/go.mod h1:m8KPJKqk1gH5J9DgRY2ASl2lWCfGKXixSwevea8zH2U= +github.com/filecoin-project/go-clock v0.1.0 h1:SFbYIM75M8NnFm1yMHhN9Ahy3W5bEZV9gd6MPfXbKVU= +github.com/filecoin-project/go-clock v0.1.0/go.mod h1:4uB/O4PvOjlx1VCMdZ9MyDZXRm//gkj1ELEbxfI1AZs= github.com/filecoin-project/go-data-segment v0.0.1 h1:1wmDxOG4ubWQm3ZC1XI5nCon5qgSq7Ra3Rb6Dbu10Gs= github.com/filecoin-project/go-data-segment v0.0.1/go.mod h1:H0/NKbsRxmRFBcLibmABv+yFNHdmtl5AyplYLnb0Zv4= -github.com/filecoin-project/go-fil-commcid v0.2.0 h1:B+5UX8XGgdg/XsdUpST4pEBviKkFOw+Fvl2bLhSKGpI= -github.com/filecoin-project/go-fil-commcid v0.2.0/go.mod h1:8yigf3JDIil+/WpqR5zoKyP0jBPCOGtEqq/K1CcMy9Q= +github.com/filecoin-project/go-fil-commcid v0.3.1 h1:4EfxpHSlvtkOqa9weG2Yt5kxFmPib2xU7Uc9Lbqk7fs= +github.com/filecoin-project/go-fil-commcid v0.3.1/go.mod h1:z7Ssf8d7kspF9QRAVHDbZ+43JK4mkhbGH5lyph1TnKY= github.com/filecoin-project/go-fil-commp-hashhash v0.2.0 h1:HYIUugzjq78YvV3vC6rL95+SfC/aSTVSnZSZiDV5pCk= github.com/filecoin-project/go-fil-commp-hashhash v0.2.0/go.mod h1:VH3fAFOru4yyWar4626IoS5+VGE8SfZiBODJLUigEo4= github.com/flynn/noise v1.1.0 h1:KjPQoQCEFdZDiP03phOvGi11+SVVhBG2wOWAorLsstg= @@ -193,8 +197,9 @@ github.com/go-logr/logr v1.4.3 
h1:CjnDlHq8ikf6E492q6eKboGOC0T8CDaOvkHCIg8idEI= github.com/go-logr/logr v1.4.3/go.mod h1:9T104GzyrTigFIr8wt5mBrctHMim0Nb2HLGrmQ40KvY= github.com/go-logr/stdr v1.2.2 h1:hSWxHoqTgW2S2qGc0LTAI563KZ5YKYRhT3MFKZMbjag= github.com/go-logr/stdr v1.2.2/go.mod h1:mMo/vtBO5dYbehREoey6XUKy/eSumjCCveDpRre4VKE= -github.com/go-ole/go-ole v1.2.6 h1:/Fpf6oFPoeFik9ty7siob0G6Ke8QvQEuVcuChpwXzpY= github.com/go-ole/go-ole v1.2.6/go.mod h1:pprOEPIfldk/42T2oK7lQ4v4JSDwmV0As9GaiUsvbm0= +github.com/go-ole/go-ole v1.3.0 h1:Dt6ye7+vXGIKZ7Xtk4s6/xVdGDQynvom7xCFEdWr6uE= +github.com/go-ole/go-ole v1.3.0/go.mod h1:5LS6F96DhAwUc7C+1HLexzMXY1xGRSryjyPPKW6zv78= github.com/go-task/slim-sprig/v3 v3.0.0 h1:sUs3vkvUymDpBKi3qH1YSqBQk9+9D/8M2mN1vB6EwHI= github.com/go-task/slim-sprig/v3 v3.0.0/go.mod h1:W848ghGpv3Qj3dhTPRyJypKRiqCdHZiAzKg9hl15HA8= github.com/go-viper/mapstructure/v2 v2.4.0 h1:EBsztssimR/CONLSZZ04E8qAkxNYq4Qp9LvH92wZUgs= @@ -248,6 +253,7 @@ github.com/google/go-cmp v0.5.3/go.mod h1:v8dTdLbMG2kIc/vJvl+f65V22dbkXbowE6jgT/ github.com/google/go-cmp v0.5.4/go.mod h1:v8dTdLbMG2kIc/vJvl+f65V22dbkXbowE6jgT/gNBxE= github.com/google/go-cmp v0.5.5/go.mod h1:v8dTdLbMG2kIc/vJvl+f65V22dbkXbowE6jgT/gNBxE= github.com/google/go-cmp v0.5.6/go.mod h1:v8dTdLbMG2kIc/vJvl+f65V22dbkXbowE6jgT/gNBxE= +github.com/google/go-cmp v0.6.0/go.mod h1:17dUlkBOakJ0+DkrSSNjCkIjxS6bF9zb3elmeNGIjoY= github.com/google/go-cmp v0.7.0 h1:wk8382ETsv4JYUZwIsn6YpYiWiBsYLSJiTsyBybVuN8= github.com/google/go-cmp v0.7.0/go.mod h1:pXiqmnSA92OHEEa9HXL2W4E7lf9JzCmGVUdgjX3N/iU= github.com/google/gofuzz v1.0.0/go.mod h1:dBl0BpW6vV/+mYPU4Po3pmUjxk6FQPldtuIdl/M65Eg= @@ -267,8 +273,8 @@ github.com/google/pprof v0.0.0-20201023163331-3e6fc7fc9c4c/go.mod h1:kpwsk12EmLe github.com/google/pprof v0.0.0-20201203190320-1bf35d6f28c2/go.mod h1:kpwsk12EmLew5upagYY7GY0pfYCcupk39gWOCRROcvE= github.com/google/pprof v0.0.0-20210122040257-d980be63207e/go.mod h1:kpwsk12EmLew5upagYY7GY0pfYCcupk39gWOCRROcvE= github.com/google/pprof 
v0.0.0-20210226084205-cbba55b83ad5/go.mod h1:kpwsk12EmLew5upagYY7GY0pfYCcupk39gWOCRROcvE= -github.com/google/pprof v0.0.0-20250208200701-d0013a598941 h1:43XjGa6toxLpeksjcxs1jIoIyr+vUfOqY2c6HB4bpoc= -github.com/google/pprof v0.0.0-20250208200701-d0013a598941/go.mod h1:vavhavw2zAxS5dIdcRluK6cSGGPlZynqzFM8NdvU144= +github.com/google/pprof v0.0.0-20250403155104-27863c87afa6 h1:BHT72Gu3keYf3ZEu2J0b1vyeLSOYI8bm5wbJM/8yDe8= +github.com/google/pprof v0.0.0-20250403155104-27863c87afa6/go.mod h1:boTsfXsheKC2y+lKOCMpSfarhxDeIzfZG1jqGcPl3cA= github.com/google/renameio v0.1.0/go.mod h1:KWCgfxg9yswjAJkECMjeO8J8rahYeXnNhOm40UhjYkI= github.com/google/uuid v1.1.2/go.mod h1:TIyPZe4MgqvfeYDBFedMoGGpEw/LqOeaOT+nhxU+yHo= github.com/google/uuid v1.6.0 h1:NIvaJDMOsjHA8n1jAhLSgzrAzy1Hgr+hNrb57e+94F0= @@ -299,6 +305,8 @@ github.com/hashicorp/golang-lru v0.5.0/go.mod h1:/m3WP610KZHVQ1SGc6re/UDhFvYD7pJ github.com/hashicorp/golang-lru v0.5.1/go.mod h1:/m3WP610KZHVQ1SGc6re/UDhFvYD7pJ4Ao+sR/qLZy8= github.com/hashicorp/golang-lru v1.0.2 h1:dV3g9Z/unq5DpblPpw+Oqcv4dU/1omnb4Ok8iPY6p1c= github.com/hashicorp/golang-lru v1.0.2/go.mod h1:iADmTwqILo4mZ8BN3D2Q6+9jd8WM5uGBxy+E8yxSoD4= +github.com/hashicorp/golang-lru/v2 v2.0.7 h1:a+bsQ5rvGLjzHuww6tVxozPZFVghXaHOwFs4luLUK2k= +github.com/hashicorp/golang-lru/v2 v2.0.7/go.mod h1:QeFd9opnmA6QUJc5vARoKUSoFhyfM2/ZepoAG6RGpeM= github.com/hashicorp/hcl v1.0.0/go.mod h1:E5yfLk+7swimpb2L/Alb/PJmXilQ/rhwaUYs4T20WEQ= github.com/hashicorp/logutils v1.0.0/go.mod h1:QIAnNjmIWmVIIkWDTG1z5v++HQmx9WQRO+LraFDTW64= github.com/hashicorp/mdns v1.0.0/go.mod h1:tL+uN++7HEJ6SQLQ2/p+z2pH24WQKWjBPkE0mNTz8vQ= @@ -313,10 +321,12 @@ github.com/inconshreveable/mousetrap v1.1.0 h1:wN+x4NVGpMsO7ErUn/mUI3vEoE6Jt13X2 github.com/inconshreveable/mousetrap v1.1.0/go.mod h1:vpF70FUmC8bwa3OWnCshd2FqLfsEA9PFc4w1p2J65bw= github.com/ipfs/bbloom v0.0.4 h1:Gi+8EGJ2y5qiD5FbsbpX/TMNcJw8gSqr7eyjHa4Fhvs= github.com/ipfs/bbloom v0.0.4/go.mod h1:cS9YprKXpoZ9lT0n/Mw/a6/aFV6DTjTLYHeA+gyqMG0= 
+github.com/ipfs/boxo v0.30.0 h1:7afsoxPGGqfoH7Dum/wOTGUB9M5fb8HyKPMlLfBvIEQ= +github.com/ipfs/boxo v0.30.0/go.mod h1:BPqgGGyHB9rZZcPSzah2Dc9C+5Or3U1aQe7EH1H7370= github.com/ipfs/go-bitswap v0.11.0 h1:j1WVvhDX1yhG32NTC9xfxnqycqYIlhzEzLXG/cU1HyQ= github.com/ipfs/go-bitswap v0.11.0/go.mod h1:05aE8H3XOU+LXpTedeAS0OZpcO1WFsj5niYQH9a1Tmk= -github.com/ipfs/go-block-format v0.2.0 h1:ZqrkxBA2ICbDRbK8KJs/u0O3dlp6gmAuuXUJNiW1Ycs= -github.com/ipfs/go-block-format v0.2.0/go.mod h1:+jpL11nFx5A/SPpsoBn6Bzkra/zaArfSmsknbPMYgzM= +github.com/ipfs/go-block-format v0.2.1 h1:96kW71XGNNa+mZw/MTzJrCpMhBWCrd9kBLoKm9Iip/Q= +github.com/ipfs/go-block-format v0.2.1/go.mod h1:frtvXHMQhM6zn7HvEQu+Qz5wSTj+04oEH/I+NjDgEjk= github.com/ipfs/go-blockservice v0.5.2 h1:in9Bc+QcXwd1apOVM7Un9t8tixPKdaHQFdLSUM1Xgk8= github.com/ipfs/go-blockservice v0.5.2/go.mod h1:VpMblFEqG67A/H2sHKAemeH9vlURVavlysbdUI632yk= github.com/ipfs/go-cid v0.6.0 h1:DlOReBV1xhHBhhfy/gBNNTSyfOM6rLiIx9J7A4DGf30= @@ -337,16 +347,16 @@ github.com/ipfs/go-ipfs-exchange-interface v0.2.1 h1:jMzo2VhLKSHbVe+mHNzYgs95n0+ github.com/ipfs/go-ipfs-exchange-interface v0.2.1/go.mod h1:MUsYn6rKbG6CTtsDp+lKJPmVt3ZrCViNyH3rfPGsZ2E= github.com/ipfs/go-ipfs-exchange-offline v0.3.0 h1:c/Dg8GDPzixGd0MC8Jh6mjOwU57uYokgWRFidfvEkuA= github.com/ipfs/go-ipfs-exchange-offline v0.3.0/go.mod h1:MOdJ9DChbb5u37M1IcbrRB02e++Z7521fMxqCNRrz9s= -github.com/ipfs/go-ipfs-pq v0.0.2 h1:e1vOOW6MuOwG2lqxcLA+wEn93i/9laCY8sXAw76jFOY= -github.com/ipfs/go-ipfs-pq v0.0.2/go.mod h1:LWIqQpqfRG3fNc5XsnIhz/wQ2XXGyugQwls7BgUmUfY= +github.com/ipfs/go-ipfs-pq v0.0.3 h1:YpoHVJB+jzK15mr/xsWC574tyDLkezVrDNeaalQBsTE= +github.com/ipfs/go-ipfs-pq v0.0.3/go.mod h1:btNw5hsHBpRcSSgZtiNm/SLj5gYIZ18AKtv3kERkRb4= github.com/ipfs/go-ipfs-routing v0.3.0 h1:9W/W3N+g+y4ZDeffSgqhgo7BsBSJwPMcyssET9OWevc= github.com/ipfs/go-ipfs-routing v0.3.0/go.mod h1:dKqtTFIql7e1zYsEuWLyuOU+E0WJWW8JjbTPLParDWo= github.com/ipfs/go-ipfs-util v0.0.3 h1:2RFdGez6bu2ZlZdI+rWfIdbQb1KudQp3VGwPtdNCmE0= 
github.com/ipfs/go-ipfs-util v0.0.3/go.mod h1:LHzG1a0Ig4G+iZ26UUOMjHd+lfM84LZCrn17xAKWBvs= -github.com/ipfs/go-ipld-cbor v0.1.0 h1:dx0nS0kILVivGhfWuB6dUpMa/LAwElHPw1yOGYopoYs= -github.com/ipfs/go-ipld-cbor v0.1.0/go.mod h1:U2aYlmVrJr2wsUBU67K4KgepApSZddGRDWBYR0H4sCk= -github.com/ipfs/go-ipld-format v0.6.0 h1:VEJlA2kQ3LqFSIm5Vu6eIlSxD/Ze90xtc4Meten1F5U= -github.com/ipfs/go-ipld-format v0.6.0/go.mod h1:g4QVMTn3marU3qXchwjpKPKgJv+zF+OlaKMyhJ4LHPg= +github.com/ipfs/go-ipld-cbor v0.2.0 h1:VHIW3HVIjcMd8m4ZLZbrYpwjzqlVUfjLM7oK4T5/YF0= +github.com/ipfs/go-ipld-cbor v0.2.0/go.mod h1:Cp8T7w1NKcu4AQJLqK0tWpd1nkgTxEVB5C6kVpLW6/0= +github.com/ipfs/go-ipld-format v0.6.1 h1:lQLmBM/HHbrXvjIkrydRXkn+gc0DE5xO5fqelsCKYOQ= +github.com/ipfs/go-ipld-format v0.6.1/go.mod h1:8TOH1Hj+LFyqM2PjSqI2/ZnyO0KlfhHbJLkbxFa61hs= github.com/ipfs/go-ipld-legacy v0.2.1 h1:mDFtrBpmU7b//LzLSypVrXsD8QxkEWxu5qVxN99/+tk= github.com/ipfs/go-ipld-legacy v0.2.1/go.mod h1:782MOUghNzMO2DER0FlBR94mllfdCJCkTtDtPM51otM= github.com/ipfs/go-log v1.0.5 h1:2dOuUCB1Z7uoczMWgAyDck5JLb72zHzrMnGnCNNbvY8= @@ -356,18 +366,18 @@ github.com/ipfs/go-log/v2 v2.9.0 h1:l4b06AwVXwldIzbVPZy5z7sKp9lHFTX0KWfTBCtHaOk= github.com/ipfs/go-log/v2 v2.9.0/go.mod h1:UhIYAwMV7Nb4ZmihUxfIRM2Istw/y9cAk3xaK+4Zs2c= github.com/ipfs/go-merkledag v0.11.0 h1:DgzwK5hprESOzS4O1t/wi6JDpyVQdvm9Bs59N/jqfBY= github.com/ipfs/go-merkledag v0.11.0/go.mod h1:Q4f/1ezvBiJV0YCIXvt51W/9/kqJGH4I1LsA7+djsM4= -github.com/ipfs/go-metrics-interface v0.0.1 h1:j+cpbjYvu4R8zbleSs36gvB7jR+wsL2fGD6n0jO4kdg= -github.com/ipfs/go-metrics-interface v0.0.1/go.mod h1:6s6euYU4zowdslK0GKHmqaIZ3j/b/tL7HTWtJ4VPgWY= -github.com/ipfs/go-peertaskqueue v0.8.0 h1:JyNO144tfu9bx6Hpo119zvbEL9iQ760FHOiJYsUjqaU= -github.com/ipfs/go-peertaskqueue v0.8.0/go.mod h1:cz8hEnnARq4Du5TGqiWKgMr/BOSQ5XOgMOh1K5YYKKM= +github.com/ipfs/go-metrics-interface v0.3.0 h1:YwG7/Cy4R94mYDUuwsBfeziJCVm9pBMJ6q/JR9V40TU= +github.com/ipfs/go-metrics-interface v0.3.0/go.mod 
h1:OxxQjZDGocXVdyTPocns6cOLwHieqej/jos7H4POwoY= +github.com/ipfs/go-peertaskqueue v0.8.2 h1:PaHFRaVFdxQk1Qo3OKiHPYjmmusQy7gKQUaL8JDszAU= +github.com/ipfs/go-peertaskqueue v0.8.2/go.mod h1:L6QPvou0346c2qPJNiJa6BvOibxDfaiPlqHInmzg0FA= github.com/ipfs/go-test v0.2.1 h1:/D/a8xZ2JzkYqcVcV/7HYlCnc7bv/pKHQiX5TdClkPE= github.com/ipfs/go-test v0.2.1/go.mod h1:dzu+KB9cmWjuJnXFDYJwC25T3j1GcN57byN+ixmK39M= github.com/ipfs/go-verifcid v0.0.3 h1:gmRKccqhWDocCRkC+a59g5QW7uJw5bpX9HWBevXa0zs= github.com/ipfs/go-verifcid v0.0.3/go.mod h1:gcCtGniVzelKrbk9ooUSX/pM3xlH73fZZJDzQJRvOUw= github.com/ipld/go-car v0.6.2 h1:Hlnl3Awgnq8icK+ze3iRghk805lu8YNq3wlREDTF2qc= github.com/ipld/go-car v0.6.2/go.mod h1:oEGXdwp6bmxJCZ+rARSkDliTeYnVzv3++eXajZ+Bmr8= -github.com/ipld/go-codec-dagpb v1.6.0 h1:9nYazfyu9B1p3NAgfVdpRco3Fs2nFC72DqVsMj6rOcc= -github.com/ipld/go-codec-dagpb v1.6.0/go.mod h1:ANzFhfP2uMJxRBr8CE+WQWs5UsNa0pYtmKZ+agnUw9s= +github.com/ipld/go-codec-dagpb v1.7.0 h1:hpuvQjCSVSLnTnHXn+QAMR0mLmb1gA6wl10LExo2Ts0= +github.com/ipld/go-codec-dagpb v1.7.0/go.mod h1:rD3Zg+zub9ZnxcLwfol/OTQRVjaLzXypgy4UqHQvilM= github.com/ipld/go-ipld-prime v0.21.1-0.20240917223228-6148356a4c2e h1:0Anxx6pMS8U/qjTLVxPhpTYuuDMssHDtUEvzIz2Skw4= github.com/ipld/go-ipld-prime v0.21.1-0.20240917223228-6148356a4c2e/go.mod h1:LN+1Tx6867lbDCmf8bErp1TNw3Kh9eY2n0eJ+whRx38= github.com/ipni/go-libipni v0.6.18 h1:x8X6y0QoMmSKtwRlczWdWEYedoLUGCEek2TttfDKPk4= @@ -409,28 +419,28 @@ github.com/kr/pty v1.1.1/go.mod h1:pFQYn66WHrOpPYNljwOMqo10TkYh1fy3cYio2l3bCsQ= github.com/kr/text v0.1.0/go.mod h1:4Jbv+DJW3UT/LiOwJeYQe1efqtUx/iVham/4vfdArNI= github.com/kr/text v0.2.0 h1:5Nx0Ya0ZqY2ygV366QzturHI13Jq95ApcVaJBhpS+AY= github.com/kr/text v0.2.0/go.mod h1:eLer722TekiGuMkidMxC/pM04lWEeraHUUmBw8l2grE= -github.com/labstack/echo/v4 v4.14.0 h1:+tiMrDLxwv6u0oKtD03mv+V1vXXB3wCqPHJqPuIe+7M= -github.com/labstack/echo/v4 v4.14.0/go.mod h1:xmw1clThob0BSVRX1CRQkGQ/vjwcpOMjQZSZa9fKA/c= +github.com/labstack/echo/v4 v4.15.0 
h1:hoRTKWcnR5STXZFe9BmYun9AMTNeSbjHi2vtDuADJ24= +github.com/labstack/echo/v4 v4.15.0/go.mod h1:xmw1clThob0BSVRX1CRQkGQ/vjwcpOMjQZSZa9fKA/c= github.com/labstack/gommon v0.4.2 h1:F8qTUNXgG1+6WQmqoUWnz8WiEU60mXVVw0P4ht1WRA0= github.com/labstack/gommon v0.4.2/go.mod h1:QlUFxVM+SNXhDL/Z7YhocGIBYOiwB0mXm1+1bAPHPyU= -github.com/lib/pq v1.10.9 h1:YXG7RB+JIjhP29X+OtkiDnYaXQwpS4JEWq7dtCCRUEw= -github.com/lib/pq v1.10.9/go.mod h1:AlVN5x4E4T544tWzH6hKfbfQvm3HdbOxrmggDNAPY9o= +github.com/lib/pq v1.11.1 h1:wuChtj2hfsGmmx3nf1m7xC2XpK6OtelS2shMY+bGMtI= +github.com/lib/pq v1.11.1/go.mod h1:/p+8NSbOcwzAEI7wiMXFlgydTwcgTr3OSKMsD2BitpA= github.com/libp2p/go-buffer-pool v0.1.0 h1:oK4mSFcQz7cTQIfqbe4MIj9gLW+mnanjyFtc6cdF0Y8= github.com/libp2p/go-buffer-pool v0.1.0/go.mod h1:N+vh8gMqimBzdKkSMVuydVDq+UV5QTWy5HSiZacSbPg= github.com/libp2p/go-libp2p v0.41.1 h1:8ecNQVT5ev/jqALTvisSJeVNvXYJyK4NhQx1nNRXQZE= github.com/libp2p/go-libp2p v0.41.1/go.mod h1:DcGTovJzQl/I7HMrby5ZRjeD0kQkGiy+9w6aEkSZpRI= github.com/libp2p/go-libp2p-asn-util v0.4.1 h1:xqL7++IKD9TBFMgnLPZR6/6iYhawHKHl950SO9L6n94= github.com/libp2p/go-libp2p-asn-util v0.4.1/go.mod h1:d/NI6XZ9qxw67b4e+NgpQexCIiFYJjErASrYW4PFDN8= -github.com/libp2p/go-libp2p-record v0.2.0 h1:oiNUOCWno2BFuxt3my4i1frNrt7PerzB3queqa1NkQ0= -github.com/libp2p/go-libp2p-record v0.2.0/go.mod h1:I+3zMkvvg5m2OcSdoL0KPljyJyvNDFGKX7QdlpYUcwk= +github.com/libp2p/go-libp2p-record v0.3.1 h1:cly48Xi5GjNw5Wq+7gmjfBiG9HCzQVkiZOUZ8kUl+Fg= +github.com/libp2p/go-libp2p-record v0.3.1/go.mod h1:T8itUkLcWQLCYMqtX7Th6r7SexyUJpIyPgks757td/E= github.com/libp2p/go-libp2p-testing v0.12.0 h1:EPvBb4kKMWO29qP4mZGyhVzUyR25dvfUIK5WDu6iPUA= github.com/libp2p/go-libp2p-testing v0.12.0/go.mod h1:KcGDRXyN7sQCllucn1cOOS+Dmm7ujhfEyXQL5lvkcPg= github.com/libp2p/go-msgio v0.3.0 h1:mf3Z8B1xcFN314sWX+2vOTShIE0Mmn2TXn3YCUQGNj0= github.com/libp2p/go-msgio v0.3.0/go.mod h1:nyRM819GmVaF9LX3l03RMh10QdOroF++NBbxAb0mmDM= github.com/libp2p/go-netroute v0.2.2 h1:Dejd8cQ47Qx2kRABg6lPwknU7+nBnFRpko45/fFPuZ8= 
github.com/libp2p/go-netroute v0.2.2/go.mod h1:Rntq6jUAH0l9Gg17w5bFGhcC9a+vk4KNXs6s7IljKYE= -github.com/lufia/plan9stats v0.0.0-20211012122336-39d0f177ccd0 h1:6E+4a0GO5zZEnZ81pIr0yLvtUWk2if982qA3F3QD6H4= -github.com/lufia/plan9stats v0.0.0-20211012122336-39d0f177ccd0/go.mod h1:zJYVVT2jmtg6P3p1VtQj7WsuWi/y4VnjVBn7F8KPB3I= +github.com/lufia/plan9stats v0.0.0-20240513124658-fba389f38bae h1:dIZY4ULFcto4tAFlj1FYZl8ztUZ13bdq+PLY+NOfbyI= +github.com/lufia/plan9stats v0.0.0-20240513124658-fba389f38bae/go.mod h1:ilwx/Dta8jXAgpFYFvSWEMwxmbWXyiUHkd5FwyKhb5k= github.com/magiconair/properties v1.8.5/go.mod h1:y3VJvCyxH9uVvJTWEGAELF3aiYNyPKd5NZ3oSwXrF60= github.com/magiconair/properties v1.8.10 h1:s31yESBquKXCV9a/ScB3ESkOjUYYv+X0rg8SYxI99mE= github.com/magiconair/properties v1.8.10/go.mod h1:Dhd985XPs7jluiymwWYZ0G4Z61jb3vdS329zhj2hYo0= @@ -503,8 +513,8 @@ github.com/multiformats/go-multiaddr-fmt v0.1.0 h1:WLEFClPycPkp4fnIzoFoV9FVd49/e github.com/multiformats/go-multiaddr-fmt v0.1.0/go.mod h1:hGtDIW4PU4BqJ50gW2quDuPVjyWNZxToGUh/HwTZYJo= github.com/multiformats/go-multibase v0.2.0 h1:isdYCVLvksgWlMW9OZRYJEa9pZETFivncJHmHnnd87g= github.com/multiformats/go-multibase v0.2.0/go.mod h1:bFBZX4lKCA/2lyOFSAoKH5SS6oPyjtnzK/XTFDPkNuk= -github.com/multiformats/go-multicodec v0.9.1 h1:x/Fuxr7ZuR4jJV4Os5g444F7xC4XmyUaT/FWtE+9Zjo= -github.com/multiformats/go-multicodec v0.9.1/go.mod h1:LLWNMtyV5ithSBUo3vFIMaeDy+h3EbkMTek1m+Fybbo= +github.com/multiformats/go-multicodec v0.9.2 h1:YrlXCuqxjqm3bXl+vBq5LKz5pz4mvAsugdqy78k0pXQ= +github.com/multiformats/go-multicodec v0.9.2/go.mod h1:LLWNMtyV5ithSBUo3vFIMaeDy+h3EbkMTek1m+Fybbo= github.com/multiformats/go-multihash v0.2.3 h1:7Lyc8XfX/IY2jWb/gI7JP+o7JEq9hOa7BFvVU9RSh+U= github.com/multiformats/go-multihash v0.2.3/go.mod h1:dXgKXCXjBzdscBLk9JkjINiEsCKRVch90MdaGiKsvSM= github.com/multiformats/go-multistream v0.6.0 h1:ZaHKbsL404720283o4c/IHQXiS6gb8qAN5EIJ4PN5EA= @@ -520,8 +530,8 @@ github.com/neelance/sourcemap v0.0.0-20200213170602-2833bce08e4c/go.mod 
h1:Qr6/a github.com/olekukonko/tablewriter v0.0.5 h1:P2Ga83D34wi1o9J6Wh1mRuqd4mF/x/lgBS7N7AbDhec= github.com/olekukonko/tablewriter v0.0.5/go.mod h1:hPp6KlRPjbx+hW8ykQs1w3UBbZlj6HuIJcUGPhkA7kY= github.com/onsi/ginkgo v1.16.5 h1:8xi0RTUf59SOSfEtZMvwTvXYMzG4gV23XVHOZiXNtnE= -github.com/onsi/ginkgo/v2 v2.22.2 h1:/3X8Panh8/WwhU/3Ssa6rCKqPLuAkVY2I0RoyDLySlU= -github.com/onsi/ginkgo/v2 v2.22.2/go.mod h1:oeMosUL+8LtarXBHu/c0bx2D/K9zyQ6uX3cTyztHwsk= +github.com/onsi/ginkgo/v2 v2.23.3 h1:edHxnszytJ4lD9D5Jjc4tiDkPBZ3siDeJJkUZJJVkp0= +github.com/onsi/ginkgo/v2 v2.23.3/go.mod h1:zXTP6xIp3U8aVuXN8ENK9IXRaTjFnpVB9mGmaSRvxnM= github.com/opencontainers/go-digest v1.0.0 h1:apOUWs51W5PlhuyGyz9FCeeBIOUDA/6nW8Oi/yOhh5U= github.com/opencontainers/go-digest v1.0.0/go.mod h1:0JzlMkj0TRzQZfJkVvzbP0HBR3IKzErnv2BNG4W4MAM= github.com/opencontainers/image-spec v1.1.1 h1:y0fUlFfIZhPF1W537XOLg0/fcx6zcHCJwooC2xJA040= @@ -583,13 +593,13 @@ github.com/power-devops/perfstat v0.0.0-20240221224432-82ca36839d55 h1:o4JXh1EVt github.com/power-devops/perfstat v0.0.0-20240221224432-82ca36839d55/go.mod h1:OmDBASR4679mdNQnz2pUhc2G8CO2JrUAVFDRBDP/hJE= github.com/pressly/goose/v3 v3.27.0 h1:/D30gVTuQhu0WsNZYbJi4DMOsx1lNq+6SkLe+Wp59BM= github.com/pressly/goose/v3 v3.27.0/go.mod h1:3ZBeCXqzkgIRvrEMDkYh1guvtoJTU5oMMuDdkutoM78= -github.com/prometheus/client_golang v1.21.1 h1:DOvXXTqVzvkIewV/CDPFdejpMCGeMcbGCQ8YOmu+Ibk= -github.com/prometheus/client_golang v1.21.1/go.mod h1:U9NM32ykUErtVBxdvD3zfi+EuFkkaBvMb09mIfe0Zgg= +github.com/prometheus/client_golang v1.22.0 h1:rb93p9lokFEsctTys46VnV1kLCDpVZ0a/Y92Vm0Zc6Q= +github.com/prometheus/client_golang v1.22.0/go.mod h1:R7ljNsLXhuQXYZYtw6GAE9AZg8Y7vEW5scdCXrWRXC0= github.com/prometheus/client_model v0.0.0-20190812154241-14fe0d1b01d4/go.mod h1:xMI15A0UPsDsEKsMN9yxemIoYk6Tm2C1GtYGdfGttqA= -github.com/prometheus/client_model v0.6.1 h1:ZKSh/rekM+n3CeS952MLRAdFwIKqeY8b62p8ais2e9E= -github.com/prometheus/client_model v0.6.1/go.mod 
h1:OrxVMOVHjw3lKMa8+x6HeMGkHMQyHDk9E3jmP2AmGiY= -github.com/prometheus/common v0.62.0 h1:xasJaQlnWAeyHdUBeGjXmutelfJHWMRr+Fg4QszZ2Io= -github.com/prometheus/common v0.62.0/go.mod h1:vyBcEuLSvWos9B1+CyL7JZ2up+uFzXhkqml0W5zIY1I= +github.com/prometheus/client_model v0.6.2 h1:oBsgwpGs7iVziMvrGhE53c/GrLUsZdHnqNwqPLxwZyk= +github.com/prometheus/client_model v0.6.2/go.mod h1:y3m2F6Gdpfy6Ut/GBsUqTWZqCUvMVzSfMLjcu6wAwpE= +github.com/prometheus/common v0.63.0 h1:YR/EIY1o3mEFP/kZCD7iDMnLPlGyuU2Gb3HIcXnA98k= +github.com/prometheus/common v0.63.0/go.mod h1:VVFF/fBIoToEnWRVkYoXEkq3R3paCoxG9PXP74SnV18= github.com/prometheus/procfs v0.19.2 h1:zUMhqEW66Ex7OXIiDkll3tl9a1ZdilUOd/F6ZXw4Vws= github.com/prometheus/procfs v0.19.2/go.mod h1:M0aotyiemPhBCM0z5w87kL22CxfcH05ZpYlu+b4J7mw= github.com/quic-go/qpack v0.5.1 h1:giqksBPnT/HDtZ6VhtFKgoLOWmlyo9Ei6u9PqzIMbhI= @@ -652,8 +662,12 @@ github.com/spf13/viper v1.21.0 h1:x5S+0EU27Lbphp4UKm1C+1oQO+rKx36vfCoaVebLFSU= github.com/spf13/viper v1.21.0/go.mod h1:P0lhsswPGWD/1lZJ9ny3fYnVqxiegrlNrEmgLjbTCAY= github.com/storacha/go-libstoracha v0.7.5 h1:zfRbku2RXxbH0uNWnpGQyJqafiJ+uCGs3tMmkHgZ/QE= github.com/storacha/go-libstoracha v0.7.5/go.mod h1:htUh/VZ0qHRLPJKWZsgXv9mCOqlAFGTVS//ApvQVNf0= -github.com/storacha/go-ucanto v0.7.2 h1:sLg+swDM/6VEcrb9VOik3hP8ek3NvqqKWiZRmsva5X0= -github.com/storacha/go-ucanto v0.7.2/go.mod h1:DZlWyzuSkXk3phAuJpGDyhxYWpJogW1RFqp/VfldT64= +github.com/storacha/go-ucanto v0.8.2 h1:oDrnRV3hN8+H816m33szbyEY7ItURBmyyMLmw2CHpBo= +github.com/storacha/go-ucanto v0.8.2/go.mod h1:DZlWyzuSkXk3phAuJpGDyhxYWpJogW1RFqp/VfldT64= +github.com/storacha/guppy v0.7.0 h1:oPC8FhgFIPzG2EI7Pm/BQUWTrDqRJzZEJVTSN/i64Zs= +github.com/storacha/guppy v0.7.0/go.mod h1:n5aeC6UKuTRVhxE6V90jhsbv8XYzNdeTfA8Gq2bDqTk= +github.com/storacha/indexing-service v1.12.2 h1:DrcIzvM36Ux7i0UmGoSZiU8lR8WjVIqsTULSE1kA+7I= +github.com/storacha/indexing-service v1.12.2/go.mod h1:Yk+uHoTA6qaTE13Ptq6FArsR9hESOetzej9194KwjhM= github.com/stretchr/objx v0.1.0/go.mod 
h1:HFkY916IF+rwdDfMAkV7OtwuqBVzrE8GR6GFx+wExME= github.com/stretchr/objx v0.5.3 h1:jmXUvGomnU1o3W/V5h2VEradbpJDwGrzugQQvL0POH4= github.com/stretchr/objx v0.5.3/go.mod h1:rDQraq+vQZU7Fde9LOZLr8Tax6zZvy4kuNKF+QYS+U0= @@ -743,8 +757,8 @@ go.uber.org/fx v1.24.0 h1:wE8mruvpg2kiiL1Vqd0CC+tr0/24XIB10Iwp2lLWzkg= go.uber.org/fx v1.24.0/go.mod h1:AmDeGyS+ZARGKM4tlH4FY2Jr63VjbEDJHtqXTGP5hbo= go.uber.org/goleak v1.3.0 h1:2K3zAYmnTNqV73imy9J1T3WC+gmCePx2hEGkimedGto= go.uber.org/goleak v1.3.0/go.mod h1:CoHD4mav9JJNrW/WLlf7HGZPjdw8EucARQHekz1X6bE= -go.uber.org/mock v0.5.0 h1:KAMbZvZPyBPWgD14IrIQ38QCyjwpvVVV6K/bHl1IwQU= -go.uber.org/mock v0.5.0/go.mod h1:ge71pBPLYDk7QIi1LupWxdAykm7KIEFchiOqd6z7qMM= +go.uber.org/mock v0.6.0 h1:hyF9dfmbgIX5EfOdasqLsWD6xqpNZlXblLB/Dbnwv3Y= +go.uber.org/mock v0.6.0/go.mod h1:KiVJ4BqZJaMj4svdfmHM0AUx4NJYO8ZNpPnZn1Z+BBU= go.uber.org/multierr v1.5.0/go.mod h1:FeouvMocqHpRaaGuG9EjoKcStLC43Zu/fmqdUMPcKYU= go.uber.org/multierr v1.6.0/go.mod h1:cdWPpRnG4AhwMwsgIHip0KRBQjJy5kYEpYjJxpXp9iU= go.uber.org/multierr v1.11.0 h1:blXXJkSxSSfBVBlC76pxqeO+LN3aDfLQo+309xJstO0= @@ -754,6 +768,8 @@ go.uber.org/zap v1.16.0/go.mod h1:MA8QOfq0BHJwdXa996Y4dYkAqRKB8/1K1QMMZVaNZjQ= go.uber.org/zap v1.17.0/go.mod h1:MXVU+bhUf/A7Xi2HNOnopQOrmycQ5Ih87HtOu4q5SSo= go.uber.org/zap v1.27.0 h1:aJMhYGrd5QSmlpLMr2MftRKl7t8J8PTZPA732ud/XR8= go.uber.org/zap v1.27.0/go.mod h1:GB2qFLM7cTU87MWRP2mPIjqfIDnGu+VIO4V/SdhGo2E= +go.uber.org/zap/exp v0.3.0 h1:6JYzdifzYkGmTdRR59oYH+Ng7k49H9qVpWwNSsGJj3U= +go.uber.org/zap/exp v0.3.0/go.mod h1:5I384qq7XGxYyByIhHm6jg5CHkGY0nsTfbDLgDDlgJQ= go.yaml.in/yaml/v3 v3.0.4 h1:tfq32ie2Jv2UxXFdLJdh3jXuOzWiL1fo0bu/FbuKpbc= go.yaml.in/yaml/v3 v3.0.4/go.mod h1:DhzuOOF2ATzADvBadXxruRBLzYTpT36CKvDb3+aBEFg= golang.org/x/crypto v0.0.0-20181029021203-45a5f77698d3/go.mod h1:6SG95UA2DQfeDnfUPMdvaQW0Q7yPrPDi9nlGo2tz2b4= @@ -777,8 +793,8 @@ golang.org/x/exp v0.0.0-20191227195350-da58074b4299/go.mod h1:2RIsYlXP63K8oxa1u0 golang.org/x/exp 
v0.0.0-20200119233911-0405dc783f0a/go.mod h1:2RIsYlXP63K8oxa1u096TMicItID8zy7Y6sNkU49FU4= golang.org/x/exp v0.0.0-20200207192155-f17229e696bd/go.mod h1:J/WKrq2StrnmMY6+EHIKF9dgMWnmCNThgcyBT1FY9mM= golang.org/x/exp v0.0.0-20200224162631-6cc2880d07d6/go.mod h1:3jZMyOhIsHpP37uCMkUooju7aAi5cS1Q23tOzKc+0MU= -golang.org/x/exp v0.0.0-20260218203240-3dfff04db8fa h1:Zt3DZoOFFYkKhDT3v7Lm9FDMEV06GpzjG2jrqW+QTE0= -golang.org/x/exp v0.0.0-20260218203240-3dfff04db8fa/go.mod h1:K79w1Vqn7PoiZn+TkNpx3BUWUQksGO3JcVX6qIjytmA= +golang.org/x/exp v0.0.0-20260312153236-7ab1446f8b90 h1:jiDhWWeC7jfWqR9c/uplMOqJ0sbNlNWv0UkzE0vX1MA= +golang.org/x/exp v0.0.0-20260312153236-7ab1446f8b90/go.mod h1:xE1HEv6b+1SCZ5/uscMRjUBKtIxworgEcEi+/n9NQDQ= golang.org/x/image v0.0.0-20190227222117-0694c2d4d067/go.mod h1:kZ7UVZpmo3dzQBMxlp+ypCbDeSB+sBbTgSJuh5dn5js= golang.org/x/image v0.0.0-20190802002840-cff245a6509b/go.mod h1:FeLwcggjj3mMvU+oOTbSwawSJRM1uh48EjtB4UJZlP0= golang.org/x/lint v0.0.0-20181026193005-c67002cb31c3/go.mod h1:UVdnD1Gm6xHRNCYTkRU2/jEulfH38KcIWyp/GAMgvoE= @@ -807,8 +823,8 @@ golang.org/x/mod v0.4.2/go.mod h1:s0Qsj1ACt9ePp/hMypM3fl4fZqREWJwdYDEqhRiZZUA= golang.org/x/mod v0.6.0-dev.0.20220419223038-86c51ed26bb4/go.mod h1:jJ57K6gSWd91VN4djpZkiMVwK6gcyfeH4XE8wZrZaV4= golang.org/x/mod v0.8.0/go.mod h1:iBbtSCu2XBx23ZKBPSOrRkjjQPZFPuis4dIYUhu/chs= golang.org/x/mod v0.9.0/go.mod h1:iBbtSCu2XBx23ZKBPSOrRkjjQPZFPuis4dIYUhu/chs= -golang.org/x/mod v0.33.0 h1:tHFzIWbBifEmbwtGz65eaWyGiGZatSrT9prnU8DbVL8= -golang.org/x/mod v0.33.0/go.mod h1:swjeQEj+6r7fODbD2cqrnje9PnziFuw4bmLbBZFrQ5w= +golang.org/x/mod v0.34.0 h1:xIHgNUUnW6sYkcM5Jleh05DvLOtwc6RitGHbDk4akRI= +golang.org/x/mod v0.34.0/go.mod h1:ykgH52iCZe79kzLLMhyCUzhMci+nQj+0XkbXpNYtVjY= golang.org/x/net v0.0.0-20180724234803-3673e40ba225/go.mod h1:mL1N/T3taQHkDXs73rZJwtUhF3w3ftmwwsq0BUmARs4= golang.org/x/net v0.0.0-20180826012351-8a410e7b638d/go.mod h1:mL1N/T3taQHkDXs73rZJwtUhF3w3ftmwwsq0BUmARs4= golang.org/x/net 
v0.0.0-20181023162649-9b4f9f5ad519/go.mod h1:mL1N/T3taQHkDXs73rZJwtUhF3w3ftmwwsq0BUmARs4= @@ -926,6 +942,7 @@ golang.org/x/sys v0.0.0-20210616094352-59db8d763f22/go.mod h1:oPkhp1MJrh7nUepCBc golang.org/x/sys v0.0.0-20210630005230-0f9fa26af87c/go.mod h1:oPkhp1MJrh7nUepCBck5+mAzfO9JrbApNNgaTdGDITg= golang.org/x/sys v0.0.0-20220520151302-bc2c85ada10a/go.mod h1:oPkhp1MJrh7nUepCBck5+mAzfO9JrbApNNgaTdGDITg= golang.org/x/sys v0.0.0-20220722155257-8c9f86f7a55f/go.mod h1:oPkhp1MJrh7nUepCBck5+mAzfO9JrbApNNgaTdGDITg= +golang.org/x/sys v0.1.0/go.mod h1:oPkhp1MJrh7nUepCBck5+mAzfO9JrbApNNgaTdGDITg= golang.org/x/sys v0.5.0/go.mod h1:oPkhp1MJrh7nUepCBck5+mAzfO9JrbApNNgaTdGDITg= golang.org/x/sys v0.6.0/go.mod h1:oPkhp1MJrh7nUepCBck5+mAzfO9JrbApNNgaTdGDITg= golang.org/x/sys v0.42.0 h1:omrd2nAlyT5ESRdCLYdm3+fMfNFE/+Rf4bDIQImRJeo= @@ -1010,8 +1027,8 @@ golang.org/x/tools v0.1.5/go.mod h1:o0xws9oXOQQZyjljx8fwUC0k7L1pTE6eaCbjGeHmOkk= golang.org/x/tools v0.1.12/go.mod h1:hNGJHUnrk76NpqgfD5Aqm5Crs+Hm0VOH/i9J2+nxYbc= golang.org/x/tools v0.6.0/go.mod h1:Xwgl3UAJ/d3gWutnCtw505GrjyAbvKui8lOU390QaIU= golang.org/x/tools v0.7.0/go.mod h1:4pg6aUX35JBAogB10C9AtvVL+qowtN4pT3CGSQex14s= -golang.org/x/tools v0.42.0 h1:uNgphsn75Tdz5Ji2q36v/nsFSfR/9BRFvqhGBaJGd5k= -golang.org/x/tools v0.42.0/go.mod h1:Ma6lCIwGZvHK6XtgbswSoWroEkhugApmsXyrUmBhfr0= +golang.org/x/tools v0.43.0 h1:12BdW9CeB3Z+J/I/wj34VMl8X+fEXBxVR90JeMX5E7s= +golang.org/x/tools v0.43.0/go.mod h1:uHkMso649BX2cZK6+RpuIPXS3ho2hZo4FVwfoy1vIk0= golang.org/x/xerrors v0.0.0-20190717185122-a985d3407aa7/go.mod h1:I/5z698sn9Ka8TeJc9MKroUUfqBBauWjQqLJ2OPfmY0= golang.org/x/xerrors v0.0.0-20191011141410-1b5146add898/go.mod h1:I/5z698sn9Ka8TeJc9MKroUUfqBBauWjQqLJ2OPfmY0= golang.org/x/xerrors v0.0.0-20191204190536-9bdfabe68543/go.mod h1:I/5z698sn9Ka8TeJc9MKroUUfqBBauWjQqLJ2OPfmY0= @@ -1156,14 +1173,34 @@ honnef.co/go/tools v0.0.1-2020.1.3/go.mod h1:X/FiERA/W4tHapMX5mGpAtMSVEeEUOyHaw9 honnef.co/go/tools v0.0.1-2020.1.4/go.mod 
h1:X/FiERA/W4tHapMX5mGpAtMSVEeEUOyHaw9vFzvIQ3k= lukechampine.com/blake3 v1.4.1 h1:I3Smz7gso8w4/TunLKec6K2fn+kyKtDxr/xcQEN84Wg= lukechampine.com/blake3 v1.4.1/go.mod h1:QFosUxmjB8mnrWFSNwKmvxHpfY72bmD2tQ0kBMM3kwo= +modernc.org/cc/v4 v4.27.1 h1:9W30zRlYrefrDV2JE2O8VDtJ1yPGownxciz5rrbQZis= +modernc.org/cc/v4 v4.27.1/go.mod h1:uVtb5OGqUKpoLWhqwNQo/8LwvoiEBLvZXIQ/SmO6mL0= +modernc.org/ccgo/v4 v4.30.2 h1:4yPaaq9dXYXZ2V8s1UgrC3KIj580l2N4ClrLwnbv2so= +modernc.org/ccgo/v4 v4.30.2/go.mod h1:yZMnhWEdW0qw3EtCndG1+ldRrVGS+bIwyWmAWzS0XEw= +modernc.org/fileutil v1.3.40 h1:ZGMswMNc9JOCrcrakF1HrvmergNLAmxOPjizirpfqBA= +modernc.org/fileutil v1.3.40/go.mod h1:HxmghZSZVAz/LXcMNwZPA/DRrQZEVP9VX0V4LQGQFOc= +modernc.org/gc/v2 v2.6.5 h1:nyqdV8q46KvTpZlsw66kWqwXRHdjIlJOhG6kxiV/9xI= +modernc.org/gc/v2 v2.6.5/go.mod h1:YgIahr1ypgfe7chRuJi2gD7DBQiKSLMPgBQe9oIiito= +modernc.org/gc/v3 v3.1.2 h1:ZtDCnhonXSZexk/AYsegNRV1lJGgaNZJuKjJSWKyEqo= +modernc.org/gc/v3 v3.1.2/go.mod h1:HFK/6AGESC7Ex+EZJhJ2Gni6cTaYpSMmU/cT9RmlfYY= +modernc.org/goabi0 v0.2.0 h1:HvEowk7LxcPd0eq6mVOAEMai46V+i7Jrj13t4AzuNks= +modernc.org/goabi0 v0.2.0/go.mod h1:CEFRnnJhKvWT1c1JTI3Avm+tgOWbkOu5oPA8eH8LnMI= modernc.org/libc v1.68.0 h1:PJ5ikFOV5pwpW+VqCK1hKJuEWsonkIJhhIXyuF/91pQ= modernc.org/libc v1.68.0/go.mod h1:NnKCYeoYgsEqnY3PgvNgAeaJnso968ygU8Z0DxjoEc0= modernc.org/mathutil v1.7.1 h1:GCZVGXdaN8gTqB1Mf/usp1Y/hSqgI2vAGGP4jZMCxOU= modernc.org/mathutil v1.7.1/go.mod h1:4p5IwJITfppl0G4sUEDtCr4DthTaT47/N3aT6MhfgJg= modernc.org/memory v1.11.0 h1:o4QC8aMQzmcwCK3t3Ux/ZHmwFPzE6hf2Y5LbkRs+hbI= modernc.org/memory v1.11.0/go.mod h1:/JP4VbVC+K5sU2wZi9bHoq2MAkCnrt2r98UGeSK7Mjw= +modernc.org/opt v0.1.4 h1:2kNGMRiUjrp4LcaPuLY2PzUfqM/w9N23quVwhKt5Qm8= +modernc.org/opt v0.1.4/go.mod h1:03fq9lsNfvkYSfxrfUhZCWPk1lm4cq4N+Bh//bEtgns= +modernc.org/sortutil v1.2.1 h1:+xyoGf15mM3NMlPDnFqrteY07klSFxLElE2PVuWIJ7w= +modernc.org/sortutil v1.2.1/go.mod h1:7ZI3a3REbai7gzCLcotuw9AC4VZVpYMjDzETGsSMqJE= modernc.org/sqlite v1.46.1 
h1:eFJ2ShBLIEnUWlLy12raN0Z1plqmFX9Qe3rjQTKt6sU= modernc.org/sqlite v1.46.1/go.mod h1:CzbrU2lSB1DKUusvwGz7rqEKIq+NUd8GWuBBZDs9/nA= +modernc.org/strutil v1.2.1 h1:UneZBkQA+DX2Rp35KcM69cSsNES9ly8mQWD71HKlOA0= +modernc.org/strutil v1.2.1/go.mod h1:EHkiggD70koQxjVdSBM3JKM7k6L0FbGE5eymy9i3B9A= +modernc.org/token v1.1.0 h1:Xl7Ap9dKaEs5kLoOQeQmPWevfnk/DM5qcLcYlA8ys6Y= +modernc.org/token v1.1.0/go.mod h1:UGzOrNV1mAFSEB63lOFHIpNRUVMvYTc6yu1SMY/XTDM= pgregory.net/rapid v1.2.0 h1:keKAYRcjm+e1F0oAuU5F5+YPAWcyxNNRK2wud503Gnk= pgregory.net/rapid v1.2.0/go.mod h1:PY5XlDGj0+V1FCq0o192FdRhpKHGTRIWBgqjDBTrq04= rsc.io/binaryregexp v0.2.0/go.mod h1:qTv7/COck+e2FymRvadv62gMdZztPaShugOCi3I+8D8= diff --git a/internal/config/config.go b/internal/config/config.go index 66ac223..70cd97e 100644 --- a/internal/config/config.go +++ b/internal/config/config.go @@ -23,6 +23,7 @@ type Config struct { Storage StorageConfig `mapstructure:"storage"` Log LogConfig `mapstructure:"log"` Mailer MailerConfig `mapstructure:"mailer"` + MS3T MS3TConfig `mapstructure:"ms3t"` } type DeploymentConfig struct { @@ -146,6 +147,58 @@ type LogConfig struct { Level string `mapstructure:"level"` } +// MS3TConfig configures the embedded ms3t S3-compatible HTTP server. +// When Enabled is false, none of the rest is consulted and no S3 +// listener starts. +type MS3TConfig struct { + // Enabled toggles the S3 listener. + Enabled bool `mapstructure:"enabled"` + // Addr is the host:port to bind the S3 listener to. + Addr string `mapstructure:"addr"` + // DataDir is where ms3t persists its SQLite blockstore + bucket + // registry and (when Forge is disabled) emits CARs to disk. + DataDir string `mapstructure:"data_dir"` + // ChunkSize is the body chunk size used for new objects, in bytes. + // 0 → ms3t default (1 MiB). + ChunkSize int64 `mapstructure:"chunk_size"` + // BatchBytes is the buffered-CAR size at which the uploader + // flushes. 0 → ms3t default (64 MiB). 
+ BatchBytes int64 `mapstructure:"batch_bytes"` + // BatchAge is the idle interval after which the uploader flushes, + // as a Go duration string (e.g. "5s"). Empty → ms3t default (5s). + BatchAge string `mapstructure:"batch_age"` + + // Forge controls whether ms3t ships CARs to a Storacha Forge + // stack via sprue's in-process piri/indexer clients. When disabled + // (the default), CARs go to disk only under DataDir/cars. + Forge MS3TForgeConfig `mapstructure:"forge"` +} + +// MS3TForgeConfig holds the optional Forge upload integration. +// When MS3T.Forge.Enabled is true, every batched CAR is shipped to +// piri through sprue's own routing, piriclient, and indexerclient +// — no UCAN-over-HTTP loopback, no separate principal/delegation +// setup. +// +// ms3t generates and persists its own space keypair on first run. +// The space's DID is derived from that keypair, and ms3t acts as +// the root UCAN authority over its own space (so it can issue the +// retrieval delegations the indexer needs to validate writes). +type MS3TForgeConfig struct { + Enabled bool `mapstructure:"enabled"` + // SpaceKeyFile is the path to the persisted space keypair. + // Generated on first run if missing. Defaults to + // DataDir/space.key. + SpaceKeyFile string `mapstructure:"space_key_file"` + // NoCache routes all block reads (MST nodes, manifests, body + // chunks) through the indexing-service + piri instead of a local + // SQLite cache, AND makes writes synchronous to Forge (Batched + // is bypassed). Closes the read-after-write race; raises per-PUT + // latency to the cost of the Forge round trip. Requires + // Enabled = true. + NoCache bool `mapstructure:"no_cache"` +} + type MailerConfig struct { // Type specifies the mailer implementation to use (e.g., "postmark", "smtp", "nop"). 
Type string `mapstructure:"type"` @@ -207,6 +260,14 @@ func SetDefaults(v *viper.Viper) { // Log defaults v.SetDefault("log.level", "info") + + // MS3T defaults — disabled by default; sprue is the source of + // truth for whether the S3 listener is exposed. 
+	v.SetDefault("ms3t.enabled", false)
+	v.SetDefault("ms3t.addr", ":9000")
+	v.SetDefault("ms3t.data_dir", "./ms3t-data")
+	v.SetDefault("ms3t.batch_age", "5s")
+	v.SetDefault("ms3t.forge.enabled", false)
 }
 
 // BindEnvVars sets up environment variable binding with SPRUE_ prefix.
diff --git a/internal/fx/app.go b/internal/fx/app.go
index 0629579..e6fb3a8 100644
--- a/internal/fx/app.go
+++ b/internal/fx/app.go
@@ -24,6 +24,7 @@ var AppModule = func(cfg *config.Config) fx.Option {
 		service.Module,
 		handlers.Module,
 		ServerModule,
+		MS3TModule,
 	}
 	switch cfg.Storage.Type {
 	case config.StorageTypeMemory:
diff --git a/internal/fx/ms3t.go b/internal/fx/ms3t.go
new file mode 100644
index 0000000..7f9cd89
--- /dev/null
+++ b/internal/fx/ms3t.go
@@ -0,0 +1,344 @@
+package fx
+
+import (
+	"context"
+	"database/sql"
+	"errors"
+	"fmt"
+	"log/slog"
+	"net"
+	"net/http"
+	"os"
+	"path/filepath"
+	"time"
+
+	cbor "github.com/ipfs/go-ipld-cbor"
+	"github.com/storacha/go-ucanto/did"
+	"github.com/storacha/go-ucanto/principal"
+	"go.uber.org/fx"
+	"go.uber.org/zap"
+	"go.uber.org/zap/exp/zapslog"
+
+	"github.com/storacha/sprue/internal/config"
+	"github.com/storacha/sprue/pkg/identity"
+	"github.com/storacha/sprue/pkg/indexerclient"
+	"github.com/storacha/sprue/pkg/ms3t/blockstore"
+	"github.com/storacha/sprue/pkg/ms3t/bucket"
+	"github.com/storacha/sprue/pkg/ms3t/registry"
+	"github.com/storacha/sprue/pkg/ms3t/server"
+	"github.com/storacha/sprue/pkg/ms3t/uploader"
+	"github.com/storacha/sprue/pkg/piriclient"
+	"github.com/storacha/sprue/pkg/routing"
+
+	_ "modernc.org/sqlite"
+)
+
+// MS3TModule registers the embedded ms3t S3 listener. When
+// config.MS3T.Enabled is false the module is a no-op, so it's safe
+// to always include in the app graph.
+var MS3TModule = fx.Module("ms3t",
+	fx.Invoke(RegisterMS3TLifecycle),
+)
+
+// MS3TDeps bundles the sprue-internal services ms3t pulls when
+// config.MS3T.Forge.Enabled is true. Only IndexerClient is optional
+// (e.g., it short-circuits to nil when the indexer endpoint isn't
+// set); forge mode then fails in buildMS3TInnerUploader.
+type MS3TDeps struct {
+	fx.In
+
+	Identity      *identity.Identity
+	Router        *routing.Service
+	PiriProvider  piriclient.Provider
+	IndexerClient *indexerclient.Client `optional:"true"`
+}
+
+// RegisterMS3TLifecycle wires ms3t's bucket service, HTTP handler,
+// and listener into the fx lifecycle. Construction failures (bad
+// config, missing service for forge mode) are returned synchronously
+// so fx can abort startup before any other module initializes; the
+// listen address is bound in OnStart, so a bind failure also aborts
+// startup rather than surfacing only in the logs.
+func RegisterMS3TLifecycle(
+	lc fx.Lifecycle,
+	cfg *config.Config,
+	zlog *zap.Logger,
+	deps MS3TDeps,
+) error {
+	mc := cfg.MS3T
+	if !mc.Enabled {
+		return nil
+	}
+
+	if err := os.MkdirAll(mc.DataDir, 0o755); err != nil {
+		return fmt.Errorf("ms3t: mkdir data dir: %w", err)
+	}
+
+	noCache := mc.Forge.Enabled && mc.Forge.NoCache
+	if mc.Forge.NoCache && !mc.Forge.Enabled {
+		// no_cache is documented as requiring forge.enabled; surface
+		// the misconfiguration instead of silently dropping it.
+		zlog.Warn("ms3t.forge.no_cache is set but ms3t.forge.enabled is false; no_cache ignored")
+	}
+
+	// When Forge is enabled, load or generate ms3t's own space
+	// keypair. ms3t IS the space owner (root UCAN authority) so that
+	// self-issued space/content/retrieve delegations validate down
+	// the chain to piri's retrieval auth check.
+	var spaceSigner principal.Signer
+	if mc.Forge.Enabled {
+		keyPath := mc.Forge.SpaceKeyFile
+		if keyPath == "" {
+			keyPath = filepath.Join(mc.DataDir, "space.key")
+		}
+		s, err := uploader.LoadOrCreateSigner(keyPath)
+		if err != nil {
+			return fmt.Errorf("ms3t: space signer: %w", err)
+		}
+		spaceSigner = s
+		zlog.Info("ms3t space loaded",
+			zap.String("space_did", spaceSigner.DID().String()),
+			zap.String("key_file", keyPath),
+		)
+	}
+
+	// Build the blockstore. In no_cache mode this is a Forge-backed
+	// read-only store (every Get hits indexer + piri); SQLite is
+	// skipped entirely. Otherwise we open a SQLite file under
+	// data_dir.
+	var bs cbor.IpldBlockstore
+	var sqliteDB *sql.DB
+
+	if noCache {
+		fb, err := blockstore.NewForge(blockstore.ForgeConfig{
+			IndexerEndpoint: cfg.Indexer.Endpoint,
+			IndexerDID:      cfg.Indexer.DID,
+			Spaces:          []did.DID{spaceSigner.DID()},
+			Signer:          deps.Identity.Signer,
+			SpaceSigner:     spaceSigner,
+			Logger:          zlog,
+		})
+		if err != nil {
+			return fmt.Errorf("ms3t: forge blockstore: %w", err)
+		}
+		bs = fb
+	} else {
+		dbPath := filepath.Join(mc.DataDir, "ms3t.db")
+		db, err := sql.Open("sqlite", dbPath+"?_pragma=journal_mode(WAL)&_pragma=foreign_keys(on)")
+		if err != nil {
+			return fmt.Errorf("ms3t: open sqlite: %w", err)
+		}
+		db.SetMaxOpenConns(1)
+		sb, err := blockstore.New(db)
+		if err != nil {
+			_ = db.Close()
+			return fmt.Errorf("ms3t: blockstore: %w", err)
+		}
+		bs = sb
+		sqliteDB = db
+	}
+
+	// Registry always lives in SQLite. With the SQLite blockstore it
+	// shares that handle (ms3t.db); in no_cache mode it gets its own
+	// file, ms3t-registry.db, which then holds nothing else.
+	regDBPath := filepath.Join(mc.DataDir, "ms3t-registry.db")
+	if !noCache {
+		// reuse the same db for registry when SQLite blockstore is in use
+		regDBPath = ""
+	}
+	regDB, err := openRegistryDB(regDBPath, sqliteDB)
+	if err != nil {
+		if sqliteDB != nil {
+			_ = sqliteDB.Close()
+		}
+		return fmt.Errorf("ms3t: registry db: %w", err)
+	}
+	reg, err := registry.NewSQL(regDB)
+	if err != nil {
+		_ = regDB.Close()
+		if sqliteDB != nil && sqliteDB != regDB {
+			_ = sqliteDB.Close()
+		}
+		return fmt.Errorf("ms3t: registry: %w", err)
+	}
+
+	carDir := filepath.Join(mc.DataDir, "cars")
+	innerUp, err := buildMS3TInnerUploader(mc, carDir, deps, spaceSigner, zlog)
+	if err != nil {
+		_ = regDB.Close()
+		if sqliteDB != nil && sqliteDB != regDB {
+			_ = sqliteDB.Close()
+		}
+		return fmt.Errorf("ms3t: uploader: %w", err)
+	}
+
+	// Wrap in Batched unless no_cache is on. In no_cache mode every
+	// PUT blocks on the full Forge round trip, closing the
+	// read-after-write race.
+	var up uploader.Uploader
+	if noCache {
+		up = innerUp
+	} else {
+		batchAge, err := parseDurationOr(mc.BatchAge, 5*time.Second)
+		if err != nil {
+			_ = regDB.Close()
+			if sqliteDB != nil && sqliteDB != regDB {
+				_ = sqliteDB.Close()
+			}
+			return fmt.Errorf("ms3t: batch_age: %w", err)
+		}
+		batchBytes := mc.BatchBytes
+		if batchBytes <= 0 {
+			batchBytes = 64 << 20
+		}
+		up = uploader.NewBatched(innerUp, uploader.BatchedOptions{
+			MaxBytes: batchBytes,
+			MaxAge:   batchAge,
+		})
+	}
+
+	chunkSize := mc.ChunkSize
+	if chunkSize <= 0 {
+		chunkSize = bucket.DefaultChunkSize
+	}
+	svc := bucket.New(bs, reg, bucket.Options{
+		ChunkSize: chunkSize,
+		Uploader:  up,
+	})
+
+	// Adapt sprue's zap logger into ms3t's slog interface so log
+	// output funnels through one pipeline.
+	slogger := slog.New(zapslog.NewHandler(zlog.Core(), zapslog.WithName("ms3t")))
+	httpHandler := server.New(svc, slogger)
+	// ReadHeaderTimeout bounds slow-header clients; body timeouts stay
+	// unset because object uploads/downloads can legitimately run long.
+	srv := &http.Server{Addr: mc.Addr, Handler: httpHandler, ReadHeaderTimeout: 10 * time.Second}
+
+	lc.Append(fx.Hook{
+		OnStart: func(ctx context.Context) error {
+			if err := svc.Recover(ctx); err != nil {
+				return fmt.Errorf("ms3t: recover: %w", err)
+			}
+			// Bind synchronously so a bad or busy address fails startup
+			// instead of only being logged from the serve goroutine.
+			addr := mc.Addr
+			if addr == "" {
+				addr = ":http" // same fallback http.Server.ListenAndServe applies
+			}
+			ln, err := net.Listen("tcp", addr)
+			if err != nil {
+				return fmt.Errorf("ms3t: listen %q: %w", addr, err)
+			}
+			zlog.Info("starting ms3t S3 listener",
+				zap.String("addr", mc.Addr),
+				zap.String("data_dir", mc.DataDir),
+				zap.Bool("forge", mc.Forge.Enabled),
+				zap.Bool("no_cache", noCache),
+				zap.Int64("chunk_size", chunkSize),
+			)
+			go func() {
+				if err := srv.Serve(ln); err != nil && !errors.Is(err, http.ErrServerClosed) {
+					zlog.Error("ms3t listener error", zap.Error(err))
+				}
+			}()
+			return nil
+		},
+		OnStop: func(ctx context.Context) error {
+			zlog.Info("shutting down ms3t S3 listener")
+			shutdownCtx, cancel := context.WithTimeout(ctx, 30*time.Second)
+			defer cancel()
+
+			var errs []error
+			if err := srv.Shutdown(shutdownCtx); err != nil {
+				errs = append(errs, fmt.Errorf("http shutdown: %w", err))
+			}
+			if err := svc.Shutdown(shutdownCtx); err != nil {
+				errs = append(errs, fmt.Errorf("service shutdown: %w", err))
+			}
+			if err := regDB.Close(); err != nil {
+				errs = append(errs, fmt.Errorf("registry db close: %w", err))
+			}
+			if sqliteDB != nil && sqliteDB != regDB {
+				if err := sqliteDB.Close(); err != nil {
+					errs = append(errs, fmt.Errorf("blockstore db close: %w", err))
+				}
+			}
+			if len(errs) > 0 {
+				return fmt.Errorf("ms3t shutdown: %w", errors.Join(errs...))
+			}
+			return nil
+		},
+	})
+	return nil
+}
+
+// openRegistryDB returns a *sql.DB for the registry. If reuse is
+// non-nil, returns it (registry shares the SQLite file with the
+// blockstore). Otherwise opens a fresh sqlite db at path.
+func openRegistryDB(path string, reuse *sql.DB) (*sql.DB, error) {
+	if reuse != nil {
+		return reuse, nil
+	}
+	db, err := sql.Open("sqlite", path+"?_pragma=journal_mode(WAL)&_pragma=foreign_keys(on)")
+	if err != nil {
+		return nil, err
+	}
+	db.SetMaxOpenConns(1)
+	return db, nil
+}
+
+// buildMS3TInnerUploader returns the inner uploader. With
+// ms3t.forge.enabled = false the inner is uploader.Disk (writes CARs
+// to a local directory). With it set to true, it's uploader.Internal
+// — sprue's piriclient and indexerclient with sprue's identity as
+// the signer and ms3t's self-generated space keypair as the space
+// owner.
+func buildMS3TInnerUploader(
+	mc config.MS3TConfig,
+	carDir string,
+	deps MS3TDeps,
+	spaceSigner principal.Signer,
+	zlog *zap.Logger,
+) (uploader.Uploader, error) {
+	if !mc.Forge.Enabled {
+		return uploader.NewDisk(carDir)
+	}
+	if deps.IndexerClient == nil {
+		return nil, errors.New("ms3t.forge requires the indexer client; configure indexer.endpoint")
+	}
+	if spaceSigner == nil {
+		return nil, errors.New("ms3t.forge requires a space signer (internal error)")
+	}
+
+	zlog.Info("ms3t internal uploader configured",
+		zap.String("space_did", spaceSigner.DID().String()),
+		zap.String("signer_did", deps.Identity.DID()),
+		zap.Bool("no_cache", mc.Forge.NoCache),
+	)
+
+	return uploader.NewInternal(uploader.InternalConfig{
+		Router:        deps.Router,
+		PiriProvider:  deps.PiriProvider,
+		IndexerClient: deps.IndexerClient,
+		Signer:        deps.Identity.Signer,
+		SpaceSigner:   spaceSigner,
+		Logger:        zlog,
+	})
+}
+
+// parseDurationOr parses s as a Go duration string. An empty string
+// or a non-positive duration yields dflt, mirroring the "<= 0 means
+// default" handling of the sibling size settings; a malformed s is
+// returned as an error.
+func parseDurationOr(s string, dflt time.Duration) (time.Duration, error) {
+	if s == "" {
+		return dflt, nil
+	}
+	d, err := time.ParseDuration(s)
+	if err != nil {
+		return 0, err
+	}
+	if d <= 0 {
+		return dflt, nil
+	}
+	return d, nil
+}
diff --git a/pkg/ms3t/architectural.md b/pkg/ms3t/architectural.md
new file mode 100644
index 0000000..27db919
--- /dev/null
+++ b/pkg/ms3t/architectural.md
@@ -0,0 +1,521 @@
+# ms3t — S3 over Forge (MVP / prototype)
+
+This document describes what the code in `sprue/pkg/ms3t/` actually
+does today, running in smelt with the deployed wiring.
+
+It is **not** an architecture spec for a production system. ms3t is a
+prototype for "expose an S3 API on top of the Forge stack." Every
+choice baked into the current shape is up for debate. The job of
+this doc is to give the team enough of a map to read the code, ask
+"why this and not that?", and weigh in on where to go next.
+
+If you're looking for what isn't built yet or what was considered
+and dropped, see "Choices we made (and the alternatives)" and
+"Open questions" near the end.
+ +## What ms3t is + +A goroutine inside sprue that: + +- Listens on a configured `host:port` and speaks the AWS S3 REST + protocol (path-style; subset of operations: bucket CRUD, object + PUT/GET/HEAD/DELETE, ListObjectsV2, range GETs) +- Stores object data as content-addressed CAR files in piri (via + sprue's existing piriclient + routing + indexerclient — same + packages sprue's own UCAN handlers use) +- Stores per-bucket "what's the current MST root?" in a small SQLite + file alongside sprue +- Stores its own UCAN identity (a generated did:key) in a file + alongside sprue + +There is no other persistent state. ms3t holds no canonical block +data — every block read goes to the network. + +## Local state + +``` +data_dir/ +├── space.key # ed25519 keypair, ms3t's UCAN identity / space root +└── ms3t-registry.db # SQLite, one row per bucket +``` + +The SQLite schema (`pkg/ms3t/registry/sqlite.go`): + +```sql +CREATE TABLE buckets ( + name TEXT PRIMARY KEY, + root_cid BLOB, -- current MST root, NULL for empty bucket + forge_root_cid BLOB, -- last root known to be in Forge + created_at INTEGER NOT NULL +); +``` + +`forge_root_cid` is plumbing for a batched-writes mode that isn't +currently active; in the deployed sync-writes mode it always equals +`root_cid` after each PUT/DELETE. + +## How the data is shaped + +Each S3 object's bytes get chunked into raw IPLD blocks (default 1 +MiB, raw codec, sha256 multihash) and pointed at by an +`ObjectManifest` (DAG-CBOR): + +```go +type ObjectManifest struct { + Key string + ContentType string + Created int64 + Body Body +} + +type Body struct { + Size int64 + ChunkSize int64 + Chunks []cid.Cid + SHA256 []byte // for ETag +} +``` + +The bucket itself is an MST keyed by S3 key, with leaves pointing at +manifest CIDs. The "current state" of a bucket is a single CID — the +MST root — held in the registry. 
+ +Every PUT/DELETE produces a new MST root via the +forked-from-atproto MST in `pkg/ms3t/mst/`, which is content-addressed +all the way down. The MST itself is fully described in its package +docs. + +## How the data lives in Forge + +For every S3 PUT, ms3t produces **one CAR file** containing: + +- the new body chunks (raw blocks) +- the new ObjectManifest +- the mutated MST nodes (the path from leaf to root) + +Plus a small **index blob** (also a CAR) describing where each inner +block sits within the data CAR, byte-offset-and-length, encoded as a +`blobindex.ShardedDagIndexView`. + +Both blobs are uploaded to piri. The index is registered with the +indexing-service via `assert/index`. From that point onward, any +inner CID (an MST node, a manifest, a body chunk) is resolvable via: + +1. Indexer query: `multihash → (CAR multihash, byte offset, length)` +2. Piri ranged GET: read `[offset, offset+length)` of the CAR + +The indexer + piri retrieval flow are how reads find anything. + +## The PUT flow + +``` +S3 client ms3t sprue services piri indexer + │ │ │ │ │ + │ PUT k=v │ │ │ │ + ├────────────▶│ │ │ │ + │ │ load HEAD root_cid │ │ │ + │ │ from registry │ │ │ + │ │ │ │ │ + │ │ chunk body into │ │ │ + │ │ raw blocks (in mem) │ │ │ + │ │ │ │ │ + │ │ mst.Add(key, mfCid): │ │ + │ │ reads existing nodes via Forge ─────┤ │ + │ │ ◀─ indexer + ranged piri GETs │ │ + │ │ produces new path nodes (in mem) │ │ + │ │ │ + │ │ pack body + manifest + mst nodes into one CAR │ + │ │ │ + │ │ piriclient.Allocate(carHash, carSize) │ + │ ├──────────────────────▶│ │ │ + │ │◀── presigned URL ─────┤ │ │ + │ │ │ + │ │ HTTP PUT carBytes ────────────────▶│ │ + │ │ │ + │ │ piriclient.Accept │ + │ ├──────────────────────▶│ │ │ + │ │ │ + │ │ build ShardedDagIndexView over CAR offsets │ + │ │ │ + │ │ Allocate + PUT + Accept the index blob ──┐ │ + │ │ ▼ │ + │ │ │ + │ │ self-issue space/content/retrieve │ + │ │ delegation (space → sprue) for the index blob │ + │ │ │ + │ │ 
indexerclient.PublishIndexClaim ─────────────────▶ + │ │ │ + │ │ registry: CAS root_cid old → new │ + │ │ │ + │ 200 OK + ETag │ + │◀────────────┤ │ +``` + +This is **synchronous**: every step blocks the client's PUT. Three +Forge round trips per PUT (data CAR allocate+PUT+accept, index +allocate+PUT+accept, then index claim publication). Read-after-write +is correct because the assert/index has been published before 200 is +returned. + +## The GET flow + +``` +S3 client ms3t indexer piri + │ │ │ │ + │ GET k │ │ │ + ├────────────▶│ │ │ + │ │ load HEAD root from registry │ + │ │ │ + │ │ for each MST node walked from │ + │ │ root toward the leaf: │ + │ │ 1. indexer query for cid ────▶│ + │ │ 2. self-issue retrieve UCAN │ + │ │ 3. rclient.Execute on piri ──────────▶│ + │ │ ◀── block bytes (Range) ─────────┤ + │ │ 4. parse, follow next link │ + │ │ │ + │ │ once at the leaf manifest: │ + │ │ for each body chunk: same dance │ + │ │ │ + │ │ stream reassembled body to client │ + │ 200 + bytes │ │ + │◀────────────┤ │ +``` + +Every block read is a network round trip. There is no local cache +serving any of these reads. + +The `rclient.Execute` call wraps the GET with a UCAN auth header +(`X-Agent-Message`) carrying a `space/content/retrieve` invocation +chained back to the space root — piri rejects unauthenticated +retrievals. + +## Where the UCAN identity comes from + +ms3t generates and persists its own ed25519 keypair on first run. +That keypair is the **space**: a `did:key` whose private half is in +`data_dir/space.key`. 
ms3t is the root UCAN authority over its own +space, which lets it self-issue all the delegations it needs: + +- For the indexer: a blanket `space/content/retrieve` with + `NoCaveats` so the indexer can fetch any blob in the space when + validating an index claim +- For piri retrievals: a 60-second `space/content/retrieve` proof + per Get, attached to a typed retrieve invocation +- For PublishIndexClaim: a per-call retrieval delegation scoped to + the specific index blob + +Sprue uses its own identity (`upload.pem` in smelt) for the piri +allocate/accept invocations and as the audience of ms3t's +self-issued retrieval delegations. So: + +- **Sprue identity**: signs piri-side blob lifecycle invocations +- **ms3t space keypair**: signs anything that needs to chain back to + "the owner of this space" + +## Components map + +``` + ┌────────────────────────┐ +S3 client ──────▶ │ ms3t HTTP listener │ pkg/ms3t/server/ + │ (S3 protocol → service)│ + └───────────┬────────────┘ + │ + ┌───────────┴────────────┐ + │ bucket.Service │ pkg/ms3t/bucket/ + │ load HEAD, mutate MST, │ + │ build CAR, commit │ + └─────┬───────────┬──────┘ + │ │ + ┌────────────────┘ └─────────────────┐ + │ │ + ┌───▼─────────────┐ ┌────────▼────────────┐ + │ registry.SQL │ │ blockstore.Forge │ + │ SQLite, HEAD │ │ reads via indexer │ + │ pointer per │ │ + piri rclient. │ + │ bucket │ │ Put: no-op. 
│ + └─────────────────┘ └─────────┬───────────┘ + │ + ┌────────────────────────┐ │ + │ uploader.Internal │ ◀────────┘ writes side + │ Submit: encode CAR, │ + │ piriclient + indexer- │ + │ client per call │ + └────────────┬───────────┘ + │ + ┌──────────────────┼──────────────────────┐ + ▼ ▼ ▼ + ┌──────────┐ ┌──────────┐ ┌────────────────┐ + │ sprue │ │ piri │ │ indexing- │ + │ routing │ │ blob │ │ service │ + │ piriclient │ store │ │ assert/index │ + └──────────┘ └──────────┘ └────────────────┘ + (in-process (HTTP w/ UCAN auth) + Go calls) +``` + +ms3t calls **sprue's services in-process** (Go function calls into +`pkg/piriclient`, `pkg/routing`, `pkg/indexerclient`). It does not +loopback through sprue's HTTP/UCAN handler. sprue's own UCAN +endpoint and ms3t's S3 endpoint are two unrelated listeners in the +same process. + +## Choices we made (and the alternatives) + +These are **prototype decisions**, made to ship something working. +Each is a place the team should weigh in on whether the choice +holds up. + +### Sync writes, no local block cache + +Every PUT blocks on three Forge round trips. Every GET hits the +network for every block. There is no local SQLite blockstore active +in this mode. + +- **Why we picked this**: forces the read path to actually work + end-to-end against real Forge. Closes the read-after-write race + by construction. Simplest possible state model: only the registry + is mutable. +- **Why it's awkward**: `aws s3 sync` of many small files is slow. + An MST traversal during a PUT pays N network round trips for N + existing nodes on the path, even though those nodes are + deterministic. +- **Alternative we have code for**: `Batched(Internal)` uploader + + SQLite blockstore as a read-through cache. This is the default + when `ms3t.forge.no_cache: false`. Faster, but the + `forge_root_cid` machinery has to actually do something — and the + read-after-write window opens. 
+ +### ms3t owns its space + +ms3t generates its own ed25519 keypair and is the root UCAN +authority over its own space. Self-issues every delegation it needs. + +- **Why we picked this**: zero out-of-band provisioning. The first + time sprue starts with `forge.enabled`, ms3t writes a key and + uses it. No "go ask the delegator for a delegation, paste it + here." +- **Why it's awkward**: ms3t-as-space-root is unusual. In a real + multi-tenant deployment this doesn't model what we'd want — each + S3 customer would presumably have their own space, with ms3t + acting as a tenant-aware orchestrator. +- **Alternative we considered**: ms3t holds an externally-issued + delegation chain into a pre-provisioned space. Better tenant + story, requires delegation provisioning machinery. + +### One CAR per S3 op (body + structural) + +Body chunks ride in the same CAR as the structural blocks. The +indexer maps inner CIDs to byte ranges within the outer CAR. One +data-CAR upload + one index-blob upload per PUT. + +- **Why we picked this**: matches what guppy does for filesystem + uploads — minimum number of piri round trips per PUT. Body + retrievals work via ranged GETs against the outer CAR. +- **Why it's awkward**: rules out direct-passthrough of body bytes + (we'd want body chunks as their own piri blobs so a 307 redirect + has a stable URL target). +- **Alternative we considered**: separate piri blobs per body + chunk, smaller structural CAR for the MST + manifest. Doubles + the per-PUT round trip count but enables passthrough. + +### ms3t in the data path + +The S3 client uploads body bytes to ms3t; ms3t uploads to piri. +Same on the read side. ms3t pays the bandwidth. + +- **Why we picked this**: the alternative (direct passthrough) + needs a Forge feature we don't have — see "Direct passthrough" + under future directions. 
+- **Why it's awkward**: the operator running sprue + piri pays + bandwidth twice (client→sprue, sprue→piri) when conceptually + the bytes only need to move once. In a federated model where + piri storage is run by different operators, this becomes + structurally wrong (sprue's operator pays to deposit bytes onto + someone else's hardware). + +### Embedded in sprue + +ms3t lives at `sprue/pkg/ms3t/` and is wired by sprue's fx graph. +No deployment artifact distinct from sprue. + +- **Why we picked this**: zero auth coordination — ms3t is sprue, + it has all sprue's identities and clients in-process. One binary + to ship, one config file. +- **Why it's awkward**: every sprue release ships ms3t, every ms3t + change requires a sprue release. Sprue maintainers inherit MST + + S3 protocol surface area. +- **Alternative**: standalone ms3t binary, talks to sprue/piri via + external UCAN-over-HTTP. (This exists at + github.com/frrist/ms3t — a separate repo that was the original + prototype before we copied into sprue.) + +### Sticky-bucket routing (assumed but not built) + +The current code assumes a single ms3t instance per bucket, via the +in-process `sync.Mutex` per-bucket lock. There is no cross-instance +coordination. + +- **Why we picked this**: works for a single-process MVP. +- **What's needed for HA**: either sticky-bucket routing at a load + balancer (hash bucket name → ms3t instance) or multi-writer with + CAS retry and cache invalidation. Not implemented. 
+ +## Operational characteristics observed + +These are observations from smelt, not promises: + +- `aws s3 cp small.txt s3://demo/k` (small file): a few hundred + milliseconds inside the docker network, dominated by the three + Forge round trips +- `aws s3 cp s3://demo/k -` immediately after: works (sync writes + close the race) +- `aws s3 sync` of many small files: visibly slow — each file pays + the full Forge round-trip cost serially per S3 PUT +- `aws s3 ls`: walks MST through the network; cost grows with + bucket size + +We have not measured anything precisely. These are rough impressions. + +## Known limitations + +- **Slow.** Sync writes + no read cache. No effort has gone into + performance. +- **No GC.** S3 DELETE removes the leaf from the MST. Body chunks + become unreferenced from the current root, but we don't tell + Forge to expire them. Storage grows monotonically. +- **No multipart upload.** S3 client splits files >8 MB into + multipart by default; we don't implement it. Operators have to + set `multipart_threshold = 5GB` in their AWS profile. +- **No `aws-chunked` body decoding.** The current AWS CLI default + upload format. Operators have to set + `request_checksum_calculation = when_required` to disable it. +- **Single-tenant.** One ms3t = one space. +- **Single-instance.** No HA story. +- **Disk and Guppy uploaders are dead code in sprue's wiring.** + They exist in `pkg/ms3t/uploader/` for the standalone-ms3t use + case; sprue only wires `Internal` (when forge enabled) or `Disk` + (when forge disabled, in the cache mode that isn't currently + deployed). + +## Future directions (not implemented) + +### Direct passthrough + +The S3 client uploads body bytes directly to a piri presigned URL +via 307 redirect; ms3t never sees the bytes. Symmetric on reads. 
+ +- ms3t becomes purely control-plane +- Bandwidth shifts to piri's operator (correct in the federated + model) +- Blocked on a Forge feature: piri/sprue must gate the + client-visible 200 on an ms3t-side commit hook so ms3t can + finalize the MST mutation before the client believes the PUT + succeeded. Without this, the PUT-to-MST-commit window is a real + race. + +### Async writes + +`Batched(Internal)` uploader: ack the PUT after local commit, ship +to Forge in the background. Faster, but introduces a window where +PUT-then-immediate-GET fails until the batch flushes. Code already +exists; it's the default mode when `no_cache: false`. We just don't +run with it. + +### Read-through cache + +SQLite blockstore populated on writes, consulted before falling +through to Forge. Order-of-magnitude speedup on hot reads at the +cost of cache invalidation complexity (when does ms3t know its +cached version is stale? Probably "never on its own" — would need +inputs from sprue's existing replay/invalidation mechanisms). + +### Multi-tenant + +One ms3t serving N S3 customers, each in their own space. Requires +either: +- Per-tenant space delegations imported into ms3t (provisioning + machinery), or +- ms3t generating + tracking per-tenant spaces, with some external + authority for tenant identity + +### Multi-instance + +Either sticky-bucket routing at a load balancer (bucket name → ms3t +instance via consistent hash) or proper multi-writer with CAS retry +and cache invalidation. Both unbuilt. + +### Multipart upload + aws-chunked + +Real S3 compatibility. Both are well-defined extensions of the +current per-PUT model — multipart effectively becomes "many +UploadPart calls accumulate body chunks; CompleteMultipartUpload +fires the MST mutation." + +### GC + +Walk reachable from current HEAD (and any retained snapshots), mark +those CIDs, ask Forge to expire the rest. Forge would need to grow +an `assert/expire`-style claim, and we'd need a retention policy.
+ +## Open questions for the team + +1. **Sync vs batched writes for MVP**: is `aws s3 sync` slowness + acceptable for now, or should we wire `Batched` and accept the + read-after-write window? + +2. **Tenant model**: when we want N S3 customers, do they share + ms3t's space or each get their own? The latter implies a + provisioning step we currently avoid. + +3. **Where should ms3t actually run?** Embedded in sprue is what + we have. Standalone ms3t-with-Guppy works too (the original + prototype). Embedded-in-piri was discussed and rejected. Are + there scenarios where standalone matters more than we've + assumed? + +4. **Direct passthrough's commit-hook feature**: is this on + anyone's roadmap? It's the lever for federated topologies. If + not, the "ms3t in the data path" choice becomes load-bearing + for any deployment beyond a single operator. + +5. **Server-side concat for large GETs**: a multi-chunk body has + no clean direct-passthrough path because there's no single URL + to redirect to. Either large-object reads always go through + ms3t (current behavior), or piri grows a "stream this ordered + list of multihashes as one body" capability. + +6. **MST for buckets, registry for buckets**: the registry + (bucket → root CID) is itself a `string → CID` map. We could + make it an MST too, store the registry MST in Forge, and have + only one mutable pointer (the registry MST root). Discussed + earlier; rejected for now because the registry needs SQL-style + transactional CAS that Forge doesn't provide. + +7. **Should the standalone ms3t repo at github.com/frrist/ms3t + continue to exist?** It has the same code (modulo imports) and + no consumer. The Disk and Guppy uploaders only make sense + there. 
+ +## Reading the code + +If you're new to ms3t and want to follow a request through: + +- **PUT**: `pkg/ms3t/server/handlers.go::putObject` → + `bucket.Service.PutObject` (in `pkg/ms3t/bucket/bucket.go`) → + `chunker.putBody` → `mst.Add` → `CARBuffer.Commit` → + `uploader.Internal.Submit` (in `pkg/ms3t/uploader/internal.go`) + → registry CAS + +- **GET**: `pkg/ms3t/server/handlers.go::getObject` → + `bucket.Service.GetObject` → `mst.Get` (every node fetched via + `blockstore.Forge.Get` in `pkg/ms3t/blockstore/forge.go`) → + manifest decoded → body chunks fetched the same way → streamed + to client + +- **Where things plug into sprue**: `internal/fx/ms3t.go`. This is + the only sprue-side file that knows about ms3t. + +- **The MST itself**: `pkg/ms3t/mst/`. This is a fork of the + atproto MST with relaxed key validation. Standalone, no + dependencies on the rest of ms3t. diff --git a/pkg/ms3t/blockstore/buffered.go b/pkg/ms3t/blockstore/buffered.go new file mode 100644 index 0000000..1b15a0c --- /dev/null +++ b/pkg/ms3t/blockstore/buffered.go @@ -0,0 +1,111 @@ +package blockstore + +import ( + "context" + "fmt" + "sync" + + "github.com/storacha/sprue/pkg/ms3t/uploader" + block "github.com/ipfs/go-block-format" + "github.com/ipfs/go-cid" + cbor "github.com/ipfs/go-ipld-cbor" +) + +// CARBuffer is a per-S3-op IpldBlockstore that captures every Put — body +// chunks, MST nodes, ObjectManifests — in memory. On Commit it submits +// the entire batch to the configured Uploader (which may flush +// immediately or buffer further) and then flushes the blocks to the +// underlying canonical store. +// +// Reads check the in-memory buffer first and fall through to the +// underlying store on miss. This lets the MST's GetPointer recompute +// path Put a node and immediately re-Read it during the same op. 
+// +// Single-shot per session: create a CARBuffer at the start of an S3 op, +// Put any number of blocks, then call Commit(root) exactly once on +// success or Discard on failure. +// +// Safe for concurrent reads but writes are serialized within one +// session — the bucket service holds a per-bucket mutex around the +// whole flow. +type CARBuffer struct { + underlying cbor.IpldBlockstore + uploader uploader.Uploader + + mu sync.RWMutex + blocks map[cid.Cid]block.Block + order []cid.Cid +} + +// NewCARBuffer constructs a per-op buffer backed by underlying for +// reads and Submitting to up at Commit. +func NewCARBuffer(underlying cbor.IpldBlockstore, up uploader.Uploader) *CARBuffer { + return &CARBuffer{ + underlying: underlying, + uploader: up, + blocks: map[cid.Cid]block.Block{}, + } +} + +func (b *CARBuffer) Get(ctx context.Context, c cid.Cid) (block.Block, error) { + b.mu.RLock() + blk, ok := b.blocks[c] + b.mu.RUnlock() + if ok { + return blk, nil + } + return b.underlying.Get(ctx, c) +} + +func (b *CARBuffer) Put(_ context.Context, blk block.Block) error { + b.mu.Lock() + defer b.mu.Unlock() + if _, exists := b.blocks[blk.Cid()]; !exists { + b.blocks[blk.Cid()] = blk + b.order = append(b.order, blk.Cid()) + } + return nil +} + +// Commit submits the buffered blocks to the Uploader rooted at root, +// then flushes them to the underlying canonical store. Empties the +// buffer on success. 
+func (b *CARBuffer) Commit(ctx context.Context, root cid.Cid) error { + b.mu.Lock() + defer b.mu.Unlock() + + if len(b.order) == 0 { + return nil + } + + blks := make([]block.Block, len(b.order)) + for i, c := range b.order { + blks[i] = b.blocks[c] + } + + if err := b.uploader.Submit(ctx, []cid.Cid{root}, blks); err != nil { + return fmt.Errorf("carbuffer: submit: %w", err) + } + + for _, blk := range blks { + if err := b.underlying.Put(ctx, blk); err != nil { + return fmt.Errorf("carbuffer: flush %s: %w", blk.Cid(), err) + } + } + + b.blocks = map[cid.Cid]block.Block{} + b.order = nil + return nil +} + +// Discard drops any buffered blocks without submitting or flushing +// them. Use this when the surrounding op has failed and in-flight +// blocks should be abandoned. +func (b *CARBuffer) Discard() { + b.mu.Lock() + defer b.mu.Unlock() + b.blocks = map[cid.Cid]block.Block{} + b.order = nil +} + +var _ cbor.IpldBlockstore = (*CARBuffer)(nil) diff --git a/pkg/ms3t/blockstore/forge.go b/pkg/ms3t/blockstore/forge.go new file mode 100644 index 0000000..fb2e69e --- /dev/null +++ b/pkg/ms3t/blockstore/forge.go @@ -0,0 +1,288 @@ +package blockstore + +import ( + "context" + "errors" + "fmt" + "io" + "net/http" + "net/url" + "time" + + block "github.com/ipfs/go-block-format" + "github.com/ipfs/go-cid" + cbor "github.com/ipfs/go-ipld-cbor" + captypes "github.com/storacha/go-libstoracha/capabilities/types" + "github.com/storacha/go-libstoracha/capabilities/assert" + contentcap "github.com/storacha/go-libstoracha/capabilities/space/content" + "github.com/storacha/go-libstoracha/failure" + rclient "github.com/storacha/go-ucanto/client/retrieval" + "github.com/storacha/go-ucanto/core/dag/blockstore" + "github.com/storacha/go-ucanto/core/delegation" + "github.com/storacha/go-ucanto/core/receipt" + "github.com/storacha/go-ucanto/core/result" + "github.com/storacha/go-ucanto/did" + "github.com/storacha/go-ucanto/principal" + "github.com/storacha/go-ucanto/ucan" + 
"github.com/storacha/guppy/pkg/client/locator" + indexclient "github.com/storacha/indexing-service/pkg/client" + "go.uber.org/zap" +) + +// Forge is a read-only IpldBlockstore that resolves CIDs through the +// Storacha indexing-service and fetches the underlying bytes via +// authorized UCAN-wrapped GETs against piri storage nodes. +// +// Used in ms3t's "no_cache" mode: every Get goes to the network. There +// is no in-process block cache; the only caching is the small +// metadata cache inside the IndexLocator (digest → location +// commitment), which exists per-Forge instance and resets on process +// restart. Block bytes always traverse the network. +// +// Put is a no-op so this type can be passed as the underlying for +// CARBuffer (whose Commit calls Put for each freshly-Submitted +// block). +type Forge struct { + locator locator.Locator + signer principal.Signer + spaceSigner principal.Signer + spaces []did.DID + logger *zap.Logger +} + +// ForgeConfig wires sprue's existing services into a read-only Forge +// blockstore. +type ForgeConfig struct { + // IndexerEndpoint is the indexing-service URL (cfg.Indexer.Endpoint). + IndexerEndpoint string + // IndexerDID is the indexing-service principal (cfg.Indexer.DID). + IndexerDID string + // Spaces scopes the locator queries; for ms3t this is the single + // space ms3t owns. + Spaces []did.DID + // Signer is sprue's upload-service identity. Used as the issuer of + // `space/content/retrieve` invocations against piri. + Signer principal.Signer + // SpaceSigner is the keypair of the space ms3t owns. Used to + // self-issue space/content/retrieve delegations. The chain is + // space → sprue → piri (with the sprue→piri hop being the actual + // retrieve invocation that piri authorizes). + SpaceSigner principal.Signer + // HTTPClient is used for the underlying indexer queries. piri + // retrievals use go-ucanto's retrieval client which manages its + // own HTTP. Optional; defaults to http.DefaultClient. 
+ HTTPClient *http.Client + // Logger is optional. + Logger *zap.Logger +} + +// NewForge constructs a Forge blockstore. Builds an indexing-service +// client and wraps it with guppy's IndexLocator. +func NewForge(cfg ForgeConfig) (*Forge, error) { + if cfg.IndexerEndpoint == "" { + return nil, errors.New("forge blockstore: indexer endpoint is required") + } + if cfg.IndexerDID == "" { + return nil, errors.New("forge blockstore: indexer DID is required") + } + if len(cfg.Spaces) == 0 { + return nil, errors.New("forge blockstore: at least one space is required") + } + if cfg.Signer == nil { + return nil, errors.New("forge blockstore: signer is required") + } + if cfg.SpaceSigner == nil { + return nil, errors.New("forge blockstore: space signer is required") + } + + endpointURL, err := url.Parse(cfg.IndexerEndpoint) + if err != nil { + return nil, fmt.Errorf("forge blockstore: parse indexer endpoint: %w", err) + } + indexerDID, err := did.Parse(cfg.IndexerDID) + if err != nil { + return nil, fmt.Errorf("forge blockstore: parse indexer DID: %w", err) + } + + httpc := cfg.HTTPClient + if httpc == nil { + httpc = http.DefaultClient + } + logger := cfg.Logger + if logger == nil { + logger = zap.NewNop() + } + + idxClient, err := indexclient.New(indexerDID, *endpointURL, indexclient.WithHTTPClient(httpc)) + if err != nil { + return nil, fmt.Errorf("forge blockstore: build indexing-service client: %w", err) + } + + authFn := newAuthorizeRetrieval(cfg.SpaceSigner, indexerDID) + loc := locator.NewIndexLocator(idxClient, authFn) + + return &Forge{ + locator: loc, + signer: cfg.Signer, + spaceSigner: cfg.SpaceSigner, + spaces: cfg.Spaces, + logger: logger, + }, nil +} + +// Get resolves the CID through the indexer and retrieves the +// underlying byte slice from piri via a UCAN-authorized +// `space/content/retrieve` invocation. The request is scoped to +// the inner block's offset/length within the containing CAR shard. 
+func (f *Forge) Get(ctx context.Context, c cid.Cid) (block.Block, error) { + locations, err := f.locator.Locate(ctx, f.spaces, c.Hash()) + if err != nil { + var nf locator.NotFoundError + if errors.As(err, &nf) { + return nil, ErrNotFound + } + return nil, fmt.Errorf("forge: locate %s: %w", c, err) + } + if len(locations) == 0 { + return nil, ErrNotFound + } + + loc := locations[0] + caveats, rerr := assert.LocationCaveatsReader.Read(loc.Commitment.Nb()) + if rerr != nil { + return nil, fmt.Errorf("forge: read location caveats for %s: %w", c, rerr) + } + if len(caveats.Location) == 0 { + return nil, fmt.Errorf("forge: empty location URL set for %s", c) + } + target := caveats.Location[0] + + // space scopes the retrieve capability. Fall back to our + // configured space if the commitment is the legacy form without + // a Space field. + space := caveats.Space + if space == (did.DID{}) { + space = f.spaces[0] + } + + // audience for the retrieve invocation is the storage provider + // that issued the commitment. + storageProvider, err := did.Parse(loc.Commitment.With()) + if err != nil { + return nil, fmt.Errorf("forge: parse storage provider DID: %w", err) + } + + // Self-issued retrieval proof: space → sprue. Per-call to keep + // the chain short-lived. 
+ retrievalProof, err := delegation.Delegate( + f.spaceSigner, + f.signer, + []ucan.Capability[ucan.NoCaveats]{ + ucan.NewCapability(contentcap.Retrieve.Can(), space.String(), ucan.NoCaveats{}), + }, + delegation.WithExpiration(int(time.Now().Add(60*time.Second).Unix())), + ) + if err != nil { + return nil, fmt.Errorf("forge: build retrieval proof: %w", err) + } + + rangeStart := loc.Position.Offset + rangeEnd := rangeStart + loc.Position.Length - 1 + + inv, err := contentcap.Retrieve.Invoke( + f.signer, // issuer = sprue + storageProvider, // audience = piri + space.String(), // with = space + contentcap.RetrieveCaveats{ + Blob: contentcap.BlobDigest{Digest: caveats.Content.Hash()}, + Range: contentcap.Range{Start: rangeStart, End: rangeEnd}, + }, + delegation.WithProof(delegation.FromDelegation(retrievalProof)), + ) + if err != nil { + return nil, fmt.Errorf("forge: build retrieve invocation: %w", err) + } + + conn, err := rclient.NewConnection(storageProvider, &target) + if err != nil { + return nil, fmt.Errorf("forge: build retrieval connection: %w", err) + } + + xres, hres, err := rclient.Execute(ctx, inv, conn) + if err != nil { + return nil, fmt.Errorf("forge: execute retrieve for %s: %w", c, err) + } + + rcptLink, ok := xres.Get(inv.Link()) + if !ok { + return nil, fmt.Errorf("forge: no receipt for retrieve of %s", c) + } + bs, err := blockstore.NewBlockReader(blockstore.WithBlocksIterator(xres.Blocks())) + if err != nil { + return nil, fmt.Errorf("forge: build block reader: %w", err) + } + anyRcpt, err := receipt.NewAnyReceipt(rcptLink, bs) + if err != nil { + return nil, fmt.Errorf("forge: build receipt: %w", err) + } + rcpt, err := receipt.Rebind[contentcap.RetrieveOk, failure.FailureModel]( + anyRcpt, contentcap.RetrieveOkType(), failure.FailureType(), captypes.Converters..., + ) + if err != nil { + return nil, fmt.Errorf("forge: rebind receipt: %w", err) + } + if _, err := result.Unwrap(result.MapError(rcpt.Out(), failure.FromFailureModel)); err != nil 
{ + return nil, fmt.Errorf("forge: retrieve %s: %w", c, err) + } + + body, err := io.ReadAll(hres.Body()) + if err != nil { + return nil, fmt.Errorf("forge: read retrieve body for %s: %w", c, err) + } + if uint64(len(body)) != loc.Position.Length { + return nil, fmt.Errorf("forge: %s short read: got %d bytes, want %d", + c, len(body), loc.Position.Length) + } + + return block.NewBlockWithCid(body, c) +} + +// Put is a no-op. CARBuffer.Commit calls Put on its underlying +// blockstore for every freshly-Submitted block; in no_cache mode we +// don't want to persist anything locally because the uploader has +// already shipped the data to Forge. +func (f *Forge) Put(_ context.Context, _ block.Block) error { + return nil +} + +// newAuthorizeRetrieval returns the AuthorizeRetrievalFunc the +// IndexLocator calls before each indexer query. The space signer +// (root authority) directly authorizes the indexer to fetch any +// blob in the space. NoCaveats means "no specific blob digest +// constraint" — the indexer pulls whichever index blob it needs +// to satisfy the lookup. +// +// Mirrors the pattern in +// github.com/storacha/guppy/cmd/retrieve.go::94. Difference: ms3t's +// "user" is itself, so the proof chain is one hop (space → indexer) +// rather than the typical (user → upload service → indexer). 
+func newAuthorizeRetrieval(spaceSigner principal.Signer, indexerDID did.DID) locator.AuthorizeRetrievalFunc { + return func(spaces []did.DID) (delegation.Delegation, error) { + caps := make([]ucan.Capability[ucan.NoCaveats], 0, len(spaces)) + for _, space := range spaces { + caps = append(caps, ucan.NewCapability( + contentcap.Retrieve.Can(), + space.String(), + ucan.NoCaveats{}, + )) + } + return delegation.Delegate( + spaceSigner, + indexerDID, + caps, + delegation.WithExpiration(int(time.Now().Add(60*time.Second).Unix())), + ) + } +} + +var _ cbor.IpldBlockstore = (*Forge)(nil) diff --git a/pkg/ms3t/blockstore/sqlite.go b/pkg/ms3t/blockstore/sqlite.go new file mode 100644 index 0000000..a1ee74b --- /dev/null +++ b/pkg/ms3t/blockstore/sqlite.go @@ -0,0 +1,69 @@ +// Package blockstore provides a SQLite-backed implementation of the +// go-ipld-cbor IpldBlockstore interface, used to persist MST nodes and +// object manifests as content-addressed blocks. +package blockstore + +import ( + "context" + "database/sql" + "errors" + "fmt" + + block "github.com/ipfs/go-block-format" + "github.com/ipfs/go-cid" + cbor "github.com/ipfs/go-ipld-cbor" +) + +// Schema is the DDL for the blocks table. Vanilla SQL so it works on SQLite, +// Postgres, etc. +const Schema = ` +CREATE TABLE IF NOT EXISTS blocks ( + cid BLOB PRIMARY KEY, + data BLOB NOT NULL +); +` + +// Store is a SQLite-backed IPLD blockstore. +type Store struct { + db *sql.DB +} + +// New wraps an open *sql.DB and ensures the schema exists. +func New(db *sql.DB) (*Store, error) { + if _, err := db.Exec(Schema); err != nil { + return nil, fmt.Errorf("blockstore: ensure schema: %w", err) + } + return &Store{db: db}, nil +} + +// Get fetches a block by CID. Returns ErrNotFound if absent. 
+func (s *Store) Get(ctx context.Context, c cid.Cid) (block.Block, error) { + var data []byte + err := s.db.QueryRowContext(ctx, + `SELECT data FROM blocks WHERE cid = ?`, + c.Bytes()).Scan(&data) + if errors.Is(err, sql.ErrNoRows) { + return nil, ErrNotFound + } + if err != nil { + return nil, fmt.Errorf("blockstore: get %s: %w", c, err) + } + return block.NewBlockWithCid(data, c) +} + +// Put writes a block. Idempotent: re-inserting the same CID is a no-op. +func (s *Store) Put(ctx context.Context, b block.Block) error { + _, err := s.db.ExecContext(ctx, + `INSERT INTO blocks (cid, data) VALUES (?, ?) ON CONFLICT (cid) DO NOTHING`, + b.Cid().Bytes(), b.RawData()) + if err != nil { + return fmt.Errorf("blockstore: put %s: %w", b.Cid(), err) + } + return nil +} + +// ErrNotFound is returned by Get when the requested CID is absent. +var ErrNotFound = errors.New("blockstore: block not found") + +// Compile-time assertion: *Store implements cbor.IpldBlockstore. +var _ cbor.IpldBlockstore = (*Store)(nil) diff --git a/pkg/ms3t/blockstore/walk.go b/pkg/ms3t/blockstore/walk.go new file mode 100644 index 0000000..2d42fbf --- /dev/null +++ b/pkg/ms3t/blockstore/walk.go @@ -0,0 +1,63 @@ +package blockstore + +import ( + "bytes" + "context" + "fmt" + + block "github.com/ipfs/go-block-format" + "github.com/ipfs/go-cid" + cbor "github.com/ipfs/go-ipld-cbor" + cbg "github.com/whyrusleeping/cbor-gen" +) + +// WalkReachable returns every block reachable from root in the given +// blockstore via IPLD links. DAG-CBOR blocks are scanned for child CIDs; +// raw blocks are leaves. +// +// Cycles are detected and not revisited. Block order is BFS by +// discovery — useful for CAR-friendly streaming (root first, then its +// direct children, etc.). +// +// Used by recovery: walking from a bucket's HEAD collects every +// structural block plus every body chunk reachable from any current +// ObjectManifest, which is exactly the set we want to ship to Forge. 
+func WalkReachable(ctx context.Context, bs cbor.IpldBlockstore, root cid.Cid) ([]block.Block, error) { + if !root.Defined() { + return nil, nil + } + + visited := map[cid.Cid]struct{}{} + var out []block.Block + queue := []cid.Cid{root} + + for len(queue) > 0 { + c := queue[0] + queue = queue[1:] + if _, seen := visited[c]; seen { + continue + } + visited[c] = struct{}{} + + blk, err := bs.Get(ctx, c) + if err != nil { + return nil, fmt.Errorf("walk %s: %w", c, err) + } + out = append(out, blk) + + // Only DAG-CBOR blocks have IPLD links to follow. Raw blocks + // (codec 0x55) are body chunks — leaves of the DAG. + if c.Prefix().Codec != cid.DagCBOR { + continue + } + err = cbg.ScanForLinks(bytes.NewReader(blk.RawData()), func(child cid.Cid) { + if _, seen := visited[child]; !seen { + queue = append(queue, child) + } + }) + if err != nil { + return nil, fmt.Errorf("scan links %s: %w", c, err) + } + } + return out, nil +} diff --git a/pkg/ms3t/bucket/bucket.go b/pkg/ms3t/bucket/bucket.go new file mode 100644 index 0000000..513545e --- /dev/null +++ b/pkg/ms3t/bucket/bucket.go @@ -0,0 +1,518 @@ +// Package bucket implements S3-style CRUD operations on top of the forked +// MST (one tree per bucket) and an IPLD blockstore that holds both the +// structural blocks (MST nodes, ObjectManifests) and the raw body chunks. +package bucket + +import ( + "context" + "errors" + "fmt" + "io" + "strings" + "sync" + "time" + + "github.com/ipfs/go-cid" + cbor "github.com/ipfs/go-ipld-cbor" + + "github.com/storacha/sprue/pkg/ms3t/blockstore" + "github.com/storacha/sprue/pkg/ms3t/mst" + "github.com/storacha/sprue/pkg/ms3t/registry" + "github.com/storacha/sprue/pkg/ms3t/uploader" +) + +// Service is the entry point for bucket operations. 
+type Service struct { + bs cbor.IpldBlockstore + cst cbor.IpldStore // long-lived, read-only over bs (for List/Head) + reg registry.Registry + uploader uploader.Uploader + + chunkSize int64 + + mu sync.Mutex + locks map[string]*sync.Mutex +} + +// Options configures a Service. Zero-valued options take sensible defaults. +type Options struct { + // ChunkSize is the body chunk size for new objects. 0 → DefaultChunkSize. + ChunkSize int64 + // Uploader receives a CAR per mutation (PutObject, DeleteObject) + // containing the structural blocks created by that op. nil → + // uploader.Noop (CARs are dropped on the floor). + Uploader uploader.Uploader +} + +// New wires the dependencies into a Service. The blockstore can be +// any cbor.IpldBlockstore implementation; sprue's fx wiring chooses +// either a SQLite-backed local store (default) or a Forge-backed +// pass-through store (no_cache mode). +func New(bs cbor.IpldBlockstore, reg registry.Registry, opt Options) *Service { + cs := opt.ChunkSize + if cs <= 0 { + cs = DefaultChunkSize + } + up := opt.Uploader + if up == nil { + up = uploader.Noop{} + } + return &Service{ + bs: bs, + cst: mst.CborStore(bs), + reg: reg, + uploader: up, + chunkSize: cs, + locks: map[string]*sync.Mutex{}, + } +} + +func (s *Service) bucketLock(name string) *sync.Mutex { + s.mu.Lock() + defer s.mu.Unlock() + if m, ok := s.locks[name]; ok { + return m + } + m := &sync.Mutex{} + s.locks[name] = m + return m +} + +var ( + ErrBucketExists = registry.ErrExists + ErrBucketNotFound = registry.ErrNotFound + ErrObjectNotFound = errors.New("bucket: object not found") + ErrInvalidKey = errors.New("bucket: invalid object key") + ErrInvalidBucket = errors.New("bucket: invalid bucket name") + ErrBucketNotEmpty = errors.New("bucket: bucket not empty") + ErrInvalidRange = errors.New("bucket: invalid range") +) + +// === Bucket lifecycle === + +func (s *Service) CreateBucket(ctx context.Context, name string) error { + if !validBucketName(name) { + return 
ErrInvalidBucket + } + return s.reg.Create(ctx, name, time.Now().Unix()) +} + +func (s *Service) ListBuckets(ctx context.Context) ([]*registry.State, error) { + return s.reg.List(ctx) +} + +// === Forge sync (recovery + shutdown) === + +// Recover ensures every bucket's current root has been shipped to the +// Uploader. For each bucket, if the persisted ForgeRoot does not equal +// the current Root, walks the entire DAG reachable from Root, submits +// the blocks, flushes the Uploader, and advances ForgeRoot. +// +// Intended to be called once at startup, before the HTTP listener +// begins serving. Idempotent: if everything is already in sync, it's +// a fast scan and a no-op flush. +func (s *Service) Recover(ctx context.Context) error { + states, err := s.reg.List(ctx) + if err != nil { + return fmt.Errorf("recover: list buckets: %w", err) + } + + var dirty []*registry.State + for _, st := range states { + if !st.Root.Defined() { + continue + } + if st.ForgeRoot.Defined() && st.ForgeRoot.Equals(st.Root) { + continue + } + blocks, err := blockstore.WalkReachable(ctx, s.bs, st.Root) + if err != nil { + return fmt.Errorf("recover %q: walk: %w", st.Name, err) + } + if len(blocks) == 0 { + continue + } + if err := s.uploader.Submit(ctx, []cid.Cid{st.Root}, blocks); err != nil { + return fmt.Errorf("recover %q: submit: %w", st.Name, err) + } + dirty = append(dirty, st) + } + + if len(dirty) == 0 { + return nil + } + if err := s.uploader.Flush(ctx); err != nil { + return fmt.Errorf("recover: flush: %w", err) + } + for _, st := range dirty { + if err := s.reg.SetForgeRoot(ctx, st.Name, st.Root); err != nil { + return fmt.Errorf("recover %q: set forge root: %w", st.Name, err) + } + } + return nil +} + +// Shutdown cleanly drains the Uploader and advances ForgeRoot to the +// current Root for every bucket. After Shutdown returns successfully, +// a subsequent Recover at the next startup is a no-op. 
+func (s *Service) Shutdown(ctx context.Context) error { + if err := s.uploader.Close(ctx); err != nil { + return fmt.Errorf("shutdown: close uploader: %w", err) + } + states, err := s.reg.List(ctx) + if err != nil { + return fmt.Errorf("shutdown: list buckets: %w", err) + } + for _, st := range states { + if !st.Root.Defined() { + continue + } + if st.ForgeRoot.Defined() && st.ForgeRoot.Equals(st.Root) { + continue + } + if err := s.reg.SetForgeRoot(ctx, st.Name, st.Root); err != nil { + return fmt.Errorf("shutdown %q: set forge root: %w", st.Name, err) + } + } + return nil +} + +func (s *Service) DeleteBucket(ctx context.Context, name string) error { + lock := s.bucketLock(name) + lock.Lock() + defer lock.Unlock() + + st, err := s.reg.Get(ctx, name) + if err != nil { + return err + } + if st.Root.Defined() { + t := mst.LoadMST(s.cst, st.Root) + var seen bool + walkErr := t.WalkLeavesFromNocache(ctx, "", func(string, cid.Cid) error { + seen = true + return mst.ErrStopWalk + }) + if walkErr != nil { + return fmt.Errorf("bucket: scan empty: %w", walkErr) + } + if seen { + return ErrBucketNotEmpty + } + } + return s.reg.Delete(ctx, name) +} + +// === Object operations === + +// PutObject stores body under (bucket, key), creating or replacing as +// needed. Body bytes are chunked into raw IPLD blocks (written directly +// to the underlying blockstore). Structural blocks (manifest + mutated +// MST nodes) are captured into a per-op CARBuffer and emitted as a +// single CAR via the configured Uploader at commit time. +func (s *Service) PutObject(ctx context.Context, bucket, key string, body io.Reader, contentType string) (*ObjectManifest, error) { + if !mst.IsValidKey(key) { + return nil, ErrInvalidKey + } + + lock := s.bucketLock(bucket) + lock.Lock() + defer lock.Unlock() + + st, err := s.reg.Get(ctx, bucket) + if err != nil { + return nil, err + } + + // All blocks for this PUT — body chunks, manifest, MST mutation path — + // flow through the same per-op CARBuffer. 
The Uploader receives one + // Submit per S3 op containing every block reachable from the new root, + // individually addressable inside the resulting CAR via the indexer. + buf := blockstore.NewCARBuffer(s.bs, s.uploader) + + bodyRec, err := putBody(ctx, buf, body, s.chunkSize) + if err != nil { + buf.Discard() + return nil, fmt.Errorf("bucket: chunk body: %w", err) + } + + if contentType == "" { + contentType = "application/octet-stream" + } + mf := &ObjectManifest{ + Key: key, + ContentType: contentType, + Created: time.Now().Unix(), + Body: bodyRec, + } + + opCst := mst.CborStore(buf) + + mfCid, err := opCst.Put(ctx, mf) + if err != nil { + buf.Discard() + return nil, fmt.Errorf("bucket: manifest put: %w", err) + } + + t := loadOrEmpty(opCst, st.Root) + t2, err := t.Add(ctx, key, mfCid, -1) + if errors.Is(err, mst.ErrAlreadyExists) { + t2, err = t.Update(ctx, key, mfCid) + } + if err != nil { + buf.Discard() + return nil, fmt.Errorf("bucket: mst write: %w", err) + } + + newRoot, err := t2.GetPointer(ctx) + if err != nil { + buf.Discard() + return nil, fmt.Errorf("bucket: mst pointer: %w", err) + } + + if err := buf.Commit(ctx, newRoot); err != nil { + return nil, fmt.Errorf("bucket: car commit: %w", err) + } + + if err := s.reg.CASRoot(ctx, bucket, st.Root, newRoot); err != nil { + return nil, fmt.Errorf("bucket: advance root: %w", err) + } + return mf, nil +} + +// GetObject opens the body and returns the manifest. Caller must Close. +// If rng is non-nil, returns a reader over the requested byte range. 
+func (s *Service) GetObject(ctx context.Context, bucket, key string, rng *Range) (io.ReadCloser, *ObjectManifest, error) { + mf, err := s.HeadObject(ctx, bucket, key) + if err != nil { + return nil, nil, err + } + if rng == nil { + return openBody(ctx, s.bs, mf.Body), mf, nil + } + if err := rng.resolve(mf.Body.Size); err != nil { + return nil, mf, err + } + return openBodyRange(ctx, s.bs, mf.Body, rng.Start, rng.End), mf, nil +} + +// HeadObject returns just the manifest. +func (s *Service) HeadObject(ctx context.Context, bucket, key string) (*ObjectManifest, error) { + st, err := s.reg.Get(ctx, bucket) + if err != nil { + return nil, err + } + if !st.Root.Defined() { + return nil, ErrObjectNotFound + } + t := mst.LoadMST(s.cst, st.Root) + mfCid, err := t.Get(ctx, key) + if errors.Is(err, mst.ErrNotFound) { + return nil, ErrObjectNotFound + } + if err != nil { + return nil, fmt.Errorf("bucket: mst get: %w", err) + } + var mf ObjectManifest + if err := s.cst.Get(ctx, mfCid, &mf); err != nil { + return nil, fmt.Errorf("bucket: manifest get: %w", err) + } + return &mf, nil +} + +// DeleteObject removes a key from the bucket. Missing keys return nil +// (matching S3's idempotent DELETE semantics). Body chunks are NOT +// deleted from the blockstore; GC is a future, separate pass over live +// manifests. 
+func (s *Service) DeleteObject(ctx context.Context, bucket, key string) error { + lock := s.bucketLock(bucket) + lock.Lock() + defer lock.Unlock() + + st, err := s.reg.Get(ctx, bucket) + if err != nil { + return err + } + if !st.Root.Defined() { + return nil + } + + buf := blockstore.NewCARBuffer(s.bs, s.uploader) + opCst := mst.CborStore(buf) + + t := mst.LoadMST(opCst, st.Root) + t2, err := t.Delete(ctx, key) + if errors.Is(err, mst.ErrNotFound) { + buf.Discard() + return nil + } + if err != nil { + buf.Discard() + return fmt.Errorf("bucket: mst delete: %w", err) + } + + newRoot, err := t2.GetPointer(ctx) + if err != nil { + buf.Discard() + return fmt.Errorf("bucket: mst pointer: %w", err) + } + if err := buf.Commit(ctx, newRoot); err != nil { + return fmt.Errorf("bucket: car commit: %w", err) + } + if err := s.reg.CASRoot(ctx, bucket, st.Root, newRoot); err != nil { + return fmt.Errorf("bucket: advance root: %w", err) + } + return nil +} + +// === Range support === + +// Range describes an inclusive byte range, matching HTTP Range semantics. +// +// To support the open-ended ("bytes=N-") and suffix ("bytes=-N") forms +// without forcing the HTTP layer to do a separate HEAD before the GET, +// callers may set sentinel values: +// - Start = -1 means "the last End bytes" (suffix form) +// - End = -1 means "from Start to the end of the object" +// +// resolve() is called by the service once the body size is known. +type Range struct { + Start int64 + End int64 +} + +// resolve fills in any sentinel values (-1) using size and validates the +// resulting range. Returns ErrInvalidRange if the range is unsatisfiable. 
+func (r *Range) resolve(size int64) error { + if size <= 0 { + return ErrInvalidRange + } + switch { + case r.Start < 0 && r.End >= 0: + // suffix: last End bytes + if r.End == 0 { + return ErrInvalidRange + } + if r.End > size { + r.End = size + } + r.Start = size - r.End + r.End = size - 1 + case r.Start >= 0 && r.End < 0: + // open-ended + r.End = size - 1 + } + if r.Start < 0 || r.End < r.Start || r.End >= size { + return ErrInvalidRange + } + return nil +} + +// === Listing === + +type ListResult struct { + Objects []*ObjectManifest + CommonPrefixes []string + Truncated bool + NextToken string +} + +type ListOptions struct { + Prefix string + Delimiter string + StartAfter string + MaxKeys int +} + +const defaultMaxKeys = 1000 + +func (s *Service) List(ctx context.Context, bucket string, opt ListOptions) (*ListResult, error) { + if opt.MaxKeys <= 0 { + opt.MaxKeys = defaultMaxKeys + } + + st, err := s.reg.Get(ctx, bucket) + if err != nil { + return nil, err + } + res := &ListResult{} + if !st.Root.Defined() { + return res, nil + } + + t := mst.LoadMST(s.cst, st.Root) + + from := opt.Prefix + if opt.StartAfter != "" && opt.StartAfter > from { + from = opt.StartAfter + "\x01" + } + + seenPrefix := map[string]struct{}{} + walkErr := t.WalkLeavesFromNocache(ctx, from, func(k string, mfCid cid.Cid) error { + if opt.Prefix != "" && !strings.HasPrefix(k, opt.Prefix) { + return mst.ErrStopWalk + } + + if opt.Delimiter != "" { + tail := k[len(opt.Prefix):] + if i := strings.Index(tail, opt.Delimiter); i >= 0 { + cp := opt.Prefix + tail[:i+len(opt.Delimiter)] + if _, dup := seenPrefix[cp]; !dup { + seenPrefix[cp] = struct{}{} + res.CommonPrefixes = append(res.CommonPrefixes, cp) + if len(res.Objects)+len(res.CommonPrefixes) >= opt.MaxKeys { + res.Truncated = true + res.NextToken = cp + return mst.ErrStopWalk + } + } + return nil + } + } + + var mf ObjectManifest + if err := s.cst.Get(ctx, mfCid, &mf); err != nil { + return fmt.Errorf("manifest get %s: %w", mfCid, err) + 
} + res.Objects = append(res.Objects, &mf) + + if len(res.Objects)+len(res.CommonPrefixes) >= opt.MaxKeys { + res.Truncated = true + res.NextToken = k + return mst.ErrStopWalk + } + return nil + }) + if walkErr != nil { + return nil, fmt.Errorf("bucket: walk: %w", walkErr) + } + return res, nil +} + +// === Internal helpers === + +func loadOrEmpty(cst cbor.IpldStore, root cid.Cid) *mst.MerkleSearchTree { + if root.Defined() { + return mst.LoadMST(cst, root) + } + return mst.NewEmptyMST(cst) +} + +func validBucketName(s string) bool { + if len(s) < 3 || len(s) > 63 { + return false + } + for i, r := range s { + switch { + case r >= 'a' && r <= 'z': + case r >= '0' && r <= '9': + case r == '-' || r == '.': + if i == 0 { + return false + } + default: + return false + } + } + return true +} diff --git a/pkg/ms3t/bucket/cbor_gen.go b/pkg/ms3t/bucket/cbor_gen.go new file mode 100644 index 0000000..3b35d4f --- /dev/null +++ b/pkg/ms3t/bucket/cbor_gen.go @@ -0,0 +1,507 @@ +// Code generated by github.com/whyrusleeping/cbor-gen. DO NOT EDIT. 
+ +package bucket + +import ( + "fmt" + "io" + "math" + "sort" + + cid "github.com/ipfs/go-cid" + cbg "github.com/whyrusleeping/cbor-gen" + xerrors "golang.org/x/xerrors" +) + +var _ = xerrors.Errorf +var _ = cid.Undef +var _ = math.E +var _ = sort.Sort + +func (t *ObjectManifest) MarshalCBOR(w io.Writer) error { + if t == nil { + _, err := w.Write(cbg.CborNull) + return err + } + + cw := cbg.NewCborWriter(w) + + if _, err := cw.Write([]byte{164}); err != nil { + return err + } + + // t.Body (bucket.Body) (struct) + if len("b") > 1000000 { + return xerrors.Errorf("Value in field \"b\" was too long") + } + + if err := cw.WriteMajorTypeHeader(cbg.MajTextString, uint64(len("b"))); err != nil { + return err + } + if _, err := cw.WriteString(string("b")); err != nil { + return err + } + + if err := t.Body.MarshalCBOR(cw); err != nil { + return err + } + + // t.Key (string) (string) + if len("k") > 1000000 { + return xerrors.Errorf("Value in field \"k\" was too long") + } + + if err := cw.WriteMajorTypeHeader(cbg.MajTextString, uint64(len("k"))); err != nil { + return err + } + if _, err := cw.WriteString(string("k")); err != nil { + return err + } + + if len(t.Key) > 1000000 { + return xerrors.Errorf("Value in field t.Key was too long") + } + + if err := cw.WriteMajorTypeHeader(cbg.MajTextString, uint64(len(t.Key))); err != nil { + return err + } + if _, err := cw.WriteString(string(t.Key)); err != nil { + return err + } + + // t.Created (int64) (int64) + if len("t") > 1000000 { + return xerrors.Errorf("Value in field \"t\" was too long") + } + + if err := cw.WriteMajorTypeHeader(cbg.MajTextString, uint64(len("t"))); err != nil { + return err + } + if _, err := cw.WriteString(string("t")); err != nil { + return err + } + + if t.Created >= 0 { + if err := cw.WriteMajorTypeHeader(cbg.MajUnsignedInt, uint64(t.Created)); err != nil { + return err + } + } else { + if err := cw.WriteMajorTypeHeader(cbg.MajNegativeInt, uint64(-t.Created-1)); err != nil { + return err + } + } + 
+ // t.ContentType (string) (string) + if len("ct") > 1000000 { + return xerrors.Errorf("Value in field \"ct\" was too long") + } + + if err := cw.WriteMajorTypeHeader(cbg.MajTextString, uint64(len("ct"))); err != nil { + return err + } + if _, err := cw.WriteString(string("ct")); err != nil { + return err + } + + if len(t.ContentType) > 1000000 { + return xerrors.Errorf("Value in field t.ContentType was too long") + } + + if err := cw.WriteMajorTypeHeader(cbg.MajTextString, uint64(len(t.ContentType))); err != nil { + return err + } + if _, err := cw.WriteString(string(t.ContentType)); err != nil { + return err + } + return nil +} + +func (t *ObjectManifest) UnmarshalCBOR(r io.Reader) (err error) { + *t = ObjectManifest{} + + cr := cbg.NewCborReader(r) + + maj, extra, err := cr.ReadHeader() + if err != nil { + return err + } + defer func() { + if err == io.EOF { + err = io.ErrUnexpectedEOF + } + }() + + if maj != cbg.MajMap { + return fmt.Errorf("cbor input should be of type map") + } + + if extra > cbg.MaxLength { + return fmt.Errorf("ObjectManifest: map struct too large (%d)", extra) + } + + n := extra + + nameBuf := make([]byte, 2) + for i := uint64(0); i < n; i++ { + nameLen, ok, err := cbg.ReadFullStringIntoBuf(cr, nameBuf, 1000000) + if err != nil { + return err + } + + if !ok { + // Field doesn't exist on this type, so ignore it + if err := cbg.ScanForLinks(cr, func(cid.Cid) {}); err != nil { + return err + } + continue + } + + switch string(nameBuf[:nameLen]) { + // t.Body (bucket.Body) (struct) + case "b": + + { + + if err := t.Body.UnmarshalCBOR(cr); err != nil { + return xerrors.Errorf("unmarshaling t.Body: %w", err) + } + + } + // t.Key (string) (string) + case "k": + + { + sval, err := cbg.ReadStringWithMax(cr, 1000000) + if err != nil { + return err + } + + t.Key = string(sval) + } + // t.Created (int64) (int64) + case "t": + { + maj, extra, err := cr.ReadHeader() + if err != nil { + return err + } + var extraI int64 + switch maj { + case 
cbg.MajUnsignedInt: + extraI = int64(extra) + if extraI < 0 { + return fmt.Errorf("int64 positive overflow") + } + case cbg.MajNegativeInt: + extraI = int64(extra) + if extraI < 0 { + return fmt.Errorf("int64 negative overflow") + } + extraI = -1 - extraI + default: + return fmt.Errorf("wrong type for int64 field: %d", maj) + } + + t.Created = int64(extraI) + } + // t.ContentType (string) (string) + case "ct": + + { + sval, err := cbg.ReadStringWithMax(cr, 1000000) + if err != nil { + return err + } + + t.ContentType = string(sval) + } + + default: + // Field doesn't exist on this type, so ignore it + if err := cbg.ScanForLinks(r, func(cid.Cid) {}); err != nil { + return err + } + } + } + + return nil +} +func (t *Body) MarshalCBOR(w io.Writer) error { + if t == nil { + _, err := w.Write(cbg.CborNull) + return err + } + + cw := cbg.NewCborWriter(w) + + if _, err := cw.Write([]byte{164}); err != nil { + return err + } + + // t.Chunks ([]cid.Cid) (slice) + if len("c") > 1000000 { + return xerrors.Errorf("Value in field \"c\" was too long") + } + + if err := cw.WriteMajorTypeHeader(cbg.MajTextString, uint64(len("c"))); err != nil { + return err + } + if _, err := cw.WriteString(string("c")); err != nil { + return err + } + + if len(t.Chunks) > 8192 { + return xerrors.Errorf("Slice value in field t.Chunks was too long") + } + + if err := cw.WriteMajorTypeHeader(cbg.MajArray, uint64(len(t.Chunks))); err != nil { + return err + } + for _, v := range t.Chunks { + + if err := cbg.WriteCid(cw, v); err != nil { + return xerrors.Errorf("failed to write cid field v: %w", err) + } + + } + + // t.SHA256 ([]uint8) (slice) + if len("h") > 1000000 { + return xerrors.Errorf("Value in field \"h\" was too long") + } + + if err := cw.WriteMajorTypeHeader(cbg.MajTextString, uint64(len("h"))); err != nil { + return err + } + if _, err := cw.WriteString(string("h")); err != nil { + return err + } + + if len(t.SHA256) > 2097152 { + return xerrors.Errorf("Byte array in field t.SHA256 was 
too long") + } + + if err := cw.WriteMajorTypeHeader(cbg.MajByteString, uint64(len(t.SHA256))); err != nil { + return err + } + + if _, err := cw.Write(t.SHA256); err != nil { + return err + } + + // t.Size (int64) (int64) + if len("s") > 1000000 { + return xerrors.Errorf("Value in field \"s\" was too long") + } + + if err := cw.WriteMajorTypeHeader(cbg.MajTextString, uint64(len("s"))); err != nil { + return err + } + if _, err := cw.WriteString(string("s")); err != nil { + return err + } + + if t.Size >= 0 { + if err := cw.WriteMajorTypeHeader(cbg.MajUnsignedInt, uint64(t.Size)); err != nil { + return err + } + } else { + if err := cw.WriteMajorTypeHeader(cbg.MajNegativeInt, uint64(-t.Size-1)); err != nil { + return err + } + } + + // t.ChunkSize (int64) (int64) + if len("cs") > 1000000 { + return xerrors.Errorf("Value in field \"cs\" was too long") + } + + if err := cw.WriteMajorTypeHeader(cbg.MajTextString, uint64(len("cs"))); err != nil { + return err + } + if _, err := cw.WriteString(string("cs")); err != nil { + return err + } + + if t.ChunkSize >= 0 { + if err := cw.WriteMajorTypeHeader(cbg.MajUnsignedInt, uint64(t.ChunkSize)); err != nil { + return err + } + } else { + if err := cw.WriteMajorTypeHeader(cbg.MajNegativeInt, uint64(-t.ChunkSize-1)); err != nil { + return err + } + } + + return nil +} + +func (t *Body) UnmarshalCBOR(r io.Reader) (err error) { + *t = Body{} + + cr := cbg.NewCborReader(r) + + maj, extra, err := cr.ReadHeader() + if err != nil { + return err + } + defer func() { + if err == io.EOF { + err = io.ErrUnexpectedEOF + } + }() + + if maj != cbg.MajMap { + return fmt.Errorf("cbor input should be of type map") + } + + if extra > cbg.MaxLength { + return fmt.Errorf("Body: map struct too large (%d)", extra) + } + + n := extra + + nameBuf := make([]byte, 2) + for i := uint64(0); i < n; i++ { + nameLen, ok, err := cbg.ReadFullStringIntoBuf(cr, nameBuf, 1000000) + if err != nil { + return err + } + + if !ok { + // Field doesn't exist on this 
type, so ignore it + if err := cbg.ScanForLinks(cr, func(cid.Cid) {}); err != nil { + return err + } + continue + } + + switch string(nameBuf[:nameLen]) { + // t.Chunks ([]cid.Cid) (slice) + case "c": + + maj, extra, err = cr.ReadHeader() + if err != nil { + return err + } + + if extra > 8192 { + return fmt.Errorf("t.Chunks: array too large (%d)", extra) + } + + if maj != cbg.MajArray { + return fmt.Errorf("expected cbor array") + } + + if extra > 0 { + t.Chunks = make([]cid.Cid, extra) + } + + for i := 0; i < int(extra); i++ { + { + var maj byte + var extra uint64 + var err error + _ = maj + _ = extra + _ = err + + { + + c, err := cbg.ReadCid(cr) + if err != nil { + return xerrors.Errorf("failed to read cid field t.Chunks[i]: %w", err) + } + + t.Chunks[i] = c + + } + + } + } + // t.SHA256 ([]uint8) (slice) + case "h": + + maj, extra, err = cr.ReadHeader() + if err != nil { + return err + } + + if extra > 2097152 { + return fmt.Errorf("t.SHA256: byte array too large (%d)", extra) + } + if maj != cbg.MajByteString { + return fmt.Errorf("expected byte array") + } + + if extra > 0 { + t.SHA256 = make([]uint8, extra) + } + + if _, err := io.ReadFull(cr, t.SHA256); err != nil { + return err + } + + // t.Size (int64) (int64) + case "s": + { + maj, extra, err := cr.ReadHeader() + if err != nil { + return err + } + var extraI int64 + switch maj { + case cbg.MajUnsignedInt: + extraI = int64(extra) + if extraI < 0 { + return fmt.Errorf("int64 positive overflow") + } + case cbg.MajNegativeInt: + extraI = int64(extra) + if extraI < 0 { + return fmt.Errorf("int64 negative overflow") + } + extraI = -1 - extraI + default: + return fmt.Errorf("wrong type for int64 field: %d", maj) + } + + t.Size = int64(extraI) + } + // t.ChunkSize (int64) (int64) + case "cs": + { + maj, extra, err := cr.ReadHeader() + if err != nil { + return err + } + var extraI int64 + switch maj { + case cbg.MajUnsignedInt: + extraI = int64(extra) + if extraI < 0 { + return fmt.Errorf("int64 positive 
overflow") + } + case cbg.MajNegativeInt: + extraI = int64(extra) + if extraI < 0 { + return fmt.Errorf("int64 negative overflow") + } + extraI = -1 - extraI + default: + return fmt.Errorf("wrong type for int64 field: %d", maj) + } + + t.ChunkSize = int64(extraI) + } + + default: + // Field doesn't exist on this type, so ignore it + if err := cbg.ScanForLinks(r, func(cid.Cid) {}); err != nil { + return err + } + } + } + + return nil +} diff --git a/pkg/ms3t/bucket/chunker.go b/pkg/ms3t/bucket/chunker.go new file mode 100644 index 0000000..25a096e --- /dev/null +++ b/pkg/ms3t/bucket/chunker.go @@ -0,0 +1,177 @@ +package bucket + +import ( + "context" + "crypto/sha256" + "errors" + "fmt" + "io" + + block "github.com/ipfs/go-block-format" + "github.com/ipfs/go-cid" + cbor "github.com/ipfs/go-ipld-cbor" + mh "github.com/multiformats/go-multihash" +) + +// DefaultChunkSize is the chunk size used when callers don't supply one. +// 1 MiB matches typical UnixFS chunking and balances per-blob piri +// overhead against range-read granularity. +const DefaultChunkSize int64 = 1 << 20 + +// rawBlockPrefix produces CIDs for body chunks: CIDv1, raw codec (0x55), +// sha256 multihash. Chunks are opaque bytes — no IPLD links — so the raw +// codec is the natural fit. +var rawBlockPrefix = cid.Prefix{ + Version: 1, + Codec: cid.Raw, + MhType: mh.SHA2_256, + MhLength: -1, +} + +// putBody reads body bytes from r, splits at chunkSize, writes each chunk +// as a raw IPLD block to bs, and returns a Body record. The body's full +// sha256 is computed once during chunking and stored on the Body for use +// as the ETag. 
+func putBody(ctx context.Context, bs cbor.IpldBlockstore, r io.Reader, chunkSize int64) (Body, error) { + if chunkSize <= 0 { + chunkSize = DefaultChunkSize + } + + buf := make([]byte, chunkSize) + bodyHasher := sha256.New() + var chunks []cid.Cid + var total int64 + + for { + n, err := io.ReadFull(r, buf) + if n > 0 { + chunk := buf[:n] + bodyHasher.Write(chunk) + c, perr := putRawBlock(ctx, bs, chunk) + if perr != nil { + return Body{}, fmt.Errorf("put chunk: %w", perr) + } + chunks = append(chunks, c) + total += int64(n) + } + if err == nil { + continue + } + if errors.Is(err, io.EOF) || errors.Is(err, io.ErrUnexpectedEOF) { + break + } + return Body{}, fmt.Errorf("read body: %w", err) + } + + return Body{ + Size: total, + ChunkSize: chunkSize, + Chunks: chunks, + SHA256: bodyHasher.Sum(nil), + }, nil +} + +func putRawBlock(ctx context.Context, bs cbor.IpldBlockstore, data []byte) (cid.Cid, error) { + c, err := rawBlockPrefix.Sum(data) + if err != nil { + return cid.Undef, err + } + blk, err := block.NewBlockWithCid(data, c) + if err != nil { + return cid.Undef, err + } + if err := bs.Put(ctx, blk); err != nil { + return cid.Undef, err + } + return c, nil +} + +// openBody returns a reader over the full body. +func openBody(ctx context.Context, bs cbor.IpldBlockstore, body Body) io.ReadCloser { + return &bodyReader{ctx: ctx, bs: bs, body: body, end: body.Size - 1} +} + +// openBodyRange returns a reader over [start, end] inclusive of the body. +// Caller must ensure 0 <= start <= end <= Size-1. +func openBodyRange(ctx context.Context, bs cbor.IpldBlockstore, body Body, start, end int64) io.ReadCloser { + cs := body.ChunkSize + startChunk := int(start / cs) + startOffset := start % cs + return &bodyReader{ + ctx: ctx, + bs: bs, + body: body, + nextChunk: startChunk, + startOff: startOffset, + pos: start, + end: end, + havePartial: true, + } +} + +// bodyReader streams chunks lazily. 
It supports both whole-body and ranged +// reads via the same loop — only the initial offset and the inclusive end +// position differ. +type bodyReader struct { + ctx context.Context + bs cbor.IpldBlockstore + body Body + + nextChunk int + startOff int64 // offset into the first chunk we read + havePartial bool // whether startOff still applies to the next chunk read + + cur []byte // currently materialized chunk bytes + curOff int // read position within cur + + pos int64 // current absolute byte position (next byte to return) + end int64 // last byte to return (inclusive) + err error +} + +func (br *bodyReader) Read(p []byte) (int, error) { + if br.err != nil { + return 0, br.err + } + if br.pos > br.end { + br.err = io.EOF + return 0, io.EOF + } + + if br.cur == nil || br.curOff >= len(br.cur) { + if br.nextChunk >= len(br.body.Chunks) { + br.err = io.EOF + return 0, io.EOF + } + blk, err := br.bs.Get(br.ctx, br.body.Chunks[br.nextChunk]) + if err != nil { + br.err = fmt.Errorf("read chunk %d: %w", br.nextChunk, err) + return 0, br.err + } + br.cur = blk.RawData() + br.curOff = 0 + br.nextChunk++ + if br.havePartial { + br.curOff = int(br.startOff) + br.havePartial = false + } + } + + // Don't read past the inclusive end position. + remaining := br.end - br.pos + 1 + available := int64(len(br.cur) - br.curOff) + want := int64(len(p)) + if want > available { + want = available + } + if want > remaining { + want = remaining + } + + n := copy(p[:want], br.cur[br.curOff:br.curOff+int(want)]) + br.curOff += n + br.pos += int64(n) + return n, nil +} + +func (br *bodyReader) Close() error { return nil } diff --git a/pkg/ms3t/bucket/manifest.go b/pkg/ms3t/bucket/manifest.go new file mode 100644 index 0000000..27756f0 --- /dev/null +++ b/pkg/ms3t/bucket/manifest.go @@ -0,0 +1,26 @@ +package bucket + +import "github.com/ipfs/go-cid" + +// ObjectManifest is the per-object metadata record stored as a CBOR block +// in the IPLD blockstore. 
The MST leaf for an object key points at this +// record's CID. The body bytes themselves live as raw IPLD blocks (codec +// 0x55) addressed by sha256 multihash; this manifest holds the ordered +// list of chunk CIDs. +type ObjectManifest struct { + Key string `cborgen:"k"` + ContentType string `cborgen:"ct"` + Created int64 `cborgen:"t"` + Body Body `cborgen:"b"` +} + +// Body describes how the object's bytes are split into content-addressed +// chunks. ChunkSize is fixed across the object's chunks; the last chunk +// may be shorter than ChunkSize. Range arithmetic is direct: byte N lives +// in chunk index N/ChunkSize at offset N%ChunkSize. +type Body struct { + Size int64 `cborgen:"s"` + ChunkSize int64 `cborgen:"cs"` + Chunks []cid.Cid `cborgen:"c"` + SHA256 []byte `cborgen:"h"` // full-body sha256, for ETag +} diff --git a/pkg/ms3t/cars/encoder.go b/pkg/ms3t/cars/encoder.go new file mode 100644 index 0000000..11bdcdc --- /dev/null +++ b/pkg/ms3t/cars/encoder.go @@ -0,0 +1,164 @@ +// Package cars writes CAR v1 (Content Addressable aRchive) files. +// +// The format is intentionally simple: +// +// [varint: header_len][DAG-CBOR header bytes] +// [varint: frame_len][CID bytes][block bytes] +// [varint: frame_len][CID bytes][block bytes] +// ... +// +// Header is `{ "roots": [...], "version": 1 }` in DAG-CBOR +// (deterministic key order: by length, then bytewise — "roots" before +// "version"). +// +// Each block frame's varint length covers the CID bytes plus the raw +// block bytes that follow. +package cars + +import ( + "bytes" + "encoding/binary" + "fmt" + "io" + + block "github.com/ipfs/go-block-format" + "github.com/ipfs/go-cid" + cbg "github.com/whyrusleeping/cbor-gen" +) + +// BlockPosition records where a block's raw payload bytes live within +// an encoded CAR. Offset and Length are measured against the **block +// data**, NOT the frame header or the CID prefix — i.e. 
they describe +// the slice of the CAR you'd seek to and read from to recover the +// raw block bytes. +// +// This is the convention `blobindex.Position` expects. +type BlockPosition struct { + CID cid.Cid + Offset uint64 + Length uint64 +} + +// Write encodes a CAR v1 file with the given roots and blocks. Block +// ordering is preserved. +func Write(w io.Writer, roots []cid.Cid, blocks []block.Block) error { + _, err := WriteWithPositions(w, roots, blocks) + return err +} + +// WriteWithPositions is like Write, but additionally returns the byte +// position of each block's payload within the encoded CAR. Used by the +// Forge uploader to build a `blobindex.ShardedDagIndexView` mapping +// inner CIDs to their slices of the outer CAR blob. +func WriteWithPositions(w io.Writer, roots []cid.Cid, blocks []block.Block) ([]BlockPosition, error) { + if len(roots) == 0 { + return nil, fmt.Errorf("cars: at least one root required") + } + + cw := &countingWriter{w: w} + + headerBytes, err := encodeHeader(roots) + if err != nil { + return nil, fmt.Errorf("cars: encode header: %w", err) + } + if err := writeUvarint(cw, uint64(len(headerBytes))); err != nil { + return nil, fmt.Errorf("cars: write header len: %w", err) + } + if _, err := cw.Write(headerBytes); err != nil { + return nil, fmt.Errorf("cars: write header: %w", err) + } + + positions := make([]BlockPosition, 0, len(blocks)) + for i, blk := range blocks { + pos, err := writeBlock(cw, blk) + if err != nil { + return nil, fmt.Errorf("cars: write block %d (%s): %w", i, blk.Cid(), err) + } + positions = append(positions, pos) + } + return positions, nil +} + +func encodeHeader(roots []cid.Cid) ([]byte, error) { + var buf bytes.Buffer + cw := cbg.NewCborWriter(&buf) + + if err := cw.WriteMajorTypeHeader(cbg.MajMap, 2); err != nil { + return nil, err + } + + if err := writeMapKey(cw, "roots"); err != nil { + return nil, err + } + if err := cw.WriteMajorTypeHeader(cbg.MajArray, uint64(len(roots))); err != nil { + return 
nil, err + } + for _, c := range roots { + if err := cbg.WriteCid(cw, c); err != nil { + return nil, err + } + } + + if err := writeMapKey(cw, "version"); err != nil { + return nil, err + } + if err := cw.WriteMajorTypeHeader(cbg.MajUnsignedInt, 1); err != nil { + return nil, err + } + + return buf.Bytes(), nil +} + +func writeMapKey(cw *cbg.CborWriter, key string) error { + if err := cw.WriteMajorTypeHeader(cbg.MajTextString, uint64(len(key))); err != nil { + return err + } + _, err := cw.WriteString(key) + return err +} + +// writeBlock emits one frame and returns the position of the block's +// payload (post-CID-prefix bytes) within the surrounding CAR. +func writeBlock(cw *countingWriter, blk block.Block) (BlockPosition, error) { + cidBytes := blk.Cid().Bytes() + data := blk.RawData() + frameLen := uint64(len(cidBytes) + len(data)) + + if err := writeUvarint(cw, frameLen); err != nil { + return BlockPosition{}, err + } + if _, err := cw.Write(cidBytes); err != nil { + return BlockPosition{}, err + } + + dataOffset := cw.n + if _, err := cw.Write(data); err != nil { + return BlockPosition{}, err + } + return BlockPosition{ + CID: blk.Cid(), + Offset: uint64(dataOffset), + Length: uint64(len(data)), + }, nil +} + +func writeUvarint(w io.Writer, n uint64) error { + var buf [binary.MaxVarintLen64]byte + sz := binary.PutUvarint(buf[:], n) + _, err := w.Write(buf[:sz]) + return err +} + +// countingWriter forwards writes to an underlying io.Writer while +// tracking the total number of bytes written. Used to compute block +// payload offsets for the index. +type countingWriter struct { + w io.Writer + n int64 +} + +func (cw *countingWriter) Write(p []byte) (int, error) { + n, err := cw.w.Write(p) + cw.n += int64(n) + return n, err +} diff --git a/pkg/ms3t/gen/main.go b/pkg/ms3t/gen/main.go new file mode 100644 index 0000000..a8bf3fa --- /dev/null +++ b/pkg/ms3t/gen/main.go @@ -0,0 +1,16 @@ +// Generates CBOR marshal/unmarshal methods for ms3t types. 
Run from repo root: +// +// go run ./gen +package main + +import ( + "github.com/storacha/sprue/pkg/ms3t/bucket" + cbg "github.com/whyrusleeping/cbor-gen" +) + +func main() { + cfg := cbg.Gen{MaxStringLength: 1_000_000} + if err := cfg.WriteMapEncodersToFile("bucket/cbor_gen.go", "bucket", bucket.ObjectManifest{}, bucket.Body{}); err != nil { + panic(err) + } +} diff --git a/pkg/ms3t/mst/cbor_gen.go b/pkg/ms3t/mst/cbor_gen.go new file mode 100644 index 0000000..2c56f7b --- /dev/null +++ b/pkg/ms3t/mst/cbor_gen.go @@ -0,0 +1,433 @@ +// Code generated by github.com/whyrusleeping/cbor-gen. DO NOT EDIT. + +package mst + +import ( + "fmt" + "io" + "math" + "sort" + + cid "github.com/ipfs/go-cid" + cbg "github.com/whyrusleeping/cbor-gen" + xerrors "golang.org/x/xerrors" +) + +var _ = xerrors.Errorf +var _ = cid.Undef +var _ = math.E +var _ = sort.Sort + +func (t *NodeData) MarshalCBOR(w io.Writer) error { + if t == nil { + _, err := w.Write(cbg.CborNull) + return err + } + + cw := cbg.NewCborWriter(w) + + if _, err := cw.Write([]byte{162}); err != nil { + return err + } + + // t.Entries ([]mst.TreeEntry) (slice) + if len("e") > 1000000 { + return xerrors.Errorf("Value in field \"e\" was too long") + } + + if err := cw.WriteMajorTypeHeader(cbg.MajTextString, uint64(len("e"))); err != nil { + return err + } + if _, err := cw.WriteString(string("e")); err != nil { + return err + } + + if len(t.Entries) > 8192 { + return xerrors.Errorf("Slice value in field t.Entries was too long") + } + + if err := cw.WriteMajorTypeHeader(cbg.MajArray, uint64(len(t.Entries))); err != nil { + return err + } + for _, v := range t.Entries { + if err := v.MarshalCBOR(cw); err != nil { + return err + } + + } + + // t.Left (cid.Cid) (struct) + if len("l") > 1000000 { + return xerrors.Errorf("Value in field \"l\" was too long") + } + + if err := cw.WriteMajorTypeHeader(cbg.MajTextString, uint64(len("l"))); err != nil { + return err + } + if _, err := cw.WriteString(string("l")); err != nil { + 
return err + } + + if t.Left == nil { + if _, err := cw.Write(cbg.CborNull); err != nil { + return err + } + } else { + if err := cbg.WriteCid(cw, *t.Left); err != nil { + return xerrors.Errorf("failed to write cid field t.Left: %w", err) + } + } + + return nil +} + +func (t *NodeData) UnmarshalCBOR(r io.Reader) (err error) { + *t = NodeData{} + + cr := cbg.NewCborReader(r) + + maj, extra, err := cr.ReadHeader() + if err != nil { + return err + } + defer func() { + if err == io.EOF { + err = io.ErrUnexpectedEOF + } + }() + + if maj != cbg.MajMap { + return fmt.Errorf("cbor input should be of type map") + } + + if extra > cbg.MaxLength { + return fmt.Errorf("NodeData: map struct too large (%d)", extra) + } + + n := extra + + nameBuf := make([]byte, 1) + for i := uint64(0); i < n; i++ { + nameLen, ok, err := cbg.ReadFullStringIntoBuf(cr, nameBuf, 1000000) + if err != nil { + return err + } + + if !ok { + // Field doesn't exist on this type, so ignore it + if err := cbg.ScanForLinks(cr, func(cid.Cid) {}); err != nil { + return err + } + continue + } + + switch string(nameBuf[:nameLen]) { + // t.Entries ([]mst.TreeEntry) (slice) + case "e": + + maj, extra, err = cr.ReadHeader() + if err != nil { + return err + } + + if extra > 8192 { + return fmt.Errorf("t.Entries: array too large (%d)", extra) + } + + if maj != cbg.MajArray { + return fmt.Errorf("expected cbor array") + } + + if extra > 0 { + t.Entries = make([]TreeEntry, extra) + } + + for i := 0; i < int(extra); i++ { + { + var maj byte + var extra uint64 + var err error + _ = maj + _ = extra + _ = err + + { + + if err := t.Entries[i].UnmarshalCBOR(cr); err != nil { + return xerrors.Errorf("unmarshaling t.Entries[i]: %w", err) + } + + } + + } + } + // t.Left (cid.Cid) (struct) + case "l": + + { + + b, err := cr.ReadByte() + if err != nil { + return err + } + if b != cbg.CborNull[0] { + if err := cr.UnreadByte(); err != nil { + return err + } + + c, err := cbg.ReadCid(cr) + if err != nil { + return 
xerrors.Errorf("failed to read cid field t.Left: %w", err) + } + + t.Left = &c + } + + } + + default: + // Field doesn't exist on this type, so ignore it + if err := cbg.ScanForLinks(r, func(cid.Cid) {}); err != nil { + return err + } + } + } + + return nil +} +func (t *TreeEntry) MarshalCBOR(w io.Writer) error { + if t == nil { + _, err := w.Write(cbg.CborNull) + return err + } + + cw := cbg.NewCborWriter(w) + + if _, err := cw.Write([]byte{164}); err != nil { + return err + } + + // t.KeySuffix ([]uint8) (slice) + if len("k") > 1000000 { + return xerrors.Errorf("Value in field \"k\" was too long") + } + + if err := cw.WriteMajorTypeHeader(cbg.MajTextString, uint64(len("k"))); err != nil { + return err + } + if _, err := cw.WriteString(string("k")); err != nil { + return err + } + + if len(t.KeySuffix) > 2097152 { + return xerrors.Errorf("Byte array in field t.KeySuffix was too long") + } + + if err := cw.WriteMajorTypeHeader(cbg.MajByteString, uint64(len(t.KeySuffix))); err != nil { + return err + } + + if _, err := cw.Write(t.KeySuffix); err != nil { + return err + } + + // t.PrefixLen (int64) (int64) + if len("p") > 1000000 { + return xerrors.Errorf("Value in field \"p\" was too long") + } + + if err := cw.WriteMajorTypeHeader(cbg.MajTextString, uint64(len("p"))); err != nil { + return err + } + if _, err := cw.WriteString(string("p")); err != nil { + return err + } + + if t.PrefixLen >= 0 { + if err := cw.WriteMajorTypeHeader(cbg.MajUnsignedInt, uint64(t.PrefixLen)); err != nil { + return err + } + } else { + if err := cw.WriteMajorTypeHeader(cbg.MajNegativeInt, uint64(-t.PrefixLen-1)); err != nil { + return err + } + } + + // t.Tree (cid.Cid) (struct) + if len("t") > 1000000 { + return xerrors.Errorf("Value in field \"t\" was too long") + } + + if err := cw.WriteMajorTypeHeader(cbg.MajTextString, uint64(len("t"))); err != nil { + return err + } + if _, err := cw.WriteString(string("t")); err != nil { + return err + } + + if t.Tree == nil { + if _, err := 
cw.Write(cbg.CborNull); err != nil { + return err + } + } else { + if err := cbg.WriteCid(cw, *t.Tree); err != nil { + return xerrors.Errorf("failed to write cid field t.Tree: %w", err) + } + } + + // t.Val (cid.Cid) (struct) + if len("v") > 1000000 { + return xerrors.Errorf("Value in field \"v\" was too long") + } + + if err := cw.WriteMajorTypeHeader(cbg.MajTextString, uint64(len("v"))); err != nil { + return err + } + if _, err := cw.WriteString(string("v")); err != nil { + return err + } + + if err := cbg.WriteCid(cw, t.Val); err != nil { + return xerrors.Errorf("failed to write cid field t.Val: %w", err) + } + + return nil +} + +func (t *TreeEntry) UnmarshalCBOR(r io.Reader) (err error) { + *t = TreeEntry{} + + cr := cbg.NewCborReader(r) + + maj, extra, err := cr.ReadHeader() + if err != nil { + return err + } + defer func() { + if err == io.EOF { + err = io.ErrUnexpectedEOF + } + }() + + if maj != cbg.MajMap { + return fmt.Errorf("cbor input should be of type map") + } + + if extra > cbg.MaxLength { + return fmt.Errorf("TreeEntry: map struct too large (%d)", extra) + } + + n := extra + + nameBuf := make([]byte, 1) + for i := uint64(0); i < n; i++ { + nameLen, ok, err := cbg.ReadFullStringIntoBuf(cr, nameBuf, 1000000) + if err != nil { + return err + } + + if !ok { + // Field doesn't exist on this type, so ignore it + if err := cbg.ScanForLinks(cr, func(cid.Cid) {}); err != nil { + return err + } + continue + } + + switch string(nameBuf[:nameLen]) { + // t.KeySuffix ([]uint8) (slice) + case "k": + + maj, extra, err = cr.ReadHeader() + if err != nil { + return err + } + + if extra > 2097152 { + return fmt.Errorf("t.KeySuffix: byte array too large (%d)", extra) + } + if maj != cbg.MajByteString { + return fmt.Errorf("expected byte array") + } + + if extra > 0 { + t.KeySuffix = make([]uint8, extra) + } + + if _, err := io.ReadFull(cr, t.KeySuffix); err != nil { + return err + } + + // t.PrefixLen (int64) (int64) + case "p": + { + maj, extra, err := 
// DiffOp describes a single change between two MST roots.
type DiffOp struct {
	// Depth is not populated by DiffTrees or identityDiff in this file; it
	// is left at its zero value.
	Depth int
	// Op is the kind of change: "add" (key only in the new tree), "del"
	// (key only in the old tree), or "mut" (key in both, value differs).
	Op string // "add", "del", "mut"
	// Rpath is the key of the leaf the change applies to.
	Rpath string
	// OldCid is the leaf's value in the old tree; set for "del" and "mut".
	OldCid cid.Cid
	// NewCid is the leaf's value in the new tree; set for "add" and "mut".
	NewCid cid.Cid
}
+func DiffTrees(ctx context.Context, bs cbor.IpldBlockstore, from, to cid.Cid) ([]*DiffOp, error) { + cst := CborStore(bs) + + if from == cid.Undef { + return identityDiff(ctx, bs, to) + } + + ft := LoadMST(cst, from) + tt := LoadMST(cst, to) + + fents, err := ft.getEntries(ctx) + if err != nil { + return nil, err + } + + tents, err := tt.getEntries(ctx) + if err != nil { + return nil, err + } + + var ixf, ixt int + var out []*DiffOp + for ixf < len(fents) && ixt < len(tents) { + ef := fents[ixf] + et := tents[ixt] + + if nodeEntriesEqual(&ef, &et) { + ixf++ + ixt++ + continue + } + + if ef.isLeaf() && et.isLeaf() { + if ef.Key == et.Key { + if ef.Val == et.Val { + return nil, fmt.Errorf("hang on, why are these leaves equal?") + } + + out = append(out, &DiffOp{ + Op: "mut", + Rpath: ef.Key, + OldCid: ef.Val, + NewCid: et.Val, + }) + ixf++ + ixt++ + continue + } + + if ef.Key > et.Key { + out = append(out, &DiffOp{ + Op: "add", + Rpath: et.Key, + NewCid: et.Val, + }) + ixt++ + } else { + out = append(out, &DiffOp{ + Op: "del", + Rpath: ef.Key, + OldCid: ef.Val, + }) + ixf++ + } + + continue + } + + if ef.isTree() { + sub, err := ef.Tree.getEntries(ctx) + if err != nil { + return nil, err + } + + fents = append(sub, fents[ixf+1:]...) + ixf = 0 + continue + } + + if et.isTree() { + sub, err := et.Tree.getEntries(ctx) + if err != nil { + return nil, err + } + + tents = append(sub, tents[ixt+1:]...) 
+ ixt = 0 + continue + } + } + + for ; ixf < len(fents); ixf++ { + e := fents[ixf] + if e.isLeaf() { + out = append(out, &DiffOp{ + Op: "del", + Rpath: e.Key, + OldCid: e.Val, + }) + } else if e.isTree() { + if err := e.Tree.WalkLeavesFrom(ctx, "", func(key string, val cid.Cid) error { + out = append(out, &DiffOp{ + Op: "del", + Rpath: key, + OldCid: val, + }) + return nil + }); err != nil { + return nil, err + } + } + } + + for ; ixt < len(tents); ixt++ { + e := tents[ixt] + if e.isLeaf() { + out = append(out, &DiffOp{ + Op: "add", + Rpath: e.Key, + NewCid: e.Val, + }) + } else if e.isTree() { + if err := e.Tree.WalkLeavesFrom(ctx, "", func(key string, val cid.Cid) error { + out = append(out, &DiffOp{ + Op: "add", + Rpath: key, + NewCid: val, + }) + return nil + }); err != nil { + return nil, err + } + } + } + + return out, nil +} + +func nodeEntriesEqual(a, b *nodeEntry) bool { + if !(a.Key == b.Key && a.Val == b.Val) { + return false + } + + if a.Tree == nil && b.Tree == nil { + return true + } + + if a.Tree != nil && b.Tree != nil && a.Tree.pointer == b.Tree.pointer { + return true + } + + return false +} + +func identityDiff(ctx context.Context, bs cbor.IpldBlockstore, root cid.Cid) ([]*DiffOp, error) { + cst := CborStore(bs) + tt := LoadMST(cst, root) + + var ops []*DiffOp + if err := tt.WalkLeavesFrom(ctx, "", func(key string, val cid.Cid) error { + ops = append(ops, &DiffOp{ + Op: "add", + Rpath: key, + NewCid: val, + }) + return nil + }); err != nil { + return nil, err + } + return ops, nil +} diff --git a/pkg/ms3t/mst/mst.go b/pkg/ms3t/mst/mst.go new file mode 100644 index 0000000..c5f7c1a --- /dev/null +++ b/pkg/ms3t/mst/mst.go @@ -0,0 +1,866 @@ +// Package mst is a fork of github.com/bluesky-social/indigo/mst with the +// atproto-specific key validation relaxed for use as a generic ordered +// content-addressed key/value map. Keys may be any non-empty UTF-8 string up +// to 1024 bytes, with the only forbidden bytes being NUL. 
+// +// On-disk format is unchanged from the atproto MST: NodeData / TreeEntry CBOR +// blocks with prefix-compressed byte-string keys. Cross-implementation +// compatibility with atproto MSTs is intentionally not preserved. +// +// See https://hal.inria.fr/hal-02303490/document for the underlying data +// structure. SHA-256 is used for key hashing with a 4-bit fanout (~16 entries +// per layer). +package mst + +import ( + "context" + "fmt" + "reflect" + + "github.com/ipfs/go-cid" + cbor "github.com/ipfs/go-ipld-cbor" +) + +// nodeKind is the type of node in the MST. +type nodeKind uint8 + +const ( + entryUndefined nodeKind = 0 + entryLeaf nodeKind = 1 + entryTree nodeKind = 2 +) + +// nodeEntry is either a leaf (key/value) or a pointer to a subtree. +type nodeEntry struct { + Kind nodeKind + Key string + Val cid.Cid + Tree *MerkleSearchTree +} + +func mkTreeEntry(t *MerkleSearchTree) nodeEntry { + return nodeEntry{ + Kind: entryTree, + Tree: t, + } +} + +func (ne nodeEntry) isTree() bool { return ne.Kind == entryTree } +func (ne nodeEntry) isLeaf() bool { return ne.Kind == entryLeaf } +func (ne nodeEntry) isUndefined() bool { return ne.Kind == entryUndefined } + +// Sanity check: two trees can never be neighbors in an entries slice. +func checkTreeInvariant(ents []nodeEntry) { + for i := 0; i < len(ents)-1; i++ { + if ents[i].isTree() && ents[i+1].isTree() { + panic(fmt.Sprintf("two trees next to each other! %d %d", i, i+1)) + } + } +} + +// CBORTypes returns the types in this package that need to be registered with +// the CBOR codec. +func CBORTypes() []reflect.Type { + return []reflect.Type{ + reflect.TypeOf(NodeData{}), + reflect.TypeOf(TreeEntry{}), + } +} + +// NodeData is the CBOR-serialized form of an MST node. 
+type NodeData struct { + Left *cid.Cid `cborgen:"l"` // [nullable] pointer to lower-level subtree to the "left" of this path/key + Entries []TreeEntry `cborgen:"e"` // ordered list of entries at this node +} + +// TreeEntry is one entry within a NodeData. +type TreeEntry struct { + PrefixLen int64 `cborgen:"p"` // count of bytes shared with previous key in tree + KeySuffix []byte `cborgen:"k"` // remaining part of key (appended to "previous key") + Val cid.Cid `cborgen:"v"` // CID pointer at this path/key + Tree *cid.Cid `cborgen:"t"` // [nullable] pointer to lower-level subtree to the "right" of this entry +} + +// MerkleSearchTree is an MST tree node. Values are immutable: methods return +// copies with changes applied. Hydration is lazy; a tree loaded by CID has no +// entries until getEntries is called. +type MerkleSearchTree struct { + cst cbor.IpldStore + entries []nodeEntry // non-nil when "hydrated" + layer int + pointer cid.Cid + validPtr bool +} + +// NewEmptyMST returns a new empty MST using cst as its storage. +func NewEmptyMST(cst cbor.IpldStore) *MerkleSearchTree { + return createMST(cst, cid.Undef, []nodeEntry{}, 0) +} + +func createMST(cst cbor.IpldStore, ptr cid.Cid, entries []nodeEntry, layer int) *MerkleSearchTree { + mst := &MerkleSearchTree{ + cst: cst, + pointer: ptr, + layer: layer, + entries: entries, + validPtr: ptr.Defined(), + } + return mst +} + +// LoadMST returns a lazy reference to an MST rooted at the given CID. Entries +// are not loaded until needed. 
+func LoadMST(cst cbor.IpldStore, root cid.Cid) *MerkleSearchTree { + return createMST(cst, root, nil, -1) +} + +// === Immutability === + +func (mst *MerkleSearchTree) newTree(entries []nodeEntry) *MerkleSearchTree { + if entries == nil { + panic("nil entries passed to newTree") + } + return createMST(mst.cst, cid.Undef, entries, mst.layer) +} + +// === Lazy getters === + +func (mst *MerkleSearchTree) getEntries(ctx context.Context) ([]nodeEntry, error) { + if mst.entries != nil { + return mst.entries, nil + } + + if mst.pointer != cid.Undef { + var nd NodeData + if err := mst.cst.Get(ctx, mst.pointer, &nd); err != nil { + return nil, err + } + entries, err := entriesFromNodeData(ctx, &nd, mst.cst) + if err != nil { + return nil, err + } + if entries == nil { + panic("got nil entries from node data decoding") + } + mst.entries = entries + return entries, nil + } + + return nil, fmt.Errorf("no entries or self-pointer (CID) on MerkleSearchTree") +} + +func entriesFromNodeData(ctx context.Context, nd *NodeData, cst cbor.IpldStore) ([]nodeEntry, error) { + layer := -1 + if len(nd.Entries) > 0 { + // the first entry's KeySuffix is a complete key (PrefixLen=0) + firstLeaf := nd.Entries[0] + layer = leadingZerosOnHashBytes(firstLeaf.KeySuffix) + } + + entries, err := deserializeNodeData(ctx, cst, nd, layer) + if err != nil { + return nil, err + } + + return entries, nil +} + +// GetPointer returns the CID of this MST root, recomputing it if any subtree +// has been mutated since the last call. 
+func (mst *MerkleSearchTree) GetPointer(ctx context.Context) (cid.Cid, error) { + if mst.validPtr { + return mst.pointer, nil + } + + if _, err := mst.getEntries(ctx); err != nil { + return cid.Undef, err + } + + for i, e := range mst.entries { + if e.isTree() { + if !e.Tree.validPtr { + if _, err := e.Tree.GetPointer(ctx); err != nil { + return cid.Undef, err + } + mst.entries[i] = e + } + } + } + + nptr, err := cidForEntries(ctx, mst.entries, mst.cst) + if err != nil { + return cid.Undef, err + } + mst.pointer = nptr + mst.validPtr = true + + return mst.pointer, nil +} + +func (mst *MerkleSearchTree) getLayer(ctx context.Context) (int, error) { + layer, err := mst.attemptGetLayer(ctx) + if err != nil { + return -1, err + } + if layer < 0 { + mst.layer = 0 + } else { + mst.layer = layer + } + return mst.layer, nil +} + +func (mst *MerkleSearchTree) attemptGetLayer(ctx context.Context) (int, error) { + if mst.layer >= 0 { + return mst.layer, nil + } + + entries, err := mst.getEntries(ctx) + if err != nil { + return -1, err + } + + layer := layerForEntries(entries) + if layer < 0 { + for _, e := range entries { + if e.isTree() { + childLayer, err := e.Tree.attemptGetLayer(ctx) + if err != nil { + return -1, err + } + if childLayer >= 0 { + layer = childLayer + 1 + break + } + } + } + } + + if layer >= 0 { + mst.layer = layer + } + return mst.layer, nil +} + +// === Core operations === + +// Add inserts a new key/value pair. Returns ErrAlreadyExists if the key is +// already present. 
// Add inserts a new key/value pair and returns the resulting tree; the
// receiver is not replaced. It returns ErrAlreadyExists if the key is already
// present, and an error if the key is invalid or val is undefined.
//
// knownZeros is the key's layer (leading-zero count of its hash) when the
// caller already has it; pass a negative value to have it computed here.
func (mst *MerkleSearchTree) Add(ctx context.Context, key string, val cid.Cid, knownZeros int) (*MerkleSearchTree, error) {
	if err := ensureValidKey(key); err != nil {
		return nil, err
	}

	if val == cid.Undef {
		return nil, fmt.Errorf("tried to insert an undef CID")
	}

	// The key's layer decides where in the tree it lives.
	keyZeros := knownZeros
	if keyZeros < 0 {
		keyZeros = leadingZerosOnHash(key)
	}

	layer, err := mst.getLayer(ctx)
	if err != nil {
		return nil, fmt.Errorf("getting layer failed: %w", err)
	}

	newLeaf := nodeEntry{
		Kind: entryLeaf,
		Key:  key,
		Val:  val,
	}

	if keyZeros == layer {
		// The key belongs in this node.
		index, err := mst.findGtOrEqualLeafIndex(ctx, key)
		if err != nil {
			return nil, err
		}

		found, err := mst.atIndex(index)
		if err != nil {
			return nil, err
		}

		if found.isLeaf() && found.Key == key {
			return nil, ErrAlreadyExists
		}

		prevNode, err := mst.atIndex(index - 1)
		if err != nil {
			return nil, err
		}

		// No subtree to the left of the insertion point: plain insert.
		if prevNode.isUndefined() || prevNode.isLeaf() {
			return mst.spliceIn(ctx, newLeaf, index)
		}

		// The left neighbor is a subtree: split it around the key and put
		// the new leaf between the two halves.
		left, right, err := prevNode.Tree.splitAround(ctx, key)
		if err != nil {
			return nil, err
		}
		return mst.replaceWithSplit(ctx, index-1, left, newLeaf, right)

	} else if keyZeros < layer {
		// The key belongs somewhere below this node.
		index, err := mst.findGtOrEqualLeafIndex(ctx, key)
		if err != nil {
			return nil, err
		}

		prevNode, err := mst.atIndex(index - 1)
		if err != nil {
			return nil, err
		}

		// Descend into the existing subtree left of the insertion point.
		if !prevNode.isUndefined() && prevNode.isTree() {
			newSubtree, err := prevNode.Tree.Add(ctx, key, val, keyZeros)
			if err != nil {
				return nil, err
			}
			return mst.updateEntry(ctx, index-1, mkTreeEntry(newSubtree))
		}

		// No subtree there yet: create an empty child one layer down, add
		// the key to it, and link it in at the insertion point.
		subTree, err := mst.createChild(ctx)
		if err != nil {
			return nil, err
		}

		newSubTree, err := subTree.Add(ctx, key, val, keyZeros)
		if err != nil {
			return nil, fmt.Errorf("subtree add: %w", err)
		}

		return mst.spliceIn(ctx, mkTreeEntry(newSubTree), index)
	}

	// keyZeros > layer: must push the rest of the tree down
	left, right, err := mst.splitAround(ctx, key)
	if err != nil {
		return nil, err
	}

	layer, err = mst.getLayer(ctx)
	if err != nil {
		return nil, fmt.Errorf("get layer in split case failed: %w", err)
	}

	extraLayersToAdd := keyZeros - layer

	// Wrap each half in extraLayersToAdd-1 single-entry parents so it ends
	// up one layer below the new root.
	for i := 1; i < extraLayersToAdd; i++ {
		if left != nil {
			par, err := left.createParent(ctx)
			if err != nil {
				return nil, fmt.Errorf("create left parent: %w", err)
			}
			left = par
		}

		if right != nil {
			par, err := right.createParent(ctx)
			if err != nil {
				return nil, fmt.Errorf("create right parent: %w", err)
			}
			right = par
		}
	}

	// The new root is [left subtree?, new leaf, right subtree?].
	var updated []nodeEntry
	if left != nil {
		updated = append(updated, mkTreeEntry(left))
	}

	updated = append(updated, nodeEntry{
		Kind: entryLeaf,
		Key:  key,
		Val:  val,
	})

	if right != nil {
		updated = append(updated, mkTreeEntry(right))
	}

	checkTreeInvariant(updated)
	newRoot := createMST(mst.cst, cid.Undef, updated, keyZeros)
	// createMST already leaves validPtr false for an undefined CID.
	newRoot.validPtr = false

	return newRoot, nil
}
+func (mst *MerkleSearchTree) Update(ctx context.Context, k string, val cid.Cid) (*MerkleSearchTree, error) { + if err := ensureValidKey(k); err != nil { + return nil, err + } + + if val == cid.Undef { + return nil, fmt.Errorf("tried to insert an undef CID") + } + + index, err := mst.findGtOrEqualLeafIndex(ctx, k) + if err != nil { + return nil, err + } + + found, err := mst.atIndex(index) + if err != nil { + return nil, err + } + + if !found.isUndefined() && found.isLeaf() && found.Key == k { + return mst.updateEntry(ctx, index, nodeEntry{ + Kind: entryLeaf, + Key: k, + Val: val, + }) + } + + prev, err := mst.atIndex(index - 1) + if err != nil { + return nil, err + } + + if !prev.isUndefined() && prev.isTree() { + updatedTree, err := prev.Tree.Update(ctx, k, val) + if err != nil { + return nil, err + } + return mst.updateEntry(ctx, index-1, mkTreeEntry(updatedTree)) + } + + return nil, ErrNotFound +} + +// Delete removes the leaf at the given key. +func (mst *MerkleSearchTree) Delete(ctx context.Context, k string) (*MerkleSearchTree, error) { + altered, err := mst.deleteRecurse(ctx, k) + if err != nil { + return nil, err + } + return altered.trimTop(ctx) +} + +func (mst *MerkleSearchTree) deleteRecurse(ctx context.Context, k string) (*MerkleSearchTree, error) { + ix, err := mst.findGtOrEqualLeafIndex(ctx, k) + if err != nil { + return nil, err + } + + found, err := mst.atIndex(ix) + if err != nil { + return nil, err + } + + if found.isLeaf() && found.Key == k { + prev, err := mst.atIndex(ix - 1) + if err != nil { + return nil, err + } + + next, err := mst.atIndex(ix + 1) + if err != nil { + return nil, err + } + + if prev.isTree() && next.isTree() { + merged, err := prev.Tree.appendMerge(ctx, next.Tree) + if err != nil { + return nil, err + } + entries, err := mst.getEntries(ctx) + if err != nil { + return nil, err + } + return mst.newTree(append(append(entries[:ix-1], mkTreeEntry(merged)), entries[ix+2:]...)), nil + } + return mst.removeEntry(ctx, ix) + } + + 
prev, err := mst.atIndex(ix - 1) + if err != nil { + return nil, err + } + + if prev.isTree() { + subtree, err := prev.Tree.deleteRecurse(ctx, k) + if err != nil { + return nil, err + } + + subtreeEntries, err := subtree.getEntries(ctx) + if err != nil { + return nil, err + } + + if len(subtreeEntries) == 0 { + return mst.removeEntry(ctx, ix-1) + } + return mst.updateEntry(ctx, ix-1, mkTreeEntry(subtree)) + } + + return nil, ErrNotFound +} + +// === Simple operations === + +func (mst *MerkleSearchTree) updateEntry(ctx context.Context, ix int, entry nodeEntry) (*MerkleSearchTree, error) { + entries, err := mst.getEntries(ctx) + if err != nil { + return nil, err + } + + nents := make([]nodeEntry, len(entries)) + copy(nents, entries[:ix]) + nents[ix] = entry + copy(nents[ix+1:], entries[ix+1:]) + + checkTreeInvariant(nents) + return mst.newTree(nents), nil +} + +func (mst *MerkleSearchTree) removeEntry(ctx context.Context, ix int) (*MerkleSearchTree, error) { + entries, err := mst.getEntries(ctx) + if err != nil { + return nil, err + } + + nents := make([]nodeEntry, len(entries)-1) + copy(nents, entries[:ix]) + copy(nents[ix:], entries[ix+1:]) + + checkTreeInvariant(nents) + return mst.newTree(nents), nil +} + +func (mst *MerkleSearchTree) append(ctx context.Context, ent nodeEntry) (*MerkleSearchTree, error) { + entries, err := mst.getEntries(ctx) + if err != nil { + return nil, err + } + + nents := make([]nodeEntry, len(entries)+1) + copy(nents, entries) + nents[len(nents)-1] = ent + + checkTreeInvariant(nents) + return mst.newTree(nents), nil +} + +func (mst *MerkleSearchTree) prepend(ctx context.Context, ent nodeEntry) (*MerkleSearchTree, error) { + entries, err := mst.getEntries(ctx) + if err != nil { + return nil, err + } + + nents := make([]nodeEntry, len(entries)+1) + copy(nents[1:], entries) + nents[0] = ent + + checkTreeInvariant(nents) + return mst.newTree(nents), nil +} + +func (mst *MerkleSearchTree) atIndex(ix int) (nodeEntry, error) { + entries, err := 
mst.getEntries(context.TODO()) + if err != nil { + return nodeEntry{}, err + } + + if ix < 0 || ix >= len(entries) { + return nodeEntry{}, nil + } + + return entries[ix], nil +} + +func (mst *MerkleSearchTree) spliceIn(ctx context.Context, entry nodeEntry, ix int) (*MerkleSearchTree, error) { + entries, err := mst.getEntries(ctx) + if err != nil { + return nil, err + } + + nents := make([]nodeEntry, len(entries)+1) + copy(nents, entries[:ix]) + nents[ix] = entry + copy(nents[ix+1:], entries[ix:]) + + checkTreeInvariant(nents) + return mst.newTree(nents), nil +} + +func (mst *MerkleSearchTree) replaceWithSplit(ctx context.Context, ix int, left *MerkleSearchTree, nl nodeEntry, right *MerkleSearchTree) (*MerkleSearchTree, error) { + entries, err := mst.getEntries(ctx) + if err != nil { + return nil, err + } + checkTreeInvariant(entries) + var update []nodeEntry + update = append(update, entries[:ix]...) + + if left != nil { + update = append(update, nodeEntry{ + Kind: entryTree, + Tree: left, + }) + } + + update = append(update, nl) + + if right != nil { + update = append(update, nodeEntry{ + Kind: entryTree, + Tree: right, + }) + } + + update = append(update, entries[ix+1:]...) 
+ + checkTreeInvariant(update) + return mst.newTree(update), nil +} + +func (mst *MerkleSearchTree) trimTop(ctx context.Context) (*MerkleSearchTree, error) { + entries, err := mst.getEntries(ctx) + if err != nil { + return nil, err + } + if len(entries) == 1 && entries[0].isTree() { + return entries[0].Tree.trimTop(ctx) + } + return mst, nil +} + +// === Subtree splits === + +func (mst *MerkleSearchTree) splitAround(ctx context.Context, key string) (*MerkleSearchTree, *MerkleSearchTree, error) { + index, err := mst.findGtOrEqualLeafIndex(ctx, key) + if err != nil { + return nil, nil, err + } + + entries, err := mst.getEntries(ctx) + if err != nil { + return nil, nil, err + } + + leftData := entries[:index] + rightData := entries[index:] + left := mst.newTree(leftData) + right := mst.newTree(rightData) + + if len(leftData) > 0 && leftData[len(leftData)-1].isTree() { + lastInLeft := leftData[len(leftData)-1] + + nleft, err := left.removeEntry(ctx, len(leftData)-1) + if err != nil { + return nil, nil, err + } + left = nleft + + subl, subr, err := lastInLeft.Tree.splitAround(ctx, key) + if err != nil { + return nil, nil, err + } + + if subl != nil { + left, err = left.append(ctx, mkTreeEntry(subl)) + if err != nil { + return nil, nil, err + } + } + + if subr != nil { + right, err = right.prepend(ctx, mkTreeEntry(subr)) + if err != nil { + return nil, nil, err + } + } + } + + if left.entryCount() == 0 { + left = nil + } + if right.entryCount() == 0 { + right = nil + } + + return left, right, nil +} + +func (mst *MerkleSearchTree) entryCount() int { + entries, err := mst.getEntries(context.TODO()) + if err != nil { + panic(err) + } + return len(entries) +} + +func (mst *MerkleSearchTree) appendMerge(ctx context.Context, omst *MerkleSearchTree) (*MerkleSearchTree, error) { + mylayer, err := mst.getLayer(ctx) + if err != nil { + return nil, err + } + + olayer, err := omst.getLayer(ctx) + if err != nil { + return nil, err + } + + if mylayer != olayer { + return nil, 
fmt.Errorf("trying to merge two nodes from different layers") + } + + entries, err := mst.getEntries(ctx) + if err != nil { + return nil, err + } + + tomergeEnts, err := omst.getEntries(ctx) + if err != nil { + return nil, err + } + + lastInLeft := entries[len(entries)-1] + firstInRight := tomergeEnts[0] + + if lastInLeft.isTree() && firstInRight.isTree() { + merged, err := lastInLeft.Tree.appendMerge(ctx, firstInRight.Tree) + if err != nil { + return nil, err + } + return mst.newTree(append(append(entries[:len(entries)-1], mkTreeEntry(merged)), tomergeEnts[1:]...)), nil + } + return mst.newTree(append(entries, tomergeEnts...)), nil +} + +// === Create relatives === + +func (mst *MerkleSearchTree) createChild(ctx context.Context) (*MerkleSearchTree, error) { + layer, err := mst.getLayer(ctx) + if err != nil { + return nil, err + } + return createMST(mst.cst, cid.Undef, []nodeEntry{}, layer-1), nil +} + +func (mst *MerkleSearchTree) createParent(ctx context.Context) (*MerkleSearchTree, error) { + layer, err := mst.getLayer(ctx) + if err != nil { + return nil, err + } + return createMST(mst.cst, cid.Undef, []nodeEntry{mkTreeEntry(mst)}, layer+1), nil +} + +// === Finding insertion points === + +func (mst *MerkleSearchTree) findGtOrEqualLeafIndex(ctx context.Context, key string) (int, error) { + entries, err := mst.getEntries(ctx) + if err != nil { + return -1, err + } + + for i, e := range entries { + if e.isLeaf() && e.Key >= key { + return i, nil + } + } + + return len(entries), nil +} + +// === List operations === + +// ErrStopWalk halts a WalkLeavesFrom traversal without surfacing as an error +// to the caller. The walk function returns nil after stopping. +var ErrStopWalk = fmt.Errorf("mst: stop walk") + +// WalkLeavesFrom walks leaves in sorted order starting at the first key >= +// from. The callback may return ErrStopWalk to halt early; any other error +// aborts and is returned to the caller. 
+func (mst *MerkleSearchTree) WalkLeavesFrom(ctx context.Context, from string, cb func(key string, val cid.Cid) error) error { + err := mst.walkLeavesFrom(ctx, from, false, cb) + if err == ErrStopWalk { + return nil + } + return err +} + +// WalkLeavesFromNocache is like WalkLeavesFrom but does not retain hydrated +// subtree state, intended for one-pass streaming traversals. +func (mst *MerkleSearchTree) WalkLeavesFromNocache(ctx context.Context, from string, cb func(key string, val cid.Cid) error) error { + err := mst.walkLeavesFrom(ctx, from, true, cb) + if err == ErrStopWalk { + return nil + } + return err +} + +func (mst *MerkleSearchTree) walkLeavesFrom(ctx context.Context, from string, nocache bool, cb func(key string, val cid.Cid) error) error { + index, err := mst.findGtOrEqualLeafIndex(ctx, from) + if err != nil { + return err + } + + entries, err := mst.getEntries(ctx) + if err != nil { + return fmt.Errorf("get entries: %w", err) + } + + if index > 0 { + prev := entries[index-1] + if !prev.isUndefined() && prev.isTree() { + if err := prev.Tree.walkLeavesFrom(ctx, from, nocache, cb); err != nil { + return err + } + } + } + + for _, e := range entries[index:] { + if e.isLeaf() { + if err := cb(e.Key, e.Val); err != nil { + return err + } + } else { + if err := e.Tree.walkLeavesFrom(ctx, from, nocache, cb); err != nil { + return err + } + if nocache { + e.Tree = nil + } + } + } + return nil +} diff --git a/pkg/ms3t/mst/mst_util.go b/pkg/ms3t/mst/mst_util.go new file mode 100644 index 0000000..730b5cc --- /dev/null +++ b/pkg/ms3t/mst/mst_util.go @@ -0,0 +1,212 @@ +package mst + +import ( + "context" + "crypto/sha256" + "fmt" + "strings" + "unicode/utf8" + "unsafe" + + "github.com/ipfs/go-cid" + cbor "github.com/ipfs/go-ipld-cbor" + mh "github.com/multiformats/go-multihash" +) + +// MaxKeyBytes is the maximum length, in bytes, of a key stored in the MST. +// Matches S3's object key length cap. 
+const MaxKeyBytes = 1024 + +// 4-bit fanout: count zero bits in 2-bit chunks. A leading 0x00 byte = 4 zeros. +func leadingZerosOnHash(key string) int { + var b []byte + if len(key) > 0 { + b = unsafe.Slice(unsafe.StringData(key), len(key)) + } + return leadingZerosOnHashBytes(b) +} + +func leadingZerosOnHashBytes(key []byte) (total int) { + hv := sha256.Sum256(key) + for _, b := range hv { + if b&0xC0 != 0 { + break + } + if b == 0x00 { + total += 4 + continue + } + if b&0xFC == 0x00 { + total += 3 + } else if b&0xF0 == 0x00 { + total += 2 + } else { + total += 1 + } + break + } + return total +} + +func layerForEntries(entries []nodeEntry) int { + var firstLeaf nodeEntry + for _, e := range entries { + if e.isLeaf() { + firstLeaf = e + break + } + } + + if firstLeaf.Kind == entryUndefined { + return -1 + } + + return leadingZerosOnHash(firstLeaf.Key) +} + +func deserializeNodeData(ctx context.Context, cst cbor.IpldStore, nd *NodeData, layer int) ([]nodeEntry, error) { + entries := []nodeEntry{} + if nd.Left != nil { + entries = append(entries, nodeEntry{ + Kind: entryTree, + Tree: createMST(cst, *nd.Left, nil, layer-1), + }) + } + + var lastKey string + var keyb []byte // re-used between entries + for _, e := range nd.Entries { + if keyb == nil { + keyb = make([]byte, 0, int(e.PrefixLen)+len(e.KeySuffix)) + } + keyb = append(keyb[:0], lastKey[:e.PrefixLen]...) + keyb = append(keyb, e.KeySuffix...) 
+ + keyStr := string(keyb) + if err := ensureValidKey(keyStr); err != nil { + return nil, err + } + + entries = append(entries, nodeEntry{ + Kind: entryLeaf, + Key: keyStr, + Val: e.Val, + }) + + if e.Tree != nil { + entries = append(entries, nodeEntry{ + Kind: entryTree, + Tree: createMST(cst, *e.Tree, nil, layer-1), + Key: keyStr, + }) + } + lastKey = keyStr + } + + return entries, nil +} + +func serializeNodeData(entries []nodeEntry) (*NodeData, error) { + var data NodeData + + i := 0 + if len(entries) > 0 && entries[0].isTree() { + i++ + + ptr, err := entries[0].Tree.GetPointer(context.TODO()) + if err != nil { + return nil, err + } + data.Left = &ptr + } + + var lastKey string + for i < len(entries) { + leaf := entries[i] + + if !leaf.isLeaf() { + return nil, fmt.Errorf("not a valid node: two subtrees next to each other (%d, %d)", i, len(entries)) + } + i++ + + var subtree *cid.Cid + + if i < len(entries) { + next := entries[i] + + if next.isTree() { + ptr, err := next.Tree.GetPointer(context.TODO()) + if err != nil { + return nil, fmt.Errorf("getting subtree pointer: %w", err) + } + + subtree = &ptr + i++ + } + } + + if err := ensureValidKey(leaf.Key); err != nil { + return nil, err + } + + prefixLen := countPrefixLen(lastKey, leaf.Key) + data.Entries = append(data.Entries, TreeEntry{ + PrefixLen: int64(prefixLen), + KeySuffix: []byte(leaf.Key)[prefixLen:], + Val: leaf.Val, + Tree: subtree, + }) + + lastKey = leaf.Key + } + + return &data, nil +} + +func countPrefixLen(a, b string) int { + var i int + for i = 0; i < len(a) && i < len(b); i++ { + if a[i] != b[i] { + return i + } + } + return i +} + +func cidForEntries(ctx context.Context, entries []nodeEntry, cst cbor.IpldStore) (cid.Cid, error) { + nd, err := serializeNodeData(entries) + if err != nil { + return cid.Undef, fmt.Errorf("serializing new entries: %w", err) + } + return cst.Put(ctx, nd) +} + +// IsValidKey reports whether s is a valid MST key under this fork's relaxed +// rules: non-empty, valid 
UTF-8, no NUL bytes, at most MaxKeyBytes bytes long. +func IsValidKey(s string) bool { + if len(s) == 0 || len(s) > MaxKeyBytes { + return false + } + if !utf8.ValidString(s) { + return false + } + if strings.ContainsRune(s, 0) { + return false + } + return true +} + +func ensureValidKey(s string) error { + if !IsValidKey(s) { + return fmt.Errorf("invalid mst key (len=%d)", len(s)) + } + return nil +} + +// CborStore wraps a blockstore in a CBOR-aware IpldStore using SHA2-256 +// multihashing. Equivalent to indigo's util.CborStore. +func CborStore(bs cbor.IpldBlockstore) *cbor.BasicIpldStore { + cst := cbor.NewCborStore(bs) + cst.DefaultMultihash = mh.SHA2_256 + return cst +} diff --git a/pkg/ms3t/registry/registry.go b/pkg/ms3t/registry/registry.go new file mode 100644 index 0000000..b3fe485 --- /dev/null +++ b/pkg/ms3t/registry/registry.go @@ -0,0 +1,51 @@ +// Package registry tracks the set of buckets and the current MST root CID +// for each. The interface is small enough that swapping SQLite for postgres +// or DynamoDB later is just a new implementation. +package registry + +import ( + "context" + "errors" + + "github.com/ipfs/go-cid" +) + +// State is the metadata stored per bucket. +type State struct { + Name string + Root cid.Cid // current MST root; cid.Undef for empty bucket + ForgeRoot cid.Cid // last MST root whose DAG has been shipped to Forge + CreatedAt int64 // unix seconds +} + +// Registry tracks bucket state. All methods are safe for concurrent use. +type Registry interface { + // Create inserts a new bucket. Returns ErrExists if name is taken. + Create(ctx context.Context, name string, createdAt int64) error + + // Get returns the state of a bucket, or ErrNotFound. + Get(ctx context.Context, name string) (*State, error) + + // List returns every bucket in lexicographic name order. + List(ctx context.Context) ([]*State, error) + + // Delete removes a bucket. Returns ErrNotFound if absent. 
+ Delete(ctx context.Context, name string) error + + // CASRoot atomically advances the bucket root from expect to next. + // Returns ErrConflict if the current root does not equal expect. + CASRoot(ctx context.Context, name string, expect, next cid.Cid) error + + // SetForgeRoot records that the DAG reachable from root has been + // successfully shipped to Forge. Used as the high-water mark by + // the recovery loop: anything reachable from Root but not from + // ForgeRoot needs to be re-submitted on startup. + SetForgeRoot(ctx context.Context, name string, root cid.Cid) error +} + +// Common errors. +var ( + ErrNotFound = errors.New("registry: bucket not found") + ErrExists = errors.New("registry: bucket already exists") + ErrConflict = errors.New("registry: root cas conflict") +) diff --git a/pkg/ms3t/registry/sqlite.go b/pkg/ms3t/registry/sqlite.go new file mode 100644 index 0000000..c85cf0f --- /dev/null +++ b/pkg/ms3t/registry/sqlite.go @@ -0,0 +1,212 @@ +package registry + +import ( + "context" + "database/sql" + "errors" + "fmt" + "strings" + + "github.com/ipfs/go-cid" +) + +// Schema is the DDL for the buckets table. Vanilla SQL. +const Schema = ` +CREATE TABLE IF NOT EXISTS buckets ( + name TEXT PRIMARY KEY, + root_cid BLOB, + forge_root_cid BLOB, + created_at INTEGER NOT NULL +); +` + +// addForgeRootColumn brings older schemas (without forge_root_cid) +// forward in place. Idempotent: if the column exists, the ALTER +// errors and we treat that as already-migrated. +const addForgeRootColumn = `ALTER TABLE buckets ADD COLUMN forge_root_cid BLOB` + +// SQL is a database/sql-backed Registry. Works with any SQL driver that +// supports the byte-blob and integer types used here. +type SQL struct { + db *sql.DB +} + +// NewSQL wraps an open *sql.DB and ensures the schema exists. 
+func NewSQL(db *sql.DB) (*SQL, error) {
+	if _, err := db.Exec(Schema); err != nil {
+		return nil, fmt.Errorf("registry: ensure schema: %w", err)
+	}
+	// Best-effort migration for older databases. The error case is the
+	// column already existing (driver-specific message), which is fine.
+	// NOTE(review): the "duplicate column" substring match depends on the
+	// driver's error text — confirm for any driver other than SQLite.
+	if _, err := db.Exec(addForgeRootColumn); err != nil {
+		if !strings.Contains(err.Error(), "duplicate column") {
+			return nil, fmt.Errorf("registry: add forge_root_cid: %w", err)
+		}
+	}
+	return &SQL{db: db}, nil
+}
+
+// Create inserts a bucket row with a NULL root and the given creation time.
+// If the INSERT fails and a follow-up Get finds the bucket, the failure is
+// reported as ErrExists; any other failure is returned wrapped.
+func (r *SQL) Create(ctx context.Context, name string, createdAt int64) error {
+	_, err := r.db.ExecContext(ctx,
+		`INSERT INTO buckets (name, root_cid, created_at) VALUES (?, NULL, ?)`,
+		name, createdAt)
+	if err != nil {
+		// Cheap, portable detection: a second Create with the same name will
+		// trip the PK. Different drivers wrap this error differently, so
+		// fall back to Get to distinguish.
+		if existing, gerr := r.Get(ctx, name); gerr == nil && existing != nil {
+			return ErrExists
+		}
+		return fmt.Errorf("registry: create %q: %w", name, err)
+	}
+	return nil
+}
+
+// Get loads the row for name. It returns ErrNotFound when no row matches and
+// an error when a stored CID column cannot be decoded.
+func (r *SQL) Get(ctx context.Context, name string) (*State, error) {
+	var rootBytes, forgeBytes []byte
+	var createdAt int64
+	err := r.db.QueryRowContext(ctx,
+		`SELECT root_cid, forge_root_cid, created_at FROM buckets WHERE name = ?`, name).
+		Scan(&rootBytes, &forgeBytes, &createdAt)
+	if errors.Is(err, sql.ErrNoRows) {
+		return nil, ErrNotFound
+	}
+	if err != nil {
+		return nil, fmt.Errorf("registry: get %q: %w", name, err)
+	}
+
+	st := &State{Name: name, CreatedAt: createdAt}
+	if err := setCid(&st.Root, rootBytes, name, "root_cid"); err != nil {
+		return nil, err
+	}
+	if err := setCid(&st.ForgeRoot, forgeBytes, name, "forge_root_cid"); err != nil {
+		return nil, err
+	}
+	return st, nil
+}
+
+// List returns every bucket row ordered by name ascending. The result is a
+// nil slice when the table is empty.
+func (r *SQL) List(ctx context.Context) ([]*State, error) {
+	rows, err := r.db.QueryContext(ctx,
+		`SELECT name, root_cid, forge_root_cid, created_at FROM buckets ORDER BY name ASC`)
+	if err != nil {
+		return nil, fmt.Errorf("registry: list: %w", err)
+	}
+	defer rows.Close()
+
+	var out []*State
+	for rows.Next() {
+		var name string
+		var rootBytes, forgeBytes []byte
+		var createdAt int64
+		if err := rows.Scan(&name, &rootBytes, &forgeBytes, &createdAt); err != nil {
+			return nil, fmt.Errorf("registry: list scan: %w", err)
+		}
+		st := &State{Name: name, CreatedAt: createdAt}
+		if err := setCid(&st.Root, rootBytes, name, "root_cid"); err != nil {
+			return nil, err
+		}
+		if err := setCid(&st.ForgeRoot, forgeBytes, name, "forge_root_cid"); err != nil {
+			return nil, err
+		}
+		out = append(out, st)
+	}
+	// Surface iteration errors that rows.Next reports only by returning false.
+	if err := rows.Err(); err != nil {
+		return nil, fmt.Errorf("registry: list rows: %w", err)
+	}
+	return out, nil
+}
+
+// setCid decodes raw into *dst. An empty or NULL column maps to cid.Undef;
+// name and field are used only to label the error when decoding fails.
+func setCid(dst *cid.Cid, raw []byte, name, field string) error {
+	if len(raw) == 0 {
+		*dst = cid.Undef
+		return nil
+	}
+	c, err := cid.Cast(raw)
+	if err != nil {
+		return fmt.Errorf("registry: bad %s for %q: %w", field, name, err)
+	}
+	*dst = c
+	return nil
+}
+
+// Delete removes the row for name, returning ErrNotFound when no row was
+// affected.
+func (r *SQL) Delete(ctx context.Context, name string) error {
+	res, err := r.db.ExecContext(ctx,
+		`DELETE FROM buckets WHERE name = ?`, name)
+	if err != nil {
+		return fmt.Errorf("registry: delete %q: %w", name, err)
+	}
+	n, err := res.RowsAffected()
+	if err != nil {
+		return fmt.Errorf("registry: delete rows: %w", err)
+	}
+	if n == 0 {
+		return ErrNotFound
+	}
+	return nil
+}
+
+// CASRoot updates root_cid to next only when the stored value equals expect.
+// An undefined expect matches a NULL column and an undefined next stores
+// NULL. When no row is updated, a follow-up Get decides between ErrNotFound
+// (bucket missing) and ErrConflict (root did not match).
+func (r *SQL) CASRoot(ctx context.Context, name string, expect, next cid.Cid) error {
+	var (
+		expectBytes []byte
+		nextBytes   []byte
+	)
+	if expect.Defined() {
+		expectBytes = expect.Bytes()
+	}
+	if next.Defined() {
+		nextBytes = next.Bytes()
+	}
+
+	var (
+		res sql.Result
+		err error
+	)
+	// SQL "= NULL" never matches, so the empty-root case needs IS NULL.
+	if expectBytes == nil {
+		res, err = r.db.ExecContext(ctx,
+			`UPDATE buckets SET root_cid = ? WHERE name = ? AND root_cid IS NULL`,
+			nextBytes, name)
+	} else {
+		res, err = r.db.ExecContext(ctx,
+			`UPDATE buckets SET root_cid = ? WHERE name = ? AND root_cid = ?`,
+			nextBytes, name, expectBytes)
+	}
+	if err != nil {
+		return fmt.Errorf("registry: cas %q: %w", name, err)
+	}
+	n, err := res.RowsAffected()
+	if err != nil {
+		return fmt.Errorf("registry: cas rows: %w", err)
+	}
+	if n == 0 {
+		// Either the bucket doesn't exist or the expected root didn't match.
+		if _, gerr := r.Get(ctx, name); errors.Is(gerr, ErrNotFound) {
+			return ErrNotFound
+		}
+		return ErrConflict
+	}
+	return nil
+}
+
+// SetForgeRoot overwrites forge_root_cid for name (NULL when root is
+// undefined), returning ErrNotFound when no row was affected.
+func (r *SQL) SetForgeRoot(ctx context.Context, name string, root cid.Cid) error {
+	var rootBytes []byte
+	if root.Defined() {
+		rootBytes = root.Bytes()
+	}
+	res, err := r.db.ExecContext(ctx,
+		`UPDATE buckets SET forge_root_cid = ? WHERE name = ?`,
+		rootBytes, name)
+	if err != nil {
+		return fmt.Errorf("registry: set forge root %q: %w", name, err)
+	}
+	n, err := res.RowsAffected()
+	if err != nil {
+		return fmt.Errorf("registry: set forge root rows: %w", err)
+	}
+	if n == 0 {
+		return ErrNotFound
+	}
+	return nil
+}
+
+// Compile-time assertion.
+var _ Registry = (*SQL)(nil) diff --git a/pkg/ms3t/server/handlers.go b/pkg/ms3t/server/handlers.go new file mode 100644 index 0000000..28de59e --- /dev/null +++ b/pkg/ms3t/server/handlers.go @@ -0,0 +1,281 @@ +package server + +import ( + "encoding/hex" + "errors" + "fmt" + "io" + "net/http" + "strconv" + "strings" + "time" + + "github.com/storacha/sprue/pkg/ms3t/bucket" +) + +const httpTimeFormat = http.TimeFormat // RFC1123 GMT + +// --- Buckets --- + +func (h *Handler) listBuckets(w http.ResponseWriter, r *http.Request) { + states, err := h.svc.ListBuckets(reqCtx(r)) + if err != nil { + writeServiceError(w, r, err) + return + } + resp := ListAllMyBucketsResult{ + Xmlns: s3Namespace, + Owner: bucketsOwner{ID: "ms3t", DisplayName: "ms3t"}, + } + for _, st := range states { + resp.Buckets.Bucket = append(resp.Buckets.Bucket, bucketEntry{ + Name: st.Name, + CreationDate: time.Unix(st.CreatedAt, 0).UTC().Format(time.RFC3339), + }) + } + writeXML(w, http.StatusOK, resp) +} + +func (h *Handler) createBucket(w http.ResponseWriter, r *http.Request, name string) { + err := h.svc.CreateBucket(reqCtx(r), name) + if err != nil && !errors.Is(err, bucket.ErrBucketExists) { + writeServiceError(w, r, err) + return + } + w.Header().Set("Location", "/"+name) + w.WriteHeader(http.StatusOK) +} + +func (h *Handler) deleteBucket(w http.ResponseWriter, r *http.Request, name string) { + if err := h.svc.DeleteBucket(reqCtx(r), name); err != nil { + writeServiceError(w, r, err) + return + } + w.WriteHeader(http.StatusNoContent) +} + +func (h *Handler) headBucket(w http.ResponseWriter, r *http.Request, name string) { + if _, err := h.svc.List(reqCtx(r), name, bucket.ListOptions{MaxKeys: 1}); err != nil { + writeServiceError(w, r, err) + return + } + w.WriteHeader(http.StatusOK) +} + +// --- Objects --- + +func (h *Handler) putObject(w http.ResponseWriter, r *http.Request, name, key string) { + defer r.Body.Close() + + // AWS SDKs default to chunked aws-chunked encoding, which we do NOT + 
// decode here. Clients must disable streaming/chunked uploads or upload + // small bodies in a single PUT. + if v := r.Header.Get("x-amz-content-sha256"); v == "STREAMING-AWS4-HMAC-SHA256-PAYLOAD" || v == "STREAMING-UNSIGNED-PAYLOAD-TRAILER" { + writeError(w, http.StatusNotImplemented, "NotImplemented", + "chunked aws-chunked uploads are not yet supported; configure the client to send unsigned/non-chunked payloads", + r.URL.Path) + return + } + + mf, err := h.svc.PutObject(reqCtx(r), name, key, r.Body, r.Header.Get("Content-Type")) + if err != nil { + writeServiceError(w, r, err) + return + } + w.Header().Set("ETag", etag(mf)) + w.WriteHeader(http.StatusOK) +} + +func (h *Handler) getObject(w http.ResponseWriter, r *http.Request, name, key string) { + rng, rangeErr := parseRange(r.Header.Get("Range")) + if rangeErr != nil { + writeError(w, http.StatusRequestedRangeNotSatisfiable, "InvalidRange", + "invalid Range header", r.URL.Path) + return + } + + body, mf, err := h.svc.GetObject(reqCtx(r), name, key, rng) + if err != nil { + if errors.Is(err, bucket.ErrInvalidRange) { + // We have the manifest; advertise the actual size for clients. 
+ if mf != nil { + w.Header().Set("Content-Range", fmt.Sprintf("bytes */%d", mf.Body.Size)) + } + writeError(w, http.StatusRequestedRangeNotSatisfiable, "InvalidRange", + "requested range not satisfiable", r.URL.Path) + return + } + writeServiceError(w, r, err) + return + } + defer body.Close() + + if rng != nil { + writeRangeHeaders(w, mf, rng) + w.WriteHeader(http.StatusPartialContent) + } else { + writeObjectHeaders(w, mf) + w.WriteHeader(http.StatusOK) + } + if _, err := io.Copy(w, body); err != nil { + h.log.Warn("getobject body copy", "err", err, "key", key) + } +} + +func (h *Handler) headObject(w http.ResponseWriter, r *http.Request, name, key string) { + mf, err := h.svc.HeadObject(reqCtx(r), name, key) + if err != nil { + writeServiceError(w, r, err) + return + } + writeObjectHeaders(w, mf) + w.WriteHeader(http.StatusOK) +} + +func (h *Handler) deleteObject(w http.ResponseWriter, r *http.Request, name, key string) { + if err := h.svc.DeleteObject(reqCtx(r), name, key); err != nil { + writeServiceError(w, r, err) + return + } + w.WriteHeader(http.StatusNoContent) +} + +func etag(mf *bucket.ObjectManifest) string { + return `"` + hex.EncodeToString(mf.Body.SHA256) + `"` +} + +func writeObjectHeaders(w http.ResponseWriter, mf *bucket.ObjectManifest) { + w.Header().Set("Content-Type", mf.ContentType) + w.Header().Set("Content-Length", strconv.FormatInt(mf.Body.Size, 10)) + w.Header().Set("ETag", etag(mf)) + w.Header().Set("Last-Modified", time.Unix(mf.Created, 0).UTC().Format(httpTimeFormat)) + w.Header().Set("Accept-Ranges", "bytes") +} + +func writeRangeHeaders(w http.ResponseWriter, mf *bucket.ObjectManifest, rng *bucket.Range) { + length := rng.End - rng.Start + 1 + w.Header().Set("Content-Type", mf.ContentType) + w.Header().Set("Content-Length", strconv.FormatInt(length, 10)) + w.Header().Set("ETag", etag(mf)) + w.Header().Set("Last-Modified", time.Unix(mf.Created, 0).UTC().Format(httpTimeFormat)) + w.Header().Set("Accept-Ranges", "bytes") + 
w.Header().Set("Content-Range", + fmt.Sprintf("bytes %d-%d/%d", rng.Start, rng.End, mf.Body.Size)) +} + +// parseRange handles the single-range subset of RFC 7233 that S3 supports: +// "bytes=START-END", "bytes=START-", or "bytes=-SUFFIX". Multi-range +// requests are rejected. Empty header → no range. +// +// Returns (rng, nil) for a valid range, (nil, nil) when no Range header +// is present, (nil, err) on a malformed header. The "suffix" form +// (bytes=-N) cannot be resolved without the body size and is returned +// with Start=-1; the bucket service applies it after loading the manifest. +func parseRange(h string) (*bucket.Range, error) { + if h == "" { + return nil, nil + } + if !strings.HasPrefix(h, "bytes=") { + return nil, errBadRange + } + spec := strings.TrimPrefix(h, "bytes=") + if strings.Contains(spec, ",") { + return nil, errBadRange // multi-range not supported + } + dash := strings.IndexByte(spec, '-') + if dash < 0 { + return nil, errBadRange + } + startStr := spec[:dash] + endStr := spec[dash+1:] + + var start, end int64 = -1, -1 + var err error + if startStr != "" { + start, err = strconv.ParseInt(startStr, 10, 64) + if err != nil || start < 0 { + return nil, errBadRange + } + } + if endStr != "" { + end, err = strconv.ParseInt(endStr, 10, 64) + if err != nil || end < 0 { + return nil, errBadRange + } + } + + switch { + case startStr != "" && endStr != "": + // "bytes=START-END" + if end < start { + return nil, errBadRange + } + return &bucket.Range{Start: start, End: end}, nil + case startStr != "" && endStr == "": + // "bytes=START-" — End resolved later by the service against Size. + return &bucket.Range{Start: start, End: -1}, nil + case startStr == "" && endStr != "": + // "bytes=-SUFFIX" — last N bytes; encoded as Start=-1, End=N. 
+		return &bucket.Range{Start: -1, End: end}, nil
+	default:
+		return nil, errBadRange
+	}
+}
+
+var errBadRange = errors.New("bad range header")
+
+// --- Listing ---
+
+// listObjects serves the bucket listing (GET on a bucket with no key). It
+// reads prefix, delimiter, start-after, continuation-token and max-keys from
+// the query string, asks the bucket service for one page, and writes a
+// ListBucketResult.
+//
+// max-keys defaults to 1000 when absent or unparsable, is capped at 1000,
+// and is rejected with 400 InvalidArgument when negative. A continuation
+// token, when present, takes precedence over start-after as the resume
+// point.
+func (h *Handler) listObjects(w http.ResponseWriter, r *http.Request, name string) {
+	// maxListKeys is both the default and the upper bound for one page.
+	const maxListKeys = 1000
+
+	q := r.URL.Query()
+
+	prefix := q.Get("prefix")
+	delimiter := q.Get("delimiter")
+	startAfter := q.Get("start-after")
+	token := q.Get("continuation-token")
+	maxKeys := parseInt(q.Get("max-keys"), maxListKeys)
+	if maxKeys < 0 {
+		writeError(w, http.StatusBadRequest, "InvalidArgument",
+			"max-keys must be a non-negative integer", r.URL.Path)
+		return
+	}
+	// Never let a client request an unbounded page from the service.
+	if maxKeys > maxListKeys {
+		maxKeys = maxListKeys
+	}
+
+	from := startAfter
+	if token != "" {
+		from = token
+	}
+
+	res, err := h.svc.List(reqCtx(r), name, bucket.ListOptions{
+		Prefix:     prefix,
+		Delimiter:  delimiter,
+		StartAfter: from,
+		MaxKeys:    maxKeys,
+	})
+	if err != nil {
+		writeServiceError(w, r, err)
+		return
+	}
+
+	resp := ListBucketResult{
+		Xmlns:             s3Namespace,
+		Name:              name,
+		Prefix:            prefix,
+		Delimiter:         delimiter,
+		MaxKeys:           maxKeys,
+		IsTruncated:       res.Truncated,
+		KeyCount:          len(res.Objects) + len(res.CommonPrefixes),
+		StartAfter:        startAfter,
+		ContinuationToken: token,
+	}
+	if res.Truncated {
+		resp.NextContinuationToken = res.NextToken
+	}
+	for _, mf := range res.Objects {
+		resp.Contents = append(resp.Contents, objectEntry{
+			Key:          mf.Key,
+			LastModified: time.Unix(mf.Created, 0).UTC().Format(time.RFC3339),
+			ETag:         etag(mf),
+			Size:         mf.Body.Size,
+			StorageClass: "STANDARD",
+		})
+	}
+	for _, cp := range res.CommonPrefixes {
+		resp.CommonPrefixes = append(resp.CommonPrefixes, commonPrefix{Prefix: cp})
+	}
+	writeXML(w, http.StatusOK, resp)
+}
diff --git a/pkg/ms3t/server/server.go b/pkg/ms3t/server/server.go
new file mode 100644
index 0000000..d267613
--- /dev/null
+++ b/pkg/ms3t/server/server.go
@@ -0,0 +1,132 @@
+// Package server exposes the bucket service over an S3-compatible HTTP API.
+// Path-style addressing only (clients must set forcePathStyle=true).
+//
+// Auth is intentionally not validated: the Authorization header is read and
+// logged so the request can be traced, but its contents are ignored.
Real +// auth is a future middleware; this matches the localstack/MinIO-test style +// of giving the SDK a credential to sign with. +package server + +import ( + "context" + "encoding/xml" + "errors" + "fmt" + "log/slog" + "net/http" + "strconv" + "strings" + + "github.com/storacha/sprue/pkg/ms3t/bucket" + "github.com/storacha/sprue/pkg/ms3t/registry" +) + +// Handler implements http.Handler over a *bucket.Service. +type Handler struct { + svc *bucket.Service + log *slog.Logger +} + +// New returns an http.Handler for the bucket service. +func New(svc *bucket.Service, log *slog.Logger) *Handler { + if log == nil { + log = slog.Default() + } + return &Handler{svc: svc, log: log} +} + +func (h *Handler) ServeHTTP(w http.ResponseWriter, r *http.Request) { + // Trim leading slash, split into at most 2 components. + path := strings.TrimPrefix(r.URL.Path, "/") + var bucketName, key string + if path != "" { + if i := strings.Index(path, "/"); i >= 0 { + bucketName, key = path[:i], path[i+1:] + } else { + bucketName = path + } + } + + h.log.Debug("s3 request", + "method", r.Method, + "bucket", bucketName, + "key", key, + "query", r.URL.RawQuery, + "auth", r.Header.Get("Authorization") != "") + + switch { + case bucketName == "" && r.Method == http.MethodGet: + h.listBuckets(w, r) + case key == "" && r.Method == http.MethodPut: + h.createBucket(w, r, bucketName) + case key == "" && r.Method == http.MethodDelete: + h.deleteBucket(w, r, bucketName) + case key == "" && r.Method == http.MethodGet: + h.listObjects(w, r, bucketName) + case key == "" && r.Method == http.MethodHead: + h.headBucket(w, r, bucketName) + case key != "" && r.Method == http.MethodPut: + h.putObject(w, r, bucketName, key) + case key != "" && r.Method == http.MethodGet: + h.getObject(w, r, bucketName, key) + case key != "" && r.Method == http.MethodHead: + h.headObject(w, r, bucketName, key) + case key != "" && r.Method == http.MethodDelete: + h.deleteObject(w, r, bucketName, key) + default: + 
writeError(w, http.StatusMethodNotAllowed, "MethodNotAllowed", + fmt.Sprintf("method %s not allowed for this resource", r.Method), r.URL.Path) + } +} + +// === Helpers === + +func writeXML(w http.ResponseWriter, status int, body any) { + w.Header().Set("Content-Type", "application/xml") + w.WriteHeader(status) + _, _ = w.Write([]byte(xml.Header)) + _ = xml.NewEncoder(w).Encode(body) +} + +func writeError(w http.ResponseWriter, status int, code, msg, resource string) { + writeXML(w, status, ErrorResponse{ + Code: code, Message: msg, Resource: resource, + }) +} + +func mapServiceError(err error) (status int, code, msg string) { + switch { + case errors.Is(err, bucket.ErrBucketNotFound): + return http.StatusNotFound, "NoSuchBucket", "The specified bucket does not exist" + case errors.Is(err, bucket.ErrObjectNotFound): + return http.StatusNotFound, "NoSuchKey", "The specified key does not exist" + case errors.Is(err, bucket.ErrBucketExists), errors.Is(err, registry.ErrExists): + return http.StatusConflict, "BucketAlreadyOwnedByYou", "Your previous request to create the named bucket succeeded" + case errors.Is(err, bucket.ErrInvalidBucket): + return http.StatusBadRequest, "InvalidBucketName", "The specified bucket is not valid" + case errors.Is(err, bucket.ErrInvalidKey): + return http.StatusBadRequest, "InvalidArgument", "Object key is invalid" + case errors.Is(err, bucket.ErrBucketNotEmpty): + return http.StatusConflict, "BucketNotEmpty", "The bucket you tried to delete is not empty" + default: + return http.StatusInternalServerError, "InternalError", err.Error() + } +} + +func writeServiceError(w http.ResponseWriter, r *http.Request, err error) { + status, code, msg := mapServiceError(err) + writeError(w, status, code, msg, r.URL.Path) +} + +func parseInt(s string, dflt int) int { + if s == "" { + return dflt + } + n, err := strconv.Atoi(s) + if err != nil { + return dflt + } + return n +} + +func reqCtx(r *http.Request) context.Context { return r.Context() } diff 
--git a/pkg/ms3t/server/xml.go b/pkg/ms3t/server/xml.go new file mode 100644 index 0000000..23d038a --- /dev/null +++ b/pkg/ms3t/server/xml.go @@ -0,0 +1,68 @@ +package server + +import "encoding/xml" + +// S3 XML response shapes. Field names and namespaces match the AWS S3 REST +// API documentation closely enough for the AWS SDK to parse them. + +const s3Namespace = "http://s3.amazonaws.com/doc/2006-03-01/" + +// ListAllMyBucketsResult is the body of GET / +type ListAllMyBucketsResult struct { + XMLName xml.Name `xml:"ListAllMyBucketsResult"` + Xmlns string `xml:"xmlns,attr"` + Owner bucketsOwner `xml:"Owner"` + Buckets bucketsBlock `xml:"Buckets"` +} + +type bucketsOwner struct { + ID string `xml:"ID"` + DisplayName string `xml:"DisplayName"` +} + +type bucketsBlock struct { + Bucket []bucketEntry `xml:"Bucket"` +} + +type bucketEntry struct { + Name string `xml:"Name"` + CreationDate string `xml:"CreationDate"` +} + +// ListBucketResult is the body of GET /?list-type=2 (V2). +type ListBucketResult struct { + XMLName xml.Name `xml:"ListBucketResult"` + Xmlns string `xml:"xmlns,attr"` + Name string `xml:"Name"` + Prefix string `xml:"Prefix"` + Delimiter string `xml:"Delimiter,omitempty"` + MaxKeys int `xml:"MaxKeys"` + IsTruncated bool `xml:"IsTruncated"` + KeyCount int `xml:"KeyCount"` + StartAfter string `xml:"StartAfter,omitempty"` + ContinuationToken string `xml:"ContinuationToken,omitempty"` + NextContinuationToken string `xml:"NextContinuationToken,omitempty"` + Contents []objectEntry `xml:"Contents"` + CommonPrefixes []commonPrefix `xml:"CommonPrefixes"` +} + +type objectEntry struct { + Key string `xml:"Key"` + LastModified string `xml:"LastModified"` + ETag string `xml:"ETag"` + Size int64 `xml:"Size"` + StorageClass string `xml:"StorageClass"` +} + +type commonPrefix struct { + Prefix string `xml:"Prefix"` +} + +// ErrorResponse is the body of any S3 error. 
+type ErrorResponse struct {
+	XMLName   xml.Name `xml:"Error"`
+	Code      string   `xml:"Code"`
+	Message   string   `xml:"Message"`
+	Resource  string   `xml:"Resource,omitempty"`
+	RequestID string   `xml:"RequestId,omitempty"`
+}
diff --git a/pkg/ms3t/uploader/forgeauth.go b/pkg/ms3t/uploader/forgeauth.go
new file mode 100644
index 0000000..f68a560
--- /dev/null
+++ b/pkg/ms3t/uploader/forgeauth.go
@@ -0,0 +1,96 @@
+package uploader
+
+import (
+	"errors"
+	"fmt"
+	"net/url"
+	"os"
+	"strings"
+
+	uclient "github.com/storacha/go-ucanto/client"
+	"github.com/storacha/go-ucanto/core/delegation"
+	"github.com/storacha/go-ucanto/did"
+	"github.com/storacha/go-ucanto/principal"
+	"github.com/storacha/go-ucanto/principal/ed25519/signer"
+	"github.com/storacha/go-ucanto/transport/car"
+	uhttp "github.com/storacha/go-ucanto/transport/http"
+	guppyclient "github.com/storacha/guppy/pkg/client"
+)
+
+// LoadOrCreateSigner reads a persisted principal.Signer from path or
+// generates and writes a fresh one if the file does not exist. The
+// on-disk format is the canonical did:key string representation
+// (signer.Format). Surrounding whitespace in the file (for example a
+// trailing newline left by an editor) is ignored when parsing.
+//
+// The returned signer's DID is what the operator passes to a delegator
+// when requesting a `space/blob/add` + `space/index/add` delegation.
+func LoadOrCreateSigner(path string) (principal.Signer, error) {
+	data, err := os.ReadFile(path)
+	if errors.Is(err, os.ErrNotExist) {
+		s, err := signer.Generate()
+		if err != nil {
+			return nil, fmt.Errorf("uploader: generate signer: %w", err)
+		}
+		formatted, err := signer.Format(s)
+		if err != nil {
+			return nil, fmt.Errorf("uploader: format signer: %w", err)
+		}
+		if err := os.WriteFile(path, []byte(formatted), 0o600); err != nil {
+			return nil, fmt.Errorf("uploader: persist signer: %w", err)
+		}
+		return s, nil
+	}
+	if err != nil {
+		return nil, fmt.Errorf("uploader: read signer: %w", err)
+	}
+	// Trim so a hand-edited key file with a trailing newline still parses.
+	s, err := signer.Parse(strings.TrimSpace(string(data)))
+	if err != nil {
+		return nil, fmt.Errorf("uploader: parse signer: %w", err)
+	}
+	return s, nil
+}
+
+// LoadDelegations reads a CAR-encoded delegation from path. The input
+// is expected to be a single delegation per file; callers needing
+// multiple delegations should pass multiple paths and concatenate the
+// results.
+func LoadDelegations(path string) ([]delegation.Delegation, error) {
+	data, err := os.ReadFile(path)
+	if err != nil {
+		return nil, fmt.Errorf("uploader: read delegation: %w", err)
+	}
+	d, err := delegation.Extract(data)
+	if err != nil {
+		return nil, fmt.Errorf("uploader: parse delegation %s: %w", path, err)
+	}
+	return []delegation.Delegation{d}, nil
+}
+
+// NewForgeClient assembles a guppy client targeting the given upload
+// service. servicePrincipal is the DID of the upload service (e.g.
+// sprue's did:web for production, or the local sprue did:key under
+// smelt). serviceURL is the HTTP endpoint for UCAN invocations.
+func NewForgeClient( + serviceURL *url.URL, + servicePrincipal did.DID, + s principal.Signer, + proofs []delegation.Delegation, +) (*guppyclient.Client, error) { + channel := uhttp.NewChannel(serviceURL) + codec := car.NewOutboundCodec() + conn, err := uclient.NewConnection(servicePrincipal, channel, uclient.WithOutboundCodec(codec)) + if err != nil { + return nil, fmt.Errorf("uploader: build connection: %w", err) + } + c, err := guppyclient.NewClient( + guppyclient.WithConnection(conn), + guppyclient.WithPrincipal(s), + guppyclient.WithAdditionalProofs(proofs...), + ) + if err != nil { + return nil, fmt.Errorf("uploader: build guppy client: %w", err) + } + return c, nil +} diff --git a/pkg/ms3t/uploader/guppy.go b/pkg/ms3t/uploader/guppy.go new file mode 100644 index 0000000..ddb4fd5 --- /dev/null +++ b/pkg/ms3t/uploader/guppy.go @@ -0,0 +1,149 @@ +package uploader + +import ( + "bytes" + "context" + "fmt" + "io" + + "github.com/storacha/sprue/pkg/ms3t/cars" + "github.com/ipfs/go-cid" + cidlink "github.com/ipld/go-ipld-prime/linking/cid" + block "github.com/ipfs/go-block-format" + "github.com/multiformats/go-multicodec" + "github.com/multiformats/go-multihash" + "github.com/storacha/go-libstoracha/blobindex" + "github.com/storacha/go-ucanto/did" + guppyclient "github.com/storacha/guppy/pkg/client" +) + +// placeholderCID is the smallest legal raw-codec CID with an +// identity-hashed two-byte payload. It mirrors guppy's internal +// PlaceholderCID and is used as the "root" for the +// ShardedDagIndexView and the SpaceIndexAdd invocation: the index's +// Content() field and SpaceIndexAdd's rootCID parameter aren't +// load-bearing for inner-CID lookups (per guppy's own usage), so +// instead of inventing a synthetic root for each multi-root CAR we +// just pass this placeholder through. 
var placeholderCID = cid.NewCidV1(cid.Raw, []byte{0x00, 0x00})

// Guppy is an Uploader that ships each Submit's CAR to Forge via the
// guppy client, then uploads a per-CAR index and registers it with
// the indexing-service so individual inner CIDs become resolvable.
//
// One Submit produces three Forge round trips:
//
//  1. SpaceBlobAdd of the CAR (one piri blob, multihash-keyed)
//  2. SpaceBlobAdd of the index (a small CAR encoding the inner
//     CID → byte-range mappings)
//  3. SpaceIndexAdd (registers the index → placeholder
//     root association with the indexer)
//
// Multi-root CARs ride as one logical batch. The index covers every
// inner block from every root; SpaceIndexAdd is called once per
// CAR, not once per root, since the rootCID parameter is treated as
// a placeholder by the upstream pattern.
//
// Synchronous: Submit blocks until all three calls have returned.
// Wrap in uploader.Batched if you want size/time-driven batching of
// multiple S3 ops into one CAR before each Submit fires.
type Guppy struct {
	client   *guppyclient.Client // used for SpaceBlobAdd and SpaceIndexAdd
	spaceDID did.DID             // space passed to every client call
}

// GuppyConfig wires a *guppyclient.Client (already configured with
// connection, principal, and delegation proofs) plus the destination
// space DID into a Guppy uploader.
type GuppyConfig struct {
	Client   *guppyclient.Client
	SpaceDID did.DID
}

// NewGuppy constructs a Guppy uploader from a configured client.
// It returns an error when Client is nil or SpaceDID is the zero DID.
func NewGuppy(cfg GuppyConfig) (*Guppy, error) {
	if cfg.Client == nil {
		return nil, fmt.Errorf("uploader: guppy client is required")
	}
	if cfg.SpaceDID == (did.DID{}) {
		return nil, fmt.Errorf("uploader: space DID is required")
	}
	return &Guppy{client: cfg.Client, spaceDID: cfg.SpaceDID}, nil
}

// Submit encodes blocks into a single CAR, uploads it, then builds,
// uploads, and registers an index that maps every inner block to its
// byte range inside that CAR.
//
// It returns an error when roots is empty, and returns nil without
// contacting Forge when blocks is empty. Any failing step aborts the
// remaining steps; earlier uploads are not rolled back.
func (g *Guppy) Submit(ctx context.Context, roots []cid.Cid, blocks []block.Block) error {
	if len(roots) == 0 {
		return fmt.Errorf("uploader: at least one root required")
	}
	if len(blocks) == 0 {
		return nil
	}

	// 1. Encode CAR + record each inner block's byte position.
	var carBuf bytes.Buffer
	positions, err := cars.WriteWithPositions(&carBuf, roots, blocks)
	if err != nil {
		return fmt.Errorf("uploader: encode car: %w", err)
	}
	carBytes := carBuf.Bytes()

	// The SHA2-256 digest of the CAR is both the precomputed upload
	// digest and the shard key used in the index below.
	carDigest, err := multihash.Sum(carBytes, multihash.SHA2_256, -1)
	if err != nil {
		return fmt.Errorf("uploader: hash car: %w", err)
	}

	// 2. Upload the CAR as one piri blob.
	if _, err := g.client.SpaceBlobAdd(ctx,
		bytes.NewReader(carBytes), g.spaceDID,
		guppyclient.WithPrecomputedDigest(carDigest, uint64(len(carBytes))),
	); err != nil {
		return fmt.Errorf("uploader: SpaceBlobAdd(car): %w", err)
	}

	// 3. Build a ShardedDagIndexView that points every inner CID at
	// its slice of the CAR. Single shard (the CAR we just uploaded),
	// placeholder content (see comment above placeholderCID).
	view := blobindex.NewShardedDagIndexView(cidlink.Link{Cid: placeholderCID}, 1)
	for _, p := range positions {
		view.SetSlice(carDigest, p.CID.Hash(), blobindex.Position{
			Offset: p.Offset,
			Length: p.Length,
		})
	}
	archReader, err := view.Archive()
	if err != nil {
		return fmt.Errorf("uploader: archive index: %w", err)
	}
	indexBytes, err := io.ReadAll(archReader)
	if err != nil {
		return fmt.Errorf("uploader: read archived index: %w", err)
	}
	indexDigest, err := multihash.Sum(indexBytes, multihash.SHA2_256, -1)
	if err != nil {
		return fmt.Errorf("uploader: hash index: %w", err)
	}

	// 4. Upload the index as its own piri blob.
	if _, err := g.client.SpaceBlobAdd(ctx,
		bytes.NewReader(indexBytes), g.spaceDID,
		guppyclient.WithPrecomputedDigest(indexDigest, uint64(len(indexBytes))),
	); err != nil {
		return fmt.Errorf("uploader: SpaceBlobAdd(index): %w", err)
	}

	// 5. Register the index with the indexing-service. The index CID
	// uses the CAR multicodec, since the index is itself a CAR
	// (matching how guppy frames its blobs).
	indexCID := cid.NewCidV1(uint64(multicodec.Car), indexDigest)
	if err := g.client.SpaceIndexAdd(ctx,
		indexCID, uint64(len(indexBytes)), placeholderCID, g.spaceDID,
	); err != nil {
		return fmt.Errorf("uploader: SpaceIndexAdd: %w", err)
	}
	return nil
}

// Flush is a no-op: Submit is synchronous, so nothing is ever buffered.
func (g *Guppy) Flush(context.Context) error { return nil }

// Close is a no-op; it always returns nil.
func (g *Guppy) Close(context.Context) error { return nil }

var _ Uploader = (*Guppy)(nil)
diff --git a/pkg/ms3t/uploader/internal.go b/pkg/ms3t/uploader/internal.go
new file mode 100644
index 0000000..982feab
--- /dev/null
+++ b/pkg/ms3t/uploader/internal.go
@@ -0,0 +1,393 @@
package uploader

import (
	"bytes"
	"context"
	"crypto/ed25519"
	"errors"
	"fmt"
	"io"
	nethttp "net/http"

	block "github.com/ipfs/go-block-format"
	"github.com/ipfs/go-cid"
	"github.com/ipld/go-ipld-prime/datamodel"
	"github.com/ipld/go-ipld-prime/fluent/qp"
	cidlink "github.com/ipld/go-ipld-prime/linking/cid"
	basicnode "github.com/ipld/go-ipld-prime/node/basic"
	"github.com/multiformats/go-multicodec"
	"github.com/multiformats/go-multihash"
	"github.com/storacha/go-libstoracha/blobindex"
	httpcap "github.com/storacha/go-libstoracha/capabilities/http"
	spaceblobcap "github.com/storacha/go-libstoracha/capabilities/space/blob"
	contentcap "github.com/storacha/go-libstoracha/capabilities/space/content"
	captypes "github.com/storacha/go-libstoracha/capabilities/types"
	"github.com/storacha/go-ucanto/core/delegation"
	"github.com/storacha/go-ucanto/core/invocation"
	"github.com/storacha/go-ucanto/did"
	"github.com/storacha/go-ucanto/principal"
	ed25519signer "github.com/storacha/go-ucanto/principal/ed25519/signer"
	"github.com/storacha/go-ucanto/ucan"
	"go.uber.org/zap"

	"github.com/storacha/sprue/pkg/indexerclient"
	"github.com/storacha/sprue/pkg/ms3t/cars"
	"github.com/storacha/sprue/pkg/piriclient"
	"github.com/storacha/sprue/pkg/routing"
)

// Internal is an Uploader that ships CARs to Forge from inside sprue,
// using sprue's own piriclient and indexerclient. No UCAN-over-HTTP
// loopback to sprue's own UCAN endpoint, no separate principal or
// delegation file: sprue's identity is the signer, and storage
// provider delegations are pulled live from sprue's routing service.
//
// One Submit:
//
//  1. Encode the CAR for this batch (with byte positions for each
//     inner block).
//  2. Allocate + HTTP PUT + Accept the CAR through a piri selected
//     by routing.Service.
//  3. Build a ShardedDagIndexView and archive it.
//  4. Allocate + HTTP PUT + Accept the index through a piri.
//  5. PublishIndexClaim against the indexing-service.
//
// Steps 2 and 4 share a helper that synthesizes the cause and put
// invocations that the existing space_blob_add handler builds from
// the inbound user UCAN. Here there's no inbound user UCAN — sprue's
// signer self-issues them so the audit shape matches.
type Internal struct {
	router        *routing.Service      // picks the storage provider for each blob
	piriProvider  piriclient.Provider   // builds a piri client for the selected provider
	indexerClient *indexerclient.Client // publishes the index claim
	signer        principal.Signer      // sprue's identity; issues the synthesized cause invocation
	spaceSigner   principal.Signer      // space keypair; its DID is the space, and it issues the retrieval delegation
	httpClient    *nethttp.Client       // performs the blob PUT to the allocated address
	logger        *zap.Logger
}

// InternalConfig wires sprue's existing services into an Internal
// uploader. All fields are required except HTTPClient and Logger,
// which NewInternal defaults to nethttp.DefaultClient and a no-op
// logger respectively.
//
// Signer is sprue's upload-service identity — used for piriclient
// invocations and as the audience of the self-issued retrieval
// delegation.
//
// SpaceSigner is the keypair of the space ms3t owns. ms3t generates
// and persists this on first run; its DID is the space resource for
// every PUT, and it acts as the root authority for self-issued
// space/content/retrieve delegations (so the indexer can fetch the
// index blob from piri on assert/index validation).
+type InternalConfig struct { + Router *routing.Service + PiriProvider piriclient.Provider + IndexerClient *indexerclient.Client + Signer principal.Signer + SpaceSigner principal.Signer + HTTPClient *nethttp.Client // optional; defaults to nethttp.DefaultClient + Logger *zap.Logger +} + +// NewInternal validates the config and returns an Uploader that +// writes through sprue's internal services. +func NewInternal(cfg InternalConfig) (*Internal, error) { + if cfg.Router == nil { + return nil, errors.New("uploader: routing service is required") + } + if cfg.PiriProvider == nil { + return nil, errors.New("uploader: piri provider is required") + } + if cfg.IndexerClient == nil { + return nil, errors.New("uploader: indexer client is required") + } + if cfg.Signer == nil { + return nil, errors.New("uploader: signer is required") + } + if cfg.SpaceSigner == nil { + return nil, errors.New("uploader: space signer is required") + } + httpc := cfg.HTTPClient + if httpc == nil { + httpc = nethttp.DefaultClient + } + logger := cfg.Logger + if logger == nil { + logger = zap.NewNop() + } + return &Internal{ + router: cfg.Router, + piriProvider: cfg.PiriProvider, + indexerClient: cfg.IndexerClient, + signer: cfg.Signer, + spaceSigner: cfg.SpaceSigner, + httpClient: httpc, + logger: logger, + }, nil +} + +// SpaceDID returns the DID of the space ms3t owns. +func (u *Internal) SpaceDID() did.DID { return u.spaceSigner.DID() } + +func (u *Internal) Submit(ctx context.Context, roots []cid.Cid, blocks []block.Block) error { + if len(roots) == 0 { + return errors.New("uploader: at least one root required") + } + if len(blocks) == 0 { + return nil + } + + // 1. Encode CAR + record positions. 
+ var carBuf bytes.Buffer + positions, err := cars.WriteWithPositions(&carBuf, roots, blocks) + if err != nil { + return fmt.Errorf("uploader: encode car: %w", err) + } + carBytes := carBuf.Bytes() + carDigest, err := multihash.Sum(carBytes, multihash.SHA2_256, -1) + if err != nil { + return fmt.Errorf("uploader: hash car: %w", err) + } + + // 2. Allocate + PUT + Accept the data CAR. + if err := u.uploadBlob(ctx, carBytes, carDigest); err != nil { + return fmt.Errorf("uploader: ship car: %w", err) + } + + // 3. Build a ShardedDagIndexView keyed off the CAR's multihash. + view := blobindex.NewShardedDagIndexView(cidlink.Link{Cid: placeholderCID}, 1) + for _, p := range positions { + view.SetSlice(carDigest, p.CID.Hash(), blobindex.Position{ + Offset: p.Offset, + Length: p.Length, + }) + } + archReader, err := view.Archive() + if err != nil { + return fmt.Errorf("uploader: archive index: %w", err) + } + indexBytes, err := io.ReadAll(archReader) + if err != nil { + return fmt.Errorf("uploader: read archived index: %w", err) + } + indexDigest, err := multihash.Sum(indexBytes, multihash.SHA2_256, -1) + if err != nil { + return fmt.Errorf("uploader: hash index: %w", err) + } + + // 4. Allocate + PUT + Accept the index blob. + if err := u.uploadBlob(ctx, indexBytes, indexDigest); err != nil { + return fmt.Errorf("uploader: ship index: %w", err) + } + + // 5. Publish the index claim. The indexer needs to fetch our + // index blob from piri to validate the assertion, and piri + // requires UCAN auth on retrieval. We self-issue a + // space/content/retrieve delegation scoped to this specific + // index blob and pass it as clientAuth; sprue's + // indexerclient re-delegates from us to the indexer using + // that as the proof chain (mirrors the user-facing flow, + // just with sprue's signer playing the user's role). 
+ indexCID := cid.NewCidV1(uint64(multicodec.Car), indexDigest) + retrievalAuth, err := contentcap.Retrieve.Delegate( + u.spaceSigner, // issuer = space owner (root authority) + u.signer, // audience = sprue (next hop) + u.SpaceDID().String(), + contentcap.RetrieveCaveats{ + Blob: contentcap.BlobDigest{Digest: indexDigest}, + Range: contentcap.Range{Start: 0, End: uint64(len(indexBytes)) - 1}, + }, + delegation.WithNoExpiration(), + ) + if err != nil { + return fmt.Errorf("uploader: build retrieval auth: %w", err) + } + if err := u.indexerClient.PublishIndexClaim(ctx, u.SpaceDID(), placeholderCID, indexCID, retrievalAuth); err != nil { + return fmt.Errorf("uploader: publish index claim: %w", err) + } + return nil +} + +func (u *Internal) Flush(context.Context) error { return nil } +func (u *Internal) Close(context.Context) error { return nil } + +// uploadBlob runs the allocate → PUT → accept dance for one blob. +// Retries the allocate on ErrCandidateUnavailable by excluding failed +// providers, mirroring sprue's space_blob_add handler. +func (u *Internal) uploadBlob(ctx context.Context, data []byte, digest multihash.Multihash) error { + blob := captypes.Blob{Digest: digest, Size: uint64(len(data))} + + // Synthesize a self-issued space/blob/add invocation as the cause. + // Its link feeds the audit chain piri's handlers expect; never sent + // over the wire. 
+ causeInv, err := spaceblobcap.Add.Invoke( + u.signer, u.signer, u.SpaceDID().String(), + spaceblobcap.AddCaveats{Blob: blob}, + ) + if err != nil { + return fmt.Errorf("synthesize cause: %w", err) + } + cause := causeInv.Link() + + var exclusions []ucan.Principal + for { + provider, err := u.router.SelectStorageProvider(ctx, blob, routing.WithExclusions(exclusions...)) + if err != nil { + return fmt.Errorf("select provider: %w", err) + } + log := u.logger.With( + zap.Stringer("provider", provider.ID.DID()), + zap.String("endpoint", provider.Endpoint.String()), + ) + + client, err := u.piriProvider.Client(provider.ID, provider.Endpoint) + if err != nil { + return fmt.Errorf("piri client: %w", err) + } + fetcher := internalDelegationFetcher{proof: provider.Proof} + + allocResp, allocInv, _, err := client.Allocate(ctx, &piriclient.AllocateRequest{ + Space: u.SpaceDID(), + Digest: digest, + Size: blob.Size, + Cause: cause, + }, fetcher) + if err != nil { + if errors.Is(err, routing.ErrCandidateUnavailable) { + log.Warn("provider unavailable, excluding and retrying", zap.Error(err)) + exclusions = append(exclusions, provider.ID) + continue + } + return fmt.Errorf("allocate: %w", err) + } + + // PUT bytes if piri allocated a fresh slot. If Address is nil + // piri already has the blob; skip the upload. + if allocResp.Address != nil { + if err := httpPut(ctx, u.httpClient, allocResp.Address.URL.String(), allocResp.Address.Headers, data); err != nil { + return fmt.Errorf("http put: %w", err) + } + } + + // Synthesize the http/put invocation (matches genPut in + // sprue/pkg/service/handlers/space_blob_add.go) so Accept has + // a stable Put link to chain off. 
+ putInv, err := synthesizePut(blob, allocInv) + if err != nil { + return fmt.Errorf("synthesize put: %w", err) + } + + if _, _, _, err := client.Accept(ctx, &piriclient.AcceptRequest{ + Space: u.SpaceDID(), + Digest: digest, + Size: blob.Size, + Put: putInv.Link(), + }, fetcher); err != nil { + return fmt.Errorf("accept: %w", err) + } + return nil + } +} + +// synthesizePut mirrors genPut in space_blob_add.go: derive a +// principal from the blob's digest, issue an http/put invocation +// with caveats that promise to fulfill from the alloc invocation's +// effects. The invocation is never executed; we only need its Link +// for AcceptRequest.Put. +func synthesizePut(blob captypes.Blob, allocInv invocation.Invocation) (invocation.Invocation, error) { + provider, err := deriveDIDFromDigest(blob.Digest) + if err != nil { + return nil, err + } + fct := httpPutFact{id: provider.DID().String(), key: provider.Encode()} + return httpcap.Put.Invoke( + provider, provider, provider.DID().String(), + httpcap.PutCaveats{ + URL: captypes.Promise{ + UcanAwait: captypes.Await{ + Selector: ".out.ok.address.url", + Link: allocInv.Link(), + }, + }, + Headers: captypes.Promise{ + UcanAwait: captypes.Await{ + Selector: ".out.ok.address.headers", + Link: allocInv.Link(), + }, + }, + Body: httpcap.Body{Digest: blob.Digest, Size: blob.Size}, + }, + delegation.WithFacts([]ucan.FactBuilder{fct}), + ) +} + +func httpPut(ctx context.Context, client *nethttp.Client, urlStr string, headers nethttp.Header, body []byte) error { + req, err := nethttp.NewRequestWithContext(ctx, nethttp.MethodPut, urlStr, bytes.NewReader(body)) + if err != nil { + return err + } + for k, v := range headers { + if len(v) > 0 { + req.Header.Set(k, v[0]) + } + } + resp, err := client.Do(req) + if err != nil { + return err + } + defer resp.Body.Close() + if resp.StatusCode < 200 || resp.StatusCode >= 300 { + return fmt.Errorf("http put status %s", resp.Status) + } + return nil +} + +// internalDelegationFetcher matches 
the shape of the unexported
// delegationFetcher in space_blob_add.go: returns the storage
// provider's pre-issued delegation when the audience matches.
type internalDelegationFetcher struct {
	proof delegation.Delegation
}

// GetDelegation returns the stored proof when its audience DID equals
// the requested audience's DID. With no stored proof it returns
// (nil, nil); on an audience mismatch it returns an error naming both
// DIDs. ctx is unused.
func (df internalDelegationFetcher) GetDelegation(ctx context.Context, audience ucan.Principal) (delegation.Delegation, error) {
	if df.proof == nil {
		return nil, nil
	}
	if df.proof.Audience().DID() != audience.DID() {
		return nil, fmt.Errorf("delegation audience is %s, but invocation requires proof with audience %s",
			df.proof.Audience().DID(), audience.DID())
	}
	return df.proof, nil
}

// deriveDIDFromDigest mirrors deriveDID in space_blob_add.go. The
// derived principal is deterministic per digest.
//
// The ed25519 seed is the trailing ed25519.SeedSize bytes of the
// multihash; digests shorter than that are rejected.
func deriveDIDFromDigest(digest multihash.Multihash) (principal.Signer, error) {
	if len(digest) < ed25519.SeedSize {
		return nil, fmt.Errorf("digest too short for ed25519 seed: %d < %d", len(digest), ed25519.SeedSize)
	}
	seed := digest[len(digest)-ed25519.SeedSize:]
	pk := ed25519.NewKeyFromSeed(seed)
	return ed25519signer.FromRaw(pk)
}

// httpPutFact mirrors the unexported fact in space_blob_add.go.
// Embeds the derived principal's keys so downstream actors can
// re-derive and sign receipts.
type httpPutFact struct {
	id  string // map key under "keys"; synthesizePut passes the derived principal's DID string
	key []byte // map value under "keys"; synthesizePut passes the derived principal's Encode() bytes
}

// ToIPLD renders the fact as {"keys": {<id>: <key bytes>}}.
func (hpf httpPutFact) ToIPLD() (map[string]datamodel.Node, error) {
	keys, err := qp.BuildMap(basicnode.Prototype.Any, 1, func(ma datamodel.MapAssembler) {
		qp.MapEntry(ma, hpf.id, qp.Bytes(hpf.key))
	})
	if err != nil {
		return nil, err
	}
	return map[string]datamodel.Node{
		"keys": keys,
	}, nil
}

var _ Uploader = (*Internal)(nil)
diff --git a/pkg/ms3t/uploader/uploader.go b/pkg/ms3t/uploader/uploader.go
new file mode 100644
index 0000000..e0d1fa0
--- /dev/null
+++ b/pkg/ms3t/uploader/uploader.go
@@ -0,0 +1,285 @@
// Package uploader hands batches of IPLD blocks off to durable storage as
// CAR files. The interface separates submission (queueing a logical PUT's
// blocks) from flushing (forcing buffered work out), so a buffered
// implementation can amortize many small S3 ops into one larger upload
// without changing the caller's flow.
package uploader

import (
	"bytes"
	"context"
	"errors"
	"fmt"
	"os"
	"path/filepath"
	"sync"
	"time"

	"github.com/storacha/sprue/pkg/ms3t/cars"
	block "github.com/ipfs/go-block-format"
	"github.com/ipfs/go-cid"
)

// Uploader is the seam between the bucket service and durable storage.
//
// Submit hands one logical batch of blocks (typically the result of a
// single S3 op) to the uploader along with the root CID(s) that
// summarize what was written. The implementation may flush immediately
// or buffer and flush later.
//
// Flush forces any buffered work out to durable storage. Callers use
// this for explicit boundaries (multipart Complete, shutdown) or
// recovery loops.
//
// Close flushes any remaining buffered work and releases resources
// (background goroutines, file handles, network clients).
type Uploader interface {
	Submit(ctx context.Context, roots []cid.Cid, blocks []block.Block) error
	Flush(ctx context.Context) error
	Close(ctx context.Context) error
}

// === Disk ===

// Disk is a synchronous Uploader that writes one CAR file per Submit
// call into a directory. Useful for development, debugging, and as the
// inner sink of a Batched uploader.
type Disk struct {
	dir string // target directory for CAR files

	mu    sync.Mutex // guards count
	count uint64     // unique suffix for files when collisions could occur
}

// NewDisk creates the target directory if needed and returns a Disk
// uploader.
+func NewDisk(dir string) (*Disk, error) { + if err := os.MkdirAll(dir, 0o755); err != nil { + return nil, fmt.Errorf("uploader: mkdir %s: %w", dir, err) + } + return &Disk{dir: dir}, nil +} + +func (d *Disk) Submit(_ context.Context, roots []cid.Cid, blocks []block.Block) error { + if len(roots) == 0 { + return errors.New("uploader: at least one root required") + } + if len(blocks) == 0 { + return nil + } + + var carBuf bytes.Buffer + if err := cars.Write(&carBuf, roots, blocks); err != nil { + return fmt.Errorf("uploader: encode car: %w", err) + } + + final := filepath.Join(d.dir, d.fileName(roots)) + tmp, err := os.CreateTemp(d.dir, ".tmp-*.car") + if err != nil { + return fmt.Errorf("uploader: tmpfile: %w", err) + } + tmpPath := tmp.Name() + committed := false + defer func() { + if !committed { + _ = tmp.Close() + _ = os.Remove(tmpPath) + } + }() + + if _, err := tmp.Write(carBuf.Bytes()); err != nil { + return fmt.Errorf("uploader: write: %w", err) + } + if err := tmp.Sync(); err != nil { + return fmt.Errorf("uploader: sync: %w", err) + } + if err := tmp.Close(); err != nil { + return fmt.Errorf("uploader: close: %w", err) + } + if err := os.Rename(tmpPath, final); err != nil { + return fmt.Errorf("uploader: rename: %w", err) + } + committed = true + return nil +} + +func (d *Disk) Flush(context.Context) error { return nil } +func (d *Disk) Close(context.Context) error { return nil } + +// fileName produces a human-recognizable filename for a CAR, derived +// from the first root and the total root count for multi-root batches. +func (d *Disk) fileName(roots []cid.Cid) string { + first := roots[0].String() + if len(roots) == 1 { + return first + ".car" + } + d.mu.Lock() + d.count++ + n := d.count + d.mu.Unlock() + return fmt.Sprintf("%s+%d-%d.car", first, len(roots)-1, n) +} + +// === Noop === + +// Noop discards all submissions. Useful for tests/benchmarks. 
+type Noop struct{} + +func (Noop) Submit(context.Context, []cid.Cid, []block.Block) error { return nil } +func (Noop) Flush(context.Context) error { return nil } +func (Noop) Close(context.Context) error { return nil } + +// === Batched === + +// BatchedOptions configures a Batched uploader. +type BatchedOptions struct { + // MaxBytes triggers a flush when the buffered block bytes exceed + // this size. 0 → 64 MiB. + MaxBytes int64 + // MaxAge triggers a flush when the time since the last submit + // exceeds this duration. 0 → 5 seconds. + MaxAge time.Duration + // CheckInterval is how often the background loop wakes to evaluate + // the time-based threshold. 0 → MaxAge / 4 (clamped to a minimum). + CheckInterval time.Duration +} + +func (o *BatchedOptions) defaults() { + if o.MaxBytes <= 0 { + o.MaxBytes = 64 << 20 + } + if o.MaxAge <= 0 { + o.MaxAge = 5 * time.Second + } + if o.CheckInterval <= 0 { + o.CheckInterval = o.MaxAge / 4 + if o.CheckInterval < 100*time.Millisecond { + o.CheckInterval = 100 * time.Millisecond + } + } +} + +// Batched buffers Submit calls in memory and flushes them to an inner +// Uploader as one combined batch when a size or time threshold is hit. +// Multiple roots accumulate; the eventual CAR has all of them. +// +// Crash recovery: Batched does not persist its in-memory queue. If the +// process dies, blocks that were Submitted but not yet Flushed remain +// in the underlying blockstore (canonical) but were not shipped via +// the inner Uploader. Recovery is the responsibility of the caller — +// see bucket.Service.Recover. +type Batched struct { + inner Uploader + opts BatchedOptions + + mu sync.Mutex + rootSet map[cid.Cid]struct{} + roots []cid.Cid + blockSet map[cid.Cid]struct{} + blocks []block.Block + pendingBytes int64 + lastSubmit time.Time + + stop chan struct{} + done chan struct{} +} + +// NewBatched wraps inner with size+time-driven flushing. 
+func NewBatched(inner Uploader, opts BatchedOptions) *Batched { + opts.defaults() + b := &Batched{ + inner: inner, + opts: opts, + rootSet: map[cid.Cid]struct{}{}, + blockSet: map[cid.Cid]struct{}{}, + stop: make(chan struct{}), + done: make(chan struct{}), + } + go b.loop() + return b +} + +func (b *Batched) Submit(ctx context.Context, roots []cid.Cid, blocks []block.Block) error { + if len(roots) == 0 { + return errors.New("uploader: at least one root required") + } + + b.mu.Lock() + for _, r := range roots { + if _, ok := b.rootSet[r]; !ok { + b.rootSet[r] = struct{}{} + b.roots = append(b.roots, r) + } + } + for _, blk := range blocks { + c := blk.Cid() + if _, ok := b.blockSet[c]; !ok { + b.blockSet[c] = struct{}{} + b.blocks = append(b.blocks, blk) + b.pendingBytes += int64(len(blk.RawData())) + } + } + b.lastSubmit = time.Now() + overSize := b.pendingBytes >= b.opts.MaxBytes + b.mu.Unlock() + + if overSize { + return b.Flush(ctx) + } + return nil +} + +func (b *Batched) Flush(ctx context.Context) error { + b.mu.Lock() + if len(b.blocks) == 0 { + b.mu.Unlock() + return nil + } + roots := b.roots + blocks := b.blocks + b.roots = nil + b.blocks = nil + b.rootSet = map[cid.Cid]struct{}{} + b.blockSet = map[cid.Cid]struct{}{} + b.pendingBytes = 0 + b.mu.Unlock() + + return b.inner.Submit(ctx, roots, blocks) +} + +func (b *Batched) Close(ctx context.Context) error { + close(b.stop) + <-b.done + if err := b.Flush(ctx); err != nil { + return err + } + return b.inner.Close(ctx) +} + +func (b *Batched) loop() { + defer close(b.done) + ticker := time.NewTicker(b.opts.CheckInterval) + defer ticker.Stop() + for { + select { + case <-b.stop: + return + case <-ticker.C: + b.mu.Lock() + shouldFlush := len(b.blocks) > 0 && !b.lastSubmit.IsZero() && + time.Since(b.lastSubmit) >= b.opts.MaxAge + b.mu.Unlock() + if shouldFlush { + _ = b.Flush(context.Background()) + } + } + } +} + +// === Compile-time assertions === + +var ( + _ Uploader = (*Disk)(nil) + _ Uploader = Noop{} + 
_ Uploader = (*Batched)(nil) +) From 6e643a910a01a093f0bcd958fffbb652b5b27202 Mon Sep 17 00:00:00 2001 From: frrist Date: Fri, 1 May 2026 14:21:20 -0700 Subject: [PATCH 2/3] feat: lsm tree style buffering/aggregation - s3 versitygw wired in with pass/fail sets --- config.example.yaml | 38 +- go.mod | 90 ++- go.sum | 224 ++++-- internal/config/config.go | 68 +- internal/fx/app.go | 3 +- internal/fx/ms3t.go | 314 -------- pkg/ms3t/architectural.md | 62 ++ pkg/ms3t/blockstore/buffered.go | 111 --- pkg/ms3t/blockstore/forge.go | 29 +- pkg/ms3t/blockstore/layered.go | 81 ++ pkg/ms3t/blockstore/log.go | 54 ++ pkg/ms3t/blockstore/sqlite.go | 69 -- pkg/ms3t/blockstore/staging.go | 137 ++++ pkg/ms3t/blockstore/staging_test.go | 211 +++++ pkg/ms3t/blockstore/store.go | 123 +++ pkg/ms3t/blockstore/walk.go | 63 -- pkg/ms3t/bucket/bucket.go | 518 ------------ pkg/ms3t/bucket/cbor_gen.go | 250 ++++-- pkg/ms3t/bucket/chunker.go | 200 +++-- pkg/ms3t/bucket/manifest.go | 39 +- pkg/ms3t/bucketop/bucketop.go | 340 ++++++++ pkg/ms3t/cars/encoder.go | 41 + pkg/ms3t/cars/reader.go | 161 ++++ pkg/ms3t/gen/main.go | 9 +- pkg/ms3t/logstore/config.go | 75 ++ pkg/ms3t/logstore/recovery.go | 199 +++++ pkg/ms3t/logstore/segment.go | 826 ++++++++++++++++++++ pkg/ms3t/logstore/store.go | 430 ++++++++++ pkg/ms3t/logstore/store_test.go | 501 ++++++++++++ pkg/ms3t/logstore/types.go | 108 +++ pkg/ms3t/migrations/migrations.go | 64 ++ pkg/ms3t/migrations/sql/00001_init.sql | 20 + pkg/ms3t/migrations/sql/00002_segments.sql | 49 ++ pkg/ms3t/module.go | 190 +++++ pkg/ms3t/mst/diff.go | 11 +- pkg/ms3t/mst/mst.go | 36 +- pkg/ms3t/mst/mst_util.go | 25 +- pkg/ms3t/registry/postgres.go | 180 +++++ pkg/ms3t/registry/segments.go | 277 +++++++ pkg/ms3t/registry/sqlite.go | 212 ----- pkg/ms3t/s3frontend/backend.go | 89 +++ pkg/ms3t/s3frontend/bucket.go | 211 +++++ pkg/ms3t/s3frontend/object.go | 441 +++++++++++ pkg/ms3t/server.go | 319 ++++++++ pkg/ms3t/server/handlers.go | 281 ------- pkg/ms3t/server/server.go 
| 132 ---- pkg/ms3t/server/xml.go | 68 -- pkg/ms3t/testing/harness.go | 443 +++++++++++ pkg/ms3t/testing/harness_test.go | 52 ++ pkg/ms3t/testing/integration.go | 228 ++++++ pkg/ms3t/testing/smoke_test.go | 374 +++++++++ pkg/ms3t/uploader/{internal.go => forge.go} | 165 +++- pkg/ms3t/uploader/forgeauth.go | 92 --- pkg/ms3t/uploader/guppy.go | 149 ---- pkg/ms3t/uploader/uploader.go | 285 ------- pkg/ms3t/util.go | 42 + 56 files changed, 7108 insertions(+), 2701 deletions(-) delete mode 100644 internal/fx/ms3t.go delete mode 100644 pkg/ms3t/blockstore/buffered.go create mode 100644 pkg/ms3t/blockstore/layered.go create mode 100644 pkg/ms3t/blockstore/log.go delete mode 100644 pkg/ms3t/blockstore/sqlite.go create mode 100644 pkg/ms3t/blockstore/staging.go create mode 100644 pkg/ms3t/blockstore/staging_test.go create mode 100644 pkg/ms3t/blockstore/store.go delete mode 100644 pkg/ms3t/blockstore/walk.go delete mode 100644 pkg/ms3t/bucket/bucket.go create mode 100644 pkg/ms3t/bucketop/bucketop.go create mode 100644 pkg/ms3t/cars/reader.go create mode 100644 pkg/ms3t/logstore/config.go create mode 100644 pkg/ms3t/logstore/recovery.go create mode 100644 pkg/ms3t/logstore/segment.go create mode 100644 pkg/ms3t/logstore/store.go create mode 100644 pkg/ms3t/logstore/store_test.go create mode 100644 pkg/ms3t/logstore/types.go create mode 100644 pkg/ms3t/migrations/migrations.go create mode 100644 pkg/ms3t/migrations/sql/00001_init.sql create mode 100644 pkg/ms3t/migrations/sql/00002_segments.sql create mode 100644 pkg/ms3t/module.go create mode 100644 pkg/ms3t/registry/postgres.go create mode 100644 pkg/ms3t/registry/segments.go delete mode 100644 pkg/ms3t/registry/sqlite.go create mode 100644 pkg/ms3t/s3frontend/backend.go create mode 100644 pkg/ms3t/s3frontend/bucket.go create mode 100644 pkg/ms3t/s3frontend/object.go create mode 100644 pkg/ms3t/server.go delete mode 100644 pkg/ms3t/server/handlers.go delete mode 100644 pkg/ms3t/server/server.go delete mode 100644 
pkg/ms3t/server/xml.go create mode 100644 pkg/ms3t/testing/harness.go create mode 100644 pkg/ms3t/testing/harness_test.go create mode 100644 pkg/ms3t/testing/integration.go create mode 100644 pkg/ms3t/testing/smoke_test.go rename pkg/ms3t/uploader/{internal.go => forge.go} (68%) delete mode 100644 pkg/ms3t/uploader/forgeauth.go delete mode 100644 pkg/ms3t/uploader/guppy.go delete mode 100644 pkg/ms3t/uploader/uploader.go create mode 100644 pkg/ms3t/util.go diff --git a/config.example.yaml b/config.example.yaml index c06c281..e8a1c01 100644 --- a/config.example.yaml +++ b/config.example.yaml @@ -119,32 +119,18 @@ ms3t: # must set forcePathStyle=true and disable streaming uploads # (request_checksum_calculation=when_required for AWS CLI v2.23+). addr: ":9000" - # ms3t persists its SQLite database and (when forge is disabled) - # CAR files under this directory. Created if missing. + # ms3t persists its log segments and space keypair under this + # directory. Created if missing. data_dir: "./ms3t-data" # Body chunk size in bytes for new objects. Default 1 MiB. chunk_size: 1048576 - # Uploader: flush a CAR after this many buffered bytes. Default 64 MiB. - batch_bytes: 67108864 - # Uploader: flush a CAR after this idle duration. Default 5s. - batch_age: "5s" - - forge: - # When true, every batched CAR is shipped to piri through - # sprue's routing, piriclient, and indexerclient — no - # UCAN-over-HTTP loopback. ms3t reuses sprue's identity as the - # signer; no separate principal or delegation file is needed. - # When false, CARs are written to data_dir/cars only. - enabled: false - # Path to ms3t's persisted space keypair. Generated on first - # run if missing. ms3t is the root UCAN authority over its own - # space, which lets self-issued retrieval delegations validate - # against piri's auth on index-blob fetches. - # Defaults to /space.key. 
- space_key_file: "" - # When true, all block reads (MST nodes, manifests, body - # chunks) go through indexer→piri instead of a local SQLite - # cache, AND writes go synchronously to Forge (Batched is - # bypassed). Closes the read-after-write race; raises per-PUT - # latency to the Forge round trip cost. Requires enabled = true. - no_cache: false + # Log: seal the open segment when its on-disk size reaches this + # threshold. Default 64 MiB. + seal_bytes: 67108864 + # Log: seal the open segment when it has been open this long, even + # if size has not been reached. Default 5s. + seal_age: "5s" + # Log: number of most-recent sealed segments to keep on disk after + # they have been shipped to Forge. Older flushed segments are + # unlinked. Higher values trade disk for read locality. Default 6. + retain: 6 diff --git a/go.mod b/go.mod index b21fef5..83a2619 100644 --- a/go.mod +++ b/go.mod @@ -3,13 +3,14 @@ module github.com/storacha/sprue go 1.25.3 require ( - github.com/aws/aws-sdk-go-v2 v1.41.3 - github.com/aws/aws-sdk-go-v2/config v1.32.11 - github.com/aws/aws-sdk-go-v2/credentials v1.19.11 + github.com/aws/aws-sdk-go-v2 v1.41.6 + github.com/aws/aws-sdk-go-v2/config v1.32.16 + github.com/aws/aws-sdk-go-v2/credentials v1.19.15 github.com/aws/aws-sdk-go-v2/feature/dynamodb/attributevalue v1.20.34 github.com/aws/aws-sdk-go-v2/service/dynamodb v1.56.1 - github.com/aws/aws-sdk-go-v2/service/s3 v1.96.4 + github.com/aws/aws-sdk-go-v2/service/s3 v1.99.1 github.com/google/uuid v1.6.0 + github.com/hashicorp/golang-lru/arc/v2 v2.0.7 github.com/ipfs/go-cid v0.6.0 github.com/ipfs/go-log/v2 v2.9.0 github.com/ipld/go-ipld-prime v0.21.1-0.20240917223228-6148356a4c2e @@ -25,66 +26,81 @@ require ( github.com/storacha/go-libstoracha v0.7.5 github.com/storacha/go-ucanto v0.8.2 github.com/storacha/guppy v0.7.0 + github.com/storacha/indexing-service v1.12.2 github.com/stretchr/testify v1.11.1 github.com/testcontainers/testcontainers-go v0.42.0 
github.com/testcontainers/testcontainers-go/modules/dynamodb v0.41.0 github.com/testcontainers/testcontainers-go/modules/minio v0.40.0 github.com/testcontainers/testcontainers-go/modules/postgres v0.42.0 + github.com/versity/versitygw v1.4.1 go.uber.org/fx v1.24.0 go.uber.org/zap v1.27.0 - go.uber.org/zap/exp v0.3.0 - modernc.org/sqlite v1.46.1 ) require ( + github.com/Azure/go-ntlmssp v0.1.0 // indirect + github.com/DataDog/datadog-go/v5 v5.8.3 // indirect + github.com/andybalholm/brotli v1.2.1 // indirect + github.com/aws/aws-sdk-go-v2/feature/s3/transfermanager v0.1.17 // indirect github.com/cenkalti/backoff/v5 v5.0.3 // indirect - github.com/dustin/go-humanize v1.0.1 // indirect + github.com/go-asn1-ber/asn1-ber v1.5.8-0.20250403174932-29230038a667 // indirect + github.com/go-ldap/ldap/v3 v3.4.13 // indirect + github.com/gofiber/fiber/v2 v2.52.12 // indirect + github.com/hashicorp/go-cleanhttp v0.5.2 // indirect + github.com/hashicorp/go-retryablehttp v0.7.8 // indirect + github.com/hashicorp/go-rootcerts v1.0.2 // indirect + github.com/hashicorp/go-secure-stdlib/strutil v0.1.2 // indirect + github.com/hashicorp/golang-lru/v2 v2.0.7 // indirect + github.com/hashicorp/vault-client-go v0.4.3 // indirect github.com/ipfs/boxo v0.30.0 // indirect github.com/jackc/pgpassfile v1.0.0 // indirect github.com/jackc/pgservicefile v0.0.0-20240606120523-5a60cdf6a761 // indirect github.com/jackc/puddle/v2 v2.2.2 // indirect github.com/mfridman/interpolate v0.0.2 // indirect + github.com/mitchellh/go-homedir v1.1.0 // indirect github.com/moby/moby/api v1.54.1 // indirect github.com/moby/moby/client v0.4.0 // indirect github.com/moby/sys/atomicwriter v0.1.0 // indirect - github.com/ncruces/go-strftime v1.0.0 // indirect - github.com/remyoudompheng/bigfft v0.0.0-20230129092748-24d4a6f8daec // indirect + github.com/nats-io/nats.go v1.51.0 // indirect + github.com/nats-io/nkeys v0.4.15 // indirect + github.com/nats-io/nuid v1.0.1 // indirect + github.com/pierrec/lz4/v4 v4.1.26 // 
indirect + github.com/rabbitmq/amqp091-go v1.10.0 // indirect + github.com/ryanuber/go-glob v1.0.0 // indirect + github.com/segmentio/kafka-go v0.4.50 // indirect github.com/sethvargo/go-retry v0.3.0 // indirect - github.com/storacha/indexing-service v1.12.2 // indirect + github.com/smira/go-statsd v1.3.4 // indirect github.com/stretchr/objx v0.5.3 // indirect + github.com/valyala/fasthttp v1.70.0 // indirect go.opentelemetry.io/otel/exporters/otlp/otlptrace/otlptracehttp v1.43.0 // indirect go.opentelemetry.io/otel/sdk v1.43.0 // indirect go.opentelemetry.io/otel/sdk/metric v1.43.0 // indirect golang.org/x/sync v0.20.0 // indirect - modernc.org/libc v1.68.0 // indirect - modernc.org/mathutil v1.7.1 // indirect - modernc.org/memory v1.11.0 // indirect ) require ( dario.cat/mergo v1.0.2 // indirect github.com/Azure/go-ansiterm v0.0.0-20250102033503-faa5f7b0171c // indirect github.com/Microsoft/go-winio v0.6.2 // indirect - github.com/aws/aws-sdk-go-v2/aws/protocol/eventstream v1.7.6 // indirect - github.com/aws/aws-sdk-go-v2/feature/ec2/imds v1.18.19 // indirect - github.com/aws/aws-sdk-go-v2/internal/configsources v1.4.19 // indirect - github.com/aws/aws-sdk-go-v2/internal/endpoints/v2 v2.7.19 // indirect - github.com/aws/aws-sdk-go-v2/internal/ini v1.8.5 // indirect - github.com/aws/aws-sdk-go-v2/internal/v4a v1.4.20 // indirect + github.com/aws/aws-sdk-go-v2/aws/protocol/eventstream v1.7.9 // indirect + github.com/aws/aws-sdk-go-v2/feature/ec2/imds v1.18.22 // indirect + github.com/aws/aws-sdk-go-v2/internal/configsources v1.4.22 // indirect + github.com/aws/aws-sdk-go-v2/internal/endpoints/v2 v2.7.22 // indirect + github.com/aws/aws-sdk-go-v2/internal/v4a v1.4.23 // indirect github.com/aws/aws-sdk-go-v2/service/dynamodbstreams v1.32.12 // indirect - github.com/aws/aws-sdk-go-v2/service/internal/accept-encoding v1.13.6 // indirect - github.com/aws/aws-sdk-go-v2/service/internal/checksum v1.9.11 // indirect + 
github.com/aws/aws-sdk-go-v2/service/internal/accept-encoding v1.13.8 // indirect + github.com/aws/aws-sdk-go-v2/service/internal/checksum v1.9.14 // indirect github.com/aws/aws-sdk-go-v2/service/internal/endpoint-discovery v1.11.19 // indirect - github.com/aws/aws-sdk-go-v2/service/internal/presigned-url v1.13.19 // indirect - github.com/aws/aws-sdk-go-v2/service/internal/s3shared v1.19.19 // indirect - github.com/aws/aws-sdk-go-v2/service/signin v1.0.7 // indirect - github.com/aws/aws-sdk-go-v2/service/sso v1.30.12 // indirect - github.com/aws/aws-sdk-go-v2/service/ssooidc v1.35.16 // indirect - github.com/aws/aws-sdk-go-v2/service/sts v1.41.8 // indirect - github.com/aws/smithy-go v1.24.2 // indirect + github.com/aws/aws-sdk-go-v2/service/internal/presigned-url v1.13.22 // indirect + github.com/aws/aws-sdk-go-v2/service/internal/s3shared v1.19.22 // indirect + github.com/aws/aws-sdk-go-v2/service/signin v1.0.10 // indirect + github.com/aws/aws-sdk-go-v2/service/sso v1.30.16 // indirect + github.com/aws/aws-sdk-go-v2/service/ssooidc v1.35.20 // indirect + github.com/aws/aws-sdk-go-v2/service/sts v1.42.0 // indirect + github.com/aws/smithy-go v1.25.0 // indirect github.com/cenkalti/backoff/v4 v4.3.0 // indirect github.com/cespare/xxhash/v2 v2.3.0 // indirect - github.com/clipperhouse/uax29/v2 v2.6.0 // indirect + github.com/clipperhouse/uax29/v2 v2.7.0 // indirect github.com/containerd/errdefs v1.0.0 // indirect github.com/containerd/errdefs/pkg v0.3.0 // indirect github.com/containerd/log v0.1.0 // indirect @@ -129,15 +145,15 @@ require ( github.com/ipld/go-codec-dagpb v1.7.0 // indirect github.com/ipni/go-libipni v0.6.18 // indirect github.com/klauspost/compress v1.18.5 // indirect - github.com/klauspost/cpuid/v2 v2.2.10 // indirect + github.com/klauspost/cpuid/v2 v2.3.0 // indirect github.com/labstack/gommon v0.4.2 // indirect github.com/libp2p/go-buffer-pool v0.1.0 // indirect github.com/libp2p/go-libp2p v0.41.1 // indirect github.com/lufia/plan9stats 
v0.0.0-20240513124658-fba389f38bae // indirect github.com/magiconair/properties v1.8.10 // indirect github.com/mattn/go-colorable v0.1.14 // indirect - github.com/mattn/go-isatty v0.0.20 // indirect - github.com/mattn/go-runewidth v0.0.19 // indirect + github.com/mattn/go-isatty v0.0.21 // indirect + github.com/mattn/go-runewidth v0.0.23 // indirect github.com/minio/sha256-simd v1.0.1 // indirect github.com/moby/docker-image-spec v1.3.1 // indirect github.com/moby/go-archive v0.2.0 // indirect @@ -187,12 +203,12 @@ require ( go.uber.org/dig v1.19.0 // indirect go.uber.org/multierr v1.11.0 // indirect go.yaml.in/yaml/v3 v3.0.4 // indirect - golang.org/x/crypto v0.49.0 // indirect + golang.org/x/crypto v0.50.0 // indirect golang.org/x/exp v0.0.0-20260312153236-7ab1446f8b90 // indirect - golang.org/x/net v0.52.0 // indirect - golang.org/x/sys v0.42.0 // indirect - golang.org/x/text v0.35.0 // indirect - golang.org/x/time v0.14.0 // indirect + golang.org/x/net v0.53.0 // indirect + golang.org/x/sys v0.43.0 // indirect + golang.org/x/text v0.36.0 // indirect + golang.org/x/time v0.15.0 // indirect golang.org/x/xerrors v0.0.0-20240903120638-7835f813f4da google.golang.org/genproto/googleapis/api v0.0.0-20260401024825-9d38bb4040a9 // indirect google.golang.org/genproto/googleapis/rpc v0.0.0-20260401024825-9d38bb4040a9 // indirect diff --git a/go.sum b/go.sum index 8d1c36f..ace4b18 100644 --- a/go.sum +++ b/go.sum @@ -43,62 +43,71 @@ github.com/AdaLogics/go-fuzz-headers v0.0.0-20240806141605-e8a1dd7889d6 h1:He8af github.com/AdaLogics/go-fuzz-headers v0.0.0-20240806141605-e8a1dd7889d6/go.mod h1:8o94RPi1/7XTJvwPpRSzSUedZrtlirdB3r9Z20bi2f8= github.com/Azure/go-ansiterm v0.0.0-20250102033503-faa5f7b0171c h1:udKWzYgxTojEKWjV8V+WSxDXJ4NFATAsZjh8iIbsQIg= github.com/Azure/go-ansiterm v0.0.0-20250102033503-faa5f7b0171c/go.mod h1:xomTg63KZ2rFqZQzSB4Vz2SUXa1BpHTVz9L5PTmPC4E= +github.com/Azure/go-ntlmssp v0.1.0 h1:DjFo6YtWzNqNvQdrwEyr/e4nhU3vRiwenz5QX7sFz+A= 
+github.com/Azure/go-ntlmssp v0.1.0/go.mod h1:NYqdhxd/8aAct/s4qSYZEerdPuH1liG2/X9DiVTbhpk= github.com/BurntSushi/toml v0.3.1/go.mod h1:xHWCNGjB5oqiDr8zfno3MHue2Ht5sIBksp03qcyfWMU= github.com/BurntSushi/xgb v0.0.0-20160522181843-27f122750802/go.mod h1:IVnqGOEym/WlBOVXweHU+Q+/VP0lqqI8lqeDx9IjBqo= +github.com/DataDog/datadog-go/v5 v5.8.3 h1:s58CUJ9s8lezjhTNJO/SxkPBv2qZjS3ktpRSqGF5n0s= +github.com/DataDog/datadog-go/v5 v5.8.3/go.mod h1:K9kcYBlxkcPP8tvvjZZKs/m1edNAUFzBbdpTUKfCsuw= +github.com/Microsoft/go-winio v0.5.0/go.mod h1:JPGBdM1cNvN/6ISo+n8V5iA4v8pBzdOpzfwIujj1a84= github.com/Microsoft/go-winio v0.6.2 h1:F2VQgta7ecxGYO8k3ZZz3RS8fVIXVxONVUPlNERoyfY= github.com/Microsoft/go-winio v0.6.2/go.mod h1:yd8OoFMLzJbo9gZq8j5qaps8bJ9aShtEA8Ipt1oGCvU= +github.com/alexbrainman/sspi v0.0.0-20250919150558-7d374ff0d59e h1:4dAU9FXIyQktpoUAgOJK3OTFc/xug0PCXYCqU0FgDKI= +github.com/alexbrainman/sspi v0.0.0-20250919150558-7d374ff0d59e/go.mod h1:cEWa1LVoE5KvSD9ONXsZrj0z6KqySlCCNKHlLzbqAt4= +github.com/andybalholm/brotli v1.2.1 h1:R+f5xP285VArJDRgowrfb9DqL18yVK0gKAW/F+eTWro= +github.com/andybalholm/brotli v1.2.1/go.mod h1:rzTDkvFWvIrjDXZHkuS16NPggd91W3kUSvPlQ1pLaKY= github.com/antihax/optional v1.0.0/go.mod h1:uupD/76wgC+ih3iEmQUL+0Ugr19nfwCT1kdvxnR2qWY= github.com/armon/circbuf v0.0.0-20150827004946-bbbad097214e/go.mod h1:3U/XgcO3hCbHZ8TKRvWD2dDTCfh9M9ya+I9JpbB7O8o= github.com/armon/go-metrics v0.0.0-20180917152333-f0300d1749da/go.mod h1:Q73ZrmVTwzkszR9V5SSuryQ31EELlFMUz1kKyl939pY= github.com/armon/go-radix v0.0.0-20180808171621-7fddfc383310/go.mod h1:ufUuZ+zHj4x4TnLV4JWEpy2hxWSpsRywHrMgIH9cCH8= github.com/asaskevich/EventBus v0.0.0-20200907212545-49d423059eef h1:2JGTg6JapxP9/R33ZaagQtAM4EkkSYnIAlOG5EI8gkM= github.com/asaskevich/EventBus v0.0.0-20200907212545-49d423059eef/go.mod h1:JS7hed4L1fj0hXcyEejnW57/7LCetXggd+vwrRnYeII= -github.com/aws/aws-sdk-go-v2 v1.41.3 h1:4kQ/fa22KjDt13QCy1+bYADvdgcxpfH18f0zP542kZA= -github.com/aws/aws-sdk-go-v2 v1.41.3/go.mod 
h1:mwsPRE8ceUUpiTgF7QmQIJ7lgsKUPQOUl3o72QBrE1o= -github.com/aws/aws-sdk-go-v2/aws/protocol/eventstream v1.7.6 h1:N4lRUXZpZ1KVEUn6hxtco/1d2lgYhNn1fHkkl8WhlyQ= -github.com/aws/aws-sdk-go-v2/aws/protocol/eventstream v1.7.6/go.mod h1:lyw7GFp3qENLh7kwzf7iMzAxDn+NzjXEAGjKS2UOKqI= -github.com/aws/aws-sdk-go-v2/config v1.32.11 h1:ftxI5sgz8jZkckuUHXfC/wMUc8u3fG1vQS0plr2F2Zs= -github.com/aws/aws-sdk-go-v2/config v1.32.11/go.mod h1:twF11+6ps9aNRKEDimksp923o44w/Thk9+8YIlzWMmo= -github.com/aws/aws-sdk-go-v2/credentials v1.19.11 h1:NdV8cwCcAXrCWyxArt58BrvZJ9pZ9Fhf9w6Uh5W3Uyc= -github.com/aws/aws-sdk-go-v2/credentials v1.19.11/go.mod h1:30yY2zqkMPdrvxBqzI9xQCM+WrlrZKSOpSJEsylVU+8= +github.com/aws/aws-sdk-go-v2 v1.41.6 h1:1AX0AthnBQzMx1vbmir3Y4WsnJgiydmnJjiLu+LvXOg= +github.com/aws/aws-sdk-go-v2 v1.41.6/go.mod h1:dy0UzBIfwSeot4grGvY1AqFWN5zgziMmWGzysDnHFcQ= +github.com/aws/aws-sdk-go-v2/aws/protocol/eventstream v1.7.9 h1:adBsCIIpLbLmYnkQU+nAChU5yhVTvu5PerROm+/Kq2A= +github.com/aws/aws-sdk-go-v2/aws/protocol/eventstream v1.7.9/go.mod h1:uOYhgfgThm/ZyAuJGNQ5YgNyOlYfqnGpTHXvk3cpykg= +github.com/aws/aws-sdk-go-v2/config v1.32.16 h1:Q0iQ7quUgJP0F/SCRTieScnaMdXr9h/2+wze1u3cNeM= +github.com/aws/aws-sdk-go-v2/config v1.32.16/go.mod h1:duCCnJEFqpt2RC6no1iK6q+8HpwOAkiUua0pY507dQc= +github.com/aws/aws-sdk-go-v2/credentials v1.19.15 h1:fyvgWTszojq8hEnMi8PPBTvZdTtEVmAVyo+NFLHBhH4= +github.com/aws/aws-sdk-go-v2/credentials v1.19.15/go.mod h1:gJiYyMOjNg8OEdRWOf3CrFQxM2a98qmrtjx1zuiQfB8= github.com/aws/aws-sdk-go-v2/feature/dynamodb/attributevalue v1.20.34 h1:gBoK/UF+CltS2dkNgpUwEROtNBtAsVCfWqIi+0qRDVA= github.com/aws/aws-sdk-go-v2/feature/dynamodb/attributevalue v1.20.34/go.mod h1:B4x2ogC2wSey/swvEainiBzLXiY89+xJaa85vcJFvD8= -github.com/aws/aws-sdk-go-v2/feature/ec2/imds v1.18.19 h1:INUvJxmhdEbVulJYHI061k4TVuS3jzzthNvjqvVvTKM= -github.com/aws/aws-sdk-go-v2/feature/ec2/imds v1.18.19/go.mod h1:FpZN2QISLdEBWkayloda+sZjVJL+e9Gl0k1SyTgcswU= -github.com/aws/aws-sdk-go-v2/internal/configsources v1.4.19 
h1:/sECfyq2JTifMI2JPyZ4bdRN77zJmr6SrS1eL3augIA= -github.com/aws/aws-sdk-go-v2/internal/configsources v1.4.19/go.mod h1:dMf8A5oAqr9/oxOfLkC/c2LU/uMcALP0Rgn2BD5LWn0= -github.com/aws/aws-sdk-go-v2/internal/endpoints/v2 v2.7.19 h1:AWeJMk33GTBf6J20XJe6qZoRSJo0WfUhsMdUKhoODXE= -github.com/aws/aws-sdk-go-v2/internal/endpoints/v2 v2.7.19/go.mod h1:+GWrYoaAsV7/4pNHpwh1kiNLXkKaSoppxQq9lbH8Ejw= -github.com/aws/aws-sdk-go-v2/internal/ini v1.8.5 h1:clHU5fm//kWS1C2HgtgWxfQbFbx4b6rx+5jzhgX9HrI= -github.com/aws/aws-sdk-go-v2/internal/ini v1.8.5/go.mod h1:O3h0IK87yXci+kg6flUKzJnWeziQUKciKrLjcatSNcY= -github.com/aws/aws-sdk-go-v2/internal/v4a v1.4.20 h1:qi3e/dmpdONhj1RyIZdi6DKKpDXS5Lb8ftr3p7cyHJc= -github.com/aws/aws-sdk-go-v2/internal/v4a v1.4.20/go.mod h1:V1K+TeJVD5JOk3D9e5tsX2KUdL7BlB+FV6cBhdobN8c= +github.com/aws/aws-sdk-go-v2/feature/ec2/imds v1.18.22 h1:IOGsJ1xVWhsi+ZO7/NW8OuZZBtMJLZbk4P5HDjJO0jQ= +github.com/aws/aws-sdk-go-v2/feature/ec2/imds v1.18.22/go.mod h1:b+hYdbU+jGKfXE8kKM6g1+h+L/Go3vMvzlxBsiuGsxg= +github.com/aws/aws-sdk-go-v2/feature/s3/transfermanager v0.1.17 h1:95y7/EqethAhFwMKJ9cDutzBhsS1h8uBwkJ5rp8pNTU= +github.com/aws/aws-sdk-go-v2/feature/s3/transfermanager v0.1.17/go.mod h1:77baheqr62SkTw77HWH8qpdWTd2gXKN0xg0qLvDSkpk= +github.com/aws/aws-sdk-go-v2/internal/configsources v1.4.22 h1:GmLa5Kw1ESqtFpXsx5MmC84QWa/ZrLZvlJGa2y+4kcQ= +github.com/aws/aws-sdk-go-v2/internal/configsources v1.4.22/go.mod h1:6sW9iWm9DK9YRpRGga/qzrzNLgKpT2cIxb7Vo2eNOp0= +github.com/aws/aws-sdk-go-v2/internal/endpoints/v2 v2.7.22 h1:dY4kWZiSaXIzxnKlj17nHnBcXXBfac6UlsAx2qL6XrU= +github.com/aws/aws-sdk-go-v2/internal/endpoints/v2 v2.7.22/go.mod h1:KIpEUx0JuRZLO7U6cbV204cWAEco2iC3l061IxlwLtI= +github.com/aws/aws-sdk-go-v2/internal/v4a v1.4.23 h1:FPXsW9+gMuIeKmz7j6ENWcWtBGTe1kH8r9thNt5Uxx4= +github.com/aws/aws-sdk-go-v2/internal/v4a v1.4.23/go.mod h1:7J8iGMdRKk6lw2C+cMIphgAnT8uTwBwNOsGkyOCm80U= github.com/aws/aws-sdk-go-v2/service/dynamodb v1.56.1 h1:EkW4NqA2mwCkL7YCDYh6OpA/bCMhKYbZgpRHt2FD2Ow= 
github.com/aws/aws-sdk-go-v2/service/dynamodb v1.56.1/go.mod h1:OQp5333OH1IjmJmJpTU4IwoaOoCMnDrThg0zIx169rE= github.com/aws/aws-sdk-go-v2/service/dynamodbstreams v1.32.12 h1:EhZjf2GKn/V3yPfYmUGdYmrcbxaGu2LO0M6ZrOt/qu8= github.com/aws/aws-sdk-go-v2/service/dynamodbstreams v1.32.12/go.mod h1:KPi0H5VdX4011P0gF806TZt8EiP3FkeRkt6+lzMUvxQ= -github.com/aws/aws-sdk-go-v2/service/internal/accept-encoding v1.13.6 h1:XAq62tBTJP/85lFD5oqOOe7YYgWxY9LvWq8plyDvDVg= -github.com/aws/aws-sdk-go-v2/service/internal/accept-encoding v1.13.6/go.mod h1:x0nZssQ3qZSnIcePWLvcoFisRXJzcTVvYpAAdYX8+GI= -github.com/aws/aws-sdk-go-v2/service/internal/checksum v1.9.11 h1:BYf7XNsJMzl4mObARUBUib+j2tf0U//JAAtTnYqvqCw= -github.com/aws/aws-sdk-go-v2/service/internal/checksum v1.9.11/go.mod h1:aEUS4WrNk/+FxkBZZa7tVgp4pGH+kFGW40Y8rCPqt5g= +github.com/aws/aws-sdk-go-v2/service/internal/accept-encoding v1.13.8 h1:HtOTYcbVcGABLOVuPYaIihj6IlkqubBwFj10K5fxRek= +github.com/aws/aws-sdk-go-v2/service/internal/accept-encoding v1.13.8/go.mod h1:VsK9abqQeGlzPgUr+isNWzPlK2vKe9INMLWnY65f5Xs= +github.com/aws/aws-sdk-go-v2/service/internal/checksum v1.9.14 h1:xnvDEnw+pnj5mctWiYuFbigrEzSm35x7k4KS/ZkCANg= +github.com/aws/aws-sdk-go-v2/service/internal/checksum v1.9.14/go.mod h1:yS5rNogD8e0Wu9+l3MUwr6eENBzEeGejvINpN5PAYfY= github.com/aws/aws-sdk-go-v2/service/internal/endpoint-discovery v1.11.19 h1:jdCj9vbCXwzTcIJX+MVd2UdssFhRJFTrWlPZwZB8Hpk= github.com/aws/aws-sdk-go-v2/service/internal/endpoint-discovery v1.11.19/go.mod h1:Dgg2d5WGRr7YB8JJsELskBxLUhgwWppXPwlvmuQKhbc= -github.com/aws/aws-sdk-go-v2/service/internal/presigned-url v1.13.19 h1:X1Tow7suZk9UCJHE1Iw9GMZJJl0dAnKXXP1NaSDHwmw= -github.com/aws/aws-sdk-go-v2/service/internal/presigned-url v1.13.19/go.mod h1:/rARO8psX+4sfjUQXp5LLifjUt8DuATZ31WptNJTyQA= -github.com/aws/aws-sdk-go-v2/service/internal/s3shared v1.19.19 h1:JnQeStZvPHFHeyky/7LbMlyQjUa+jIBj36OlWm0pzIk= -github.com/aws/aws-sdk-go-v2/service/internal/s3shared v1.19.19/go.mod 
h1:HGyasyHvYdFQeJhvDHfH7HXkHh57htcJGKDZ+7z+I24= -github.com/aws/aws-sdk-go-v2/service/s3 v1.96.4 h1:4ExZyubQ6LQQVuF2Qp9OsfEvsTdAWh5Gfwf6PgIdLdk= -github.com/aws/aws-sdk-go-v2/service/s3 v1.96.4/go.mod h1:NF3JcMGOiARAss1ld3WGORCw71+4ExDD2cbbdKS5PpA= -github.com/aws/aws-sdk-go-v2/service/signin v1.0.7 h1:Y2cAXlClHsXkkOvWZFXATr34b0hxxloeQu/pAZz2row= -github.com/aws/aws-sdk-go-v2/service/signin v1.0.7/go.mod h1:idzZ7gmDeqeNrSPkdbtMp9qWMgcBwykA7P7Rzh5DXVU= -github.com/aws/aws-sdk-go-v2/service/sso v1.30.12 h1:iSsvB9EtQ09YrsmIc44Heqlx5ByGErqhPK1ZQLppias= -github.com/aws/aws-sdk-go-v2/service/sso v1.30.12/go.mod h1:fEWYKTRGoZNl8tZ77i61/ccwOMJdGxwOhWCkp6TXAr0= -github.com/aws/aws-sdk-go-v2/service/ssooidc v1.35.16 h1:EnUdUqRP1CNzt2DkV67tJx6XDN4xlfBFm+bzeNOQVb0= -github.com/aws/aws-sdk-go-v2/service/ssooidc v1.35.16/go.mod h1:Jic/xv0Rq/pFNCh3WwpH4BEqdbSAl+IyHro8LbibHD8= -github.com/aws/aws-sdk-go-v2/service/sts v1.41.8 h1:XQTQTF75vnug2TXS8m7CVJfC2nniYPZnO1D4Np761Oo= -github.com/aws/aws-sdk-go-v2/service/sts v1.41.8/go.mod h1:Xgx+PR1NUOjNmQY+tRMnouRp83JRM8pRMw/vCaVhPkI= -github.com/aws/smithy-go v1.24.2 h1:FzA3bu/nt/vDvmnkg+R8Xl46gmzEDam6mZ1hzmwXFng= -github.com/aws/smithy-go v1.24.2/go.mod h1:YE2RhdIuDbA5E5bTdciG9KrW3+TiEONeUWCqxX9i1Fc= +github.com/aws/aws-sdk-go-v2/service/internal/presigned-url v1.13.22 h1:PUmZeJU6Y1Lbvt9WFuJ0ugUK2xn6hIWUBBbKuOWF30s= +github.com/aws/aws-sdk-go-v2/service/internal/presigned-url v1.13.22/go.mod h1:nO6egFBoAaoXze24a2C0NjQCvdpk8OueRoYimvEB9jo= +github.com/aws/aws-sdk-go-v2/service/internal/s3shared v1.19.22 h1:SE+aQ4DEqG53RRCAIHlCf//B2ycxGH7jFkpnAh/kKPM= +github.com/aws/aws-sdk-go-v2/service/internal/s3shared v1.19.22/go.mod h1:ES3ynECd7fYeJIL6+oax+uIEljmfps0S70BaQzbMd/o= +github.com/aws/aws-sdk-go-v2/service/s3 v1.99.1 h1:kU/eBN5+MWNo/LcbNa4hWDdN76hdcd7hocU5kvu7IsU= +github.com/aws/aws-sdk-go-v2/service/s3 v1.99.1/go.mod h1:Fw9aqhJicIVee1VytBBjH+l+5ov6/PhbtIK/u3rt/ls= +github.com/aws/aws-sdk-go-v2/service/signin v1.0.10 
h1:a1Fq/KXn75wSzoJaPQTgZO0wHGqE9mjFnylnqEPTchA= +github.com/aws/aws-sdk-go-v2/service/signin v1.0.10/go.mod h1:p6+MXNxW7IA6dMgHfTAzljuwSKD0NCm/4lbS4t6+7vI= +github.com/aws/aws-sdk-go-v2/service/sso v1.30.16 h1:x6bKbmDhsgSZwv6q19wY/u3rLk/3FGjJWyqKcIRufpE= +github.com/aws/aws-sdk-go-v2/service/sso v1.30.16/go.mod h1:CudnEVKRtLn0+3uMV0yEXZ+YZOKnAtUJ5DmDhilVnIw= +github.com/aws/aws-sdk-go-v2/service/ssooidc v1.35.20 h1:oK/njaL8GtyEihkWMD4k3VgHCT64RQKkZwh0DG5j8ak= +github.com/aws/aws-sdk-go-v2/service/ssooidc v1.35.20/go.mod h1:JHs8/y1f3zY7U5WcuzoJ/yAYGYtNIVPKLIbp61euvmg= +github.com/aws/aws-sdk-go-v2/service/sts v1.42.0 h1:ks8KBcZPh3PYISr5dAiXCM5/Thcuxk8l+PG4+A0exds= +github.com/aws/aws-sdk-go-v2/service/sts v1.42.0/go.mod h1:pFw33T0WLvXU3rw1WBkpMlkgIn54eCB5FYLhjDc9Foo= +github.com/aws/smithy-go v1.25.0 h1:Sz/XJ64rwuiKtB6j98nDIPyYrV1nVNJ4YU74gttcl5U= +github.com/aws/smithy-go v1.25.0/go.mod h1:YE2RhdIuDbA5E5bTdciG9KrW3+TiEONeUWCqxX9i1Fc= github.com/benbjohnson/clock v1.3.5 h1:VvXlSJBzZpA/zum6Sj74hxwYI2DIxRWuNIoXAzHZz5o= github.com/benbjohnson/clock v1.3.5/go.mod h1:J11/hYXuz8f4ySSvYwY0FKfm+ezbsZBKZxNJlLklBHA= github.com/beorn7/perks v1.0.1 h1:VlbKKnNfV8bJzeqoa4cOKqO6bYr3WgKZxO8Z16+hsOM= @@ -116,8 +125,8 @@ github.com/chzyer/logex v1.1.10/go.mod h1:+Ywpsq7O8HXn0nuIou7OrIPyXbp3wmkHB+jjWR github.com/chzyer/readline v0.0.0-20180603132655-2972be24d48e/go.mod h1:nSuG5e5PlCu98SY8svDHJxuZscDgtXS6KTTbou5AhLI= github.com/chzyer/test v0.0.0-20180213035817-a1ea475d72b1/go.mod h1:Q3SI9o4m/ZMnBNeIyt5eFwwo7qiLfzFZmjNmxjkiQlU= github.com/client9/misspell v0.3.4/go.mod h1:qj6jICC3Q7zFZvVWo7KLAzC3yx5G7kyvSDkc90ppPyw= -github.com/clipperhouse/uax29/v2 v2.6.0 h1:z0cDbUV+aPASdFb2/ndFnS9ts/WNXgTNNGFoKXuhpos= -github.com/clipperhouse/uax29/v2 v2.6.0/go.mod h1:Wn1g7MK6OoeDT0vL+Q0SQLDz/KpfsVRgg6W7ihQeh4g= +github.com/clipperhouse/uax29/v2 v2.7.0 h1:+gs4oBZ2gPfVrKPthwbMzWZDaAFPGYK72F0NJv2v7Vk= +github.com/clipperhouse/uax29/v2 v2.7.0/go.mod h1:EFJ2TJMRUaplDxHKj1qAEhCtQPW2tJSwu5BF98AuoVM= 
github.com/cncf/udpa/go v0.0.0-20191209042840-269d4d468f6f/go.mod h1:M8M6+tZqaGXZJjfX53e64911xZQV5JYwmTeXPW+k8Sc= github.com/cncf/udpa/go v0.0.0-20200629203442-efcf912fb354/go.mod h1:WmhPx2Nbnhtbo57+VJT5O0JRkEi1Wbu0z5j0R8u5Hbk= github.com/cncf/udpa/go v0.0.0-20201120205902-5459f2c99403/go.mod h1:WmhPx2Nbnhtbo57+VJT5O0JRkEi1Wbu0z5j0R8u5Hbk= @@ -169,6 +178,8 @@ github.com/envoyproxy/go-control-plane v0.9.9-0.20201210154907-fd9021fe5dad/go.m github.com/envoyproxy/go-control-plane v0.9.9-0.20210217033140-668b12f5399d/go.mod h1:cXg6YxExXjJnVBQHBLXeUAgxn2UodCpnH306RInaBQk= github.com/envoyproxy/protoc-gen-validate v0.1.0/go.mod h1:iSmxcyjqTsJpI2R4NaDN7+kN2VEUnK/pcBlmesArF7c= github.com/fatih/color v1.7.0/go.mod h1:Zm6kSWBoL9eyXnKyktHP6abPY2pDugNf5KwzbycvMj4= +github.com/fatih/color v1.16.0 h1:zmkK9Ngbjj+K0yRhTVONQh1p/HknKYSlNT+vZCzyokM= +github.com/fatih/color v1.16.0/go.mod h1:fL2Sau1YI5c0pdGEVCbKQbLXB6edEj1ZgiY4NijnWvE= github.com/felixge/httpsnoop v1.0.4 h1:NFTV2Zj1bL4mc9sqWACXbQFVBBg2W3GPvqp8/ESS2Wg= github.com/felixge/httpsnoop v1.0.4/go.mod h1:m8KPJKqk1gH5J9DgRY2ASl2lWCfGKXixSwevea8zH2U= github.com/filecoin-project/go-clock v0.1.0 h1:SFbYIM75M8NnFm1yMHhN9Ahy3W5bEZV9gd6MPfXbKVU= @@ -189,9 +200,13 @@ github.com/fsnotify/fsnotify v1.4.9/go.mod h1:znqG4EE+3YCdAaPaxE2ZRY/06pZUdp0tY4 github.com/fsnotify/fsnotify v1.9.0 h1:2Ml+OJNzbYCTzsxtv8vKSFD9PbJjmhYF14k/jKC7S9k= github.com/fsnotify/fsnotify v1.9.0/go.mod h1:8jBTzvmWwFyi3Pb8djgCCO5IBqzKJ/Jwo8TRcHyHii0= github.com/ghodss/yaml v1.0.0/go.mod h1:4dBDuWmgqj2HViK6kFavaiC9ZROes6MMH2rRYeMEF04= +github.com/go-asn1-ber/asn1-ber v1.5.8-0.20250403174932-29230038a667 h1:BP4M0CvQ4S3TGls2FvczZtj5Re/2ZzkV9VwqPHH/3Bo= +github.com/go-asn1-ber/asn1-ber v1.5.8-0.20250403174932-29230038a667/go.mod h1:hEBeB/ic+5LoWskz+yKT7vGhhPYkProFKoKdwZRWMe0= github.com/go-gl/glfw v0.0.0-20190409004039-e6da0acd62b1/go.mod h1:vR7hzQXu2zJy9AVAgeJqvqgH9Q5CA+iKCZ2gyEVpxRU= github.com/go-gl/glfw/v3.3/glfw v0.0.0-20191125211704-12ad95a8df72/go.mod 
h1:tQ2UAYgL5IevRw8kRxooKSPJfGvJ9fJQFa0TUsXzTg8= github.com/go-gl/glfw/v3.3/glfw v0.0.0-20200222043503-6f7a984d4dc4/go.mod h1:tQ2UAYgL5IevRw8kRxooKSPJfGvJ9fJQFa0TUsXzTg8= +github.com/go-ldap/ldap/v3 v3.4.13 h1:+x1nG9h+MZN7h/lUi5Q3UZ0fJ1GyDQYbPvbuH38baDQ= +github.com/go-ldap/ldap/v3 v3.4.13/go.mod h1:LxsGZV6vbaK0sIvYfsv47rfh4ca0JXokCoKjZxsszv0= github.com/go-logr/logr v1.2.2/go.mod h1:jdQByPbusPIv2/zmleS9BjJVeZ6kBagPoEUsqbVz/1A= github.com/go-logr/logr v1.4.3 h1:CjnDlHq8ikf6E492q6eKboGOC0T8CDaOvkHCIg8idEI= github.com/go-logr/logr v1.4.3/go.mod h1:9T104GzyrTigFIr8wt5mBrctHMim0Nb2HLGrmQ40KvY= @@ -208,6 +223,8 @@ github.com/go-yaml/yaml v2.1.0+incompatible/go.mod h1:w2MrLa16VYP0jy6N7M5kHaCkaL github.com/gobwas/glob v0.2.3 h1:A4xDbljILXROh+kObIiy5kIaPYD8e96x1tgBhUI5J+Y= github.com/gobwas/glob v0.2.3/go.mod h1:d3Ez4x06l9bZtSvzIay5+Yzi0fmZzPgnTbPcKjJAkT8= github.com/godbus/dbus/v5 v5.0.4/go.mod h1:xhWf0FNVPg57R7Z0UbKHbJfkEywrmjJnf7w5xrFpKfA= +github.com/gofiber/fiber/v2 v2.52.12 h1:0LdToKclcPOj8PktUdIKo9BUohjjwfnQl42Dhw8/WUw= +github.com/gofiber/fiber/v2 v2.52.12/go.mod h1:YEcBbO/FB+5M1IZNBP9FO3J9281zgPAreiI1oqg8nDw= github.com/gogo/protobuf v1.3.2 h1:Ov1cvc58UF3b5XjBnZv7+opcTcQFZebYjWzi34vdm4Q= github.com/gogo/protobuf v1.3.2/go.mod h1:P1XiOD3dCwIKUDQYPy72D8LYyHL2YPYrpS2s69NZV8Q= github.com/golang/glog v0.0.0-20160126235308-23def4e6c14b/go.mod h1:SBH7ygxi8pfUlaOkMMuAQtPIUF8ecWP5IEl/CR7VP2Q= @@ -222,6 +239,7 @@ github.com/golang/mock v1.4.1/go.mod h1:UOMv5ysSaYNkG+OFQykRIcU/QvvxJf3p21QfJ2Bt github.com/golang/mock v1.4.3/go.mod h1:UOMv5ysSaYNkG+OFQykRIcU/QvvxJf3p21QfJ2Bt3cw= github.com/golang/mock v1.4.4/go.mod h1:l3mdAwkq5BuhzHwde/uurv3sEJeZMXNpwsxVWU71h+4= github.com/golang/mock v1.5.0/go.mod h1:CWnOUgYIOo4TcNZ0wHX3YZCqsaM1I1Jvs6v3mP3KVu8= +github.com/golang/mock v1.6.0/go.mod h1:p6yTPP+5HYm5mzsMV8JkE6ZKdX+/wYM6Hr+LicevLPs= github.com/golang/protobuf v1.2.0/go.mod h1:6lQm79b+lXiMfvg/cZm0SGofjICqVBUtrP5yJMmIC1U= github.com/golang/protobuf v1.3.1/go.mod 
h1:6lQm79b+lXiMfvg/cZm0SGofjICqVBUtrP5yJMmIC1U= github.com/golang/protobuf v1.3.2/go.mod h1:6lQm79b+lXiMfvg/cZm0SGofjICqVBUtrP5yJMmIC1U= @@ -292,19 +310,33 @@ github.com/hashicorp/consul/api v1.1.0/go.mod h1:VmuI/Lkw1nC05EYQWNKwWGbkg+FbDBt github.com/hashicorp/consul/sdk v0.1.1/go.mod h1:VKf9jXwCTEY1QZP2MOLRhb5i/I/ssyNV1vwHyQBF0x8= github.com/hashicorp/errwrap v1.0.0/go.mod h1:YH+1FKiLXxHSkmPseP+kNlulaMuP3n2brvKWEqk/Jc4= github.com/hashicorp/go-cleanhttp v0.5.1/go.mod h1:JpRdi6/HCYpAwUzNwuwqhbovhLtngrth3wmdIIUrZ80= +github.com/hashicorp/go-cleanhttp v0.5.2 h1:035FKYIWjmULyFRBKPs8TBQoi0x6d9G4xc9neXJWAZQ= +github.com/hashicorp/go-cleanhttp v0.5.2/go.mod h1:kO/YDlP8L1346E6Sodw+PrpBSV4/SoxCXGY6BqNFT48= +github.com/hashicorp/go-hclog v1.6.3 h1:Qr2kF+eVWjTiYmU7Y31tYlP1h0q/X3Nl3tPGdaB11/k= +github.com/hashicorp/go-hclog v1.6.3/go.mod h1:W4Qnvbt70Wk/zYJryRzDRU/4r0kIg0PVHBcfoyhpF5M= github.com/hashicorp/go-immutable-radix v1.0.0/go.mod h1:0y9vanUI8NX6FsYoO3zeMjhV/C5i9g4Q3DwcSNZ4P60= github.com/hashicorp/go-msgpack v0.5.3/go.mod h1:ahLV/dePpqEmjfWmKiqvPkv/twdG7iPBM1vqhUKIvfM= github.com/hashicorp/go-multierror v1.0.0/go.mod h1:dHtQlpGsu+cZNNAkkCN/P3hoUDHhCYQXV3UM06sGGrk= +github.com/hashicorp/go-retryablehttp v0.7.8 h1:ylXZWnqa7Lhqpk0L1P1LzDtGcCR0rPVUrx/c8Unxc48= +github.com/hashicorp/go-retryablehttp v0.7.8/go.mod h1:rjiScheydd+CxvumBsIrFKlx3iS0jrZ7LvzFGFmuKbw= github.com/hashicorp/go-rootcerts v1.0.0/go.mod h1:K6zTfqpRlCUIjkwsN4Z+hiSfzSTQa6eBIzfwKfwNnHU= +github.com/hashicorp/go-rootcerts v1.0.2 h1:jzhAVGtqPKbwpyCPELlgNWhE1znq+qwJtW5Oi2viEzc= +github.com/hashicorp/go-rootcerts v1.0.2/go.mod h1:pqUvnprVnM5bf7AOirdbb01K4ccR319Vf4pU3K5EGc8= +github.com/hashicorp/go-secure-stdlib/strutil v0.1.2 h1:kes8mmyCpxJsI7FTwtzRqEy9CdjCtrXrXGuOpxEA7Ts= +github.com/hashicorp/go-secure-stdlib/strutil v0.1.2/go.mod h1:Gou2R9+il93BqX25LAKCLuM+y9U2T4hlwvT1yprcna4= github.com/hashicorp/go-sockaddr v1.0.0/go.mod h1:7Xibr9yA9JjQq1JpNB2Vw7kxv8xerXegt+ozgdvDeDU= github.com/hashicorp/go-syslog 
v1.0.0/go.mod h1:qPfqrKkXGihmCqbJM2mZgkZGvKG1dFdvsLplgctolz4= github.com/hashicorp/go-uuid v1.0.0/go.mod h1:6SBZvOh/SIDV7/2o3Jml5SYk/TvGqwFJ/bN7x4byOro= github.com/hashicorp/go-uuid v1.0.1/go.mod h1:6SBZvOh/SIDV7/2o3Jml5SYk/TvGqwFJ/bN7x4byOro= +github.com/hashicorp/go-uuid v1.0.3 h1:2gKiV6YVmrJ1i2CKKa9obLvRieoRGviZFL26PcT/Co8= +github.com/hashicorp/go-uuid v1.0.3/go.mod h1:6SBZvOh/SIDV7/2o3Jml5SYk/TvGqwFJ/bN7x4byOro= github.com/hashicorp/go.net v0.0.1/go.mod h1:hjKkEWcCURg++eb33jQU7oqQcI9XDCnUzHA0oac0k90= github.com/hashicorp/golang-lru v0.5.0/go.mod h1:/m3WP610KZHVQ1SGc6re/UDhFvYD7pJ4Ao+sR/qLZy8= github.com/hashicorp/golang-lru v0.5.1/go.mod h1:/m3WP610KZHVQ1SGc6re/UDhFvYD7pJ4Ao+sR/qLZy8= github.com/hashicorp/golang-lru v1.0.2 h1:dV3g9Z/unq5DpblPpw+Oqcv4dU/1omnb4Ok8iPY6p1c= github.com/hashicorp/golang-lru v1.0.2/go.mod h1:iADmTwqILo4mZ8BN3D2Q6+9jd8WM5uGBxy+E8yxSoD4= +github.com/hashicorp/golang-lru/arc/v2 v2.0.7 h1:QxkVTxwColcduO+LP7eJO56r2hFiG8zEbfAAzRv52KQ= +github.com/hashicorp/golang-lru/arc/v2 v2.0.7/go.mod h1:Pe7gBlGdc8clY5LJ0LpJXMt5AmgmWNH1g+oFFVUHOEc= github.com/hashicorp/golang-lru/v2 v2.0.7 h1:a+bsQ5rvGLjzHuww6tVxozPZFVghXaHOwFs4luLUK2k= github.com/hashicorp/golang-lru/v2 v2.0.7/go.mod h1:QeFd9opnmA6QUJc5vARoKUSoFhyfM2/ZepoAG6RGpeM= github.com/hashicorp/hcl v1.0.0/go.mod h1:E5yfLk+7swimpb2L/Alb/PJmXilQ/rhwaUYs4T20WEQ= @@ -312,6 +344,8 @@ github.com/hashicorp/logutils v1.0.0/go.mod h1:QIAnNjmIWmVIIkWDTG1z5v++HQmx9WQRO github.com/hashicorp/mdns v1.0.0/go.mod h1:tL+uN++7HEJ6SQLQ2/p+z2pH24WQKWjBPkE0mNTz8vQ= github.com/hashicorp/memberlist v0.1.3/go.mod h1:ajVTdAv/9Im8oMAAj5G31PhhMCZJV2pPBoIllUwCN7I= github.com/hashicorp/serf v0.8.2/go.mod h1:6hOLApaqBFA1NXqRQAsxw9QxuDEvNxSQRwA/JwenrHc= +github.com/hashicorp/vault-client-go v0.4.3 h1:zG7STGVgn/VK6rnZc0k8PGbfv2x/sJExRKHSUg3ljWc= +github.com/hashicorp/vault-client-go v0.4.3/go.mod h1:4tDw7Uhq5XOxS1fO+oMtotHL7j4sB9cp0T7U6m4FzDY= github.com/huin/goupnp v1.3.0 h1:UvLUlWDNpoUdYzb2TCn+MuTWtcjXKSza2n6CBdQ0xXc= 
github.com/huin/goupnp v1.3.0/go.mod h1:gnGPsThkYa7bFi/KWmEysQRf48l2dvR5bxr2OFckNX8= github.com/ianlancetaylor/demangle v0.0.0-20181102032728-5e5cf60278f6/go.mod h1:aSSvb/t6k1mPoxDqO4vJh6VOCGPwU4O0C2/Eqndh1Sc= @@ -396,6 +430,18 @@ github.com/jbenet/go-temp-err-catcher v0.1.0 h1:zpb3ZH6wIE8Shj2sKS+khgRvf7T7RABo github.com/jbenet/go-temp-err-catcher v0.1.0/go.mod h1:0kJRvmDZXNMIiJirNPEYfhpPwbGVtZVWC34vc5WLsDk= github.com/jbenet/goprocess v0.1.4 h1:DRGOFReOMqqDNXwW70QkacFW0YN9QnwLV0Vqk+3oU0o= github.com/jbenet/goprocess v0.1.4/go.mod h1:5yspPrukOVuOLORacaBi858NqyClJPQxYZlqdZVfqY4= +github.com/jcmturner/aescts/v2 v2.0.0 h1:9YKLH6ey7H4eDBXW8khjYslgyqG2xZikXP0EQFKrle8= +github.com/jcmturner/aescts/v2 v2.0.0/go.mod h1:AiaICIRyfYg35RUkr8yESTqvSy7csK90qZ5xfvvsoNs= +github.com/jcmturner/dnsutils/v2 v2.0.0 h1:lltnkeZGL0wILNvrNiVCR6Ro5PGU/SeBvVO/8c/iPbo= +github.com/jcmturner/dnsutils/v2 v2.0.0/go.mod h1:b0TnjGOvI/n42bZa+hmXL+kFJZsFT7G4t3HTlQ184QM= +github.com/jcmturner/gofork v1.7.6 h1:QH0l3hzAU1tfT3rZCnW5zXl+orbkNMMRGJfdJjHVETg= +github.com/jcmturner/gofork v1.7.6/go.mod h1:1622LH6i/EZqLloHfE7IeZ0uEJwMSUyQ/nDd82IeqRo= +github.com/jcmturner/goidentity/v6 v6.0.1 h1:VKnZd2oEIMorCTsFBnJWbExfNN7yZr3EhJAxwOkZg6o= +github.com/jcmturner/goidentity/v6 v6.0.1/go.mod h1:X1YW3bgtvwAXju7V3LCIMpY0Gbxyjn/mY9zx4tFonSg= +github.com/jcmturner/gokrb5/v8 v8.4.4 h1:x1Sv4HaTpepFkXbt2IkL29DXRf8sOfZXo8eRKh687T8= +github.com/jcmturner/gokrb5/v8 v8.4.4/go.mod h1:1btQEpgT6k+unzCwX1KdWMEwPPkkgBtP+F6aCACiMrs= +github.com/jcmturner/rpc/v2 v2.0.3 h1:7FXXj8Ti1IaVFpSAziCZWNzbNuZmnvw/i6CqLNdWfZY= +github.com/jcmturner/rpc/v2 v2.0.3/go.mod h1:VUJYCIDm3PVOEHw8sgt091/20OJjskO/YJki3ELg/Hc= github.com/json-iterator/go v1.1.11/go.mod h1:KdQUCv79m/52Kvf8AW2vK1V8akMuk1QjK/uOdHXbAo4= github.com/json-iterator/go v1.1.12 h1:PV8peI4a0ysnczrg+LtxykD8LfKY9ML6u2jnxaEnrnM= github.com/json-iterator/go v1.1.12/go.mod h1:e30LSqwooZae/UwlEbR2852Gd8hjQvJoHmT4TnhNGBo= @@ -407,8 +453,8 @@ github.com/kisielk/errcheck v1.5.0/go.mod 
h1:pFxgyoBC7bSaBwPgfKdkLd5X25qrDl4LWUI github.com/kisielk/gotool v1.0.0/go.mod h1:XhKaO+MFFWcvkIS/tQcRk01m1F5IRFswLeQ+oQHNcck= github.com/klauspost/compress v1.18.5 h1:/h1gH5Ce+VWNLSWqPzOVn6XBO+vJbCNGvjoaGBFW2IE= github.com/klauspost/compress v1.18.5/go.mod h1:cwPg85FWrGar70rWktvGQj8/hthj3wpl0PGDogxkrSQ= -github.com/klauspost/cpuid/v2 v2.2.10 h1:tBs3QSyvjDyFTq3uoc/9xFpCuOsJQFNPiAhYdw2skhE= -github.com/klauspost/cpuid/v2 v2.2.10/go.mod h1:hqwkgyIinND0mEev00jJYCxPNVRVXFQeu1XKlok6oO0= +github.com/klauspost/cpuid/v2 v2.3.0 h1:S4CRMLnYUhGeDFDqkGriYKdfoFlDnMtqTiI/sFzhA9Y= +github.com/klauspost/cpuid/v2 v2.3.0/go.mod h1:hqwkgyIinND0mEev00jJYCxPNVRVXFQeu1XKlok6oO0= github.com/koron/go-ssdp v0.0.5 h1:E1iSMxIs4WqxTbIBLtmNBeOOC+1sCIXQeqTWVnpmwhk= github.com/koron/go-ssdp v0.0.5/go.mod h1:Qm59B7hpKpDqfyRNWRNr00jGwLdXjDyZh6y7rH6VS0w= github.com/kr/fs v0.1.0/go.mod h1:FFnZGqtBN9Gxj7eW1uZ42v5BccTP0vu6NEaFoC2HwRg= @@ -448,11 +494,11 @@ github.com/mattn/go-colorable v0.0.9/go.mod h1:9vuHe8Xs5qXnSaW/c/ABM9alt+Vo+STaO github.com/mattn/go-colorable v0.1.14 h1:9A9LHSqF/7dyVVX6g0U9cwm9pG3kP9gSzcuIPHPsaIE= github.com/mattn/go-colorable v0.1.14/go.mod h1:6LmQG8QLFO4G5z1gPvYEzlUgJ2wF+stgPZH1UqBm1s8= github.com/mattn/go-isatty v0.0.3/go.mod h1:M+lRXTBqGeGNdLjl/ufCoiOlB5xdOkqRJdNxMWT7Zi4= -github.com/mattn/go-isatty v0.0.20 h1:xfD0iDuEKnDkl03q4limB+vH+GxLEtL/jb4xVJSWWEY= -github.com/mattn/go-isatty v0.0.20/go.mod h1:W+V8PltTTMOvKvAeJH7IuucS94S2C6jfK/D7dTCTo3Y= +github.com/mattn/go-isatty v0.0.21 h1:xYae+lCNBP7QuW4PUnNG61ffM4hVIfm+zUzDuSzYLGs= +github.com/mattn/go-isatty v0.0.21/go.mod h1:ZXfXG4SQHsB/w3ZeOYbR0PrPwLy+n6xiMrJlRFqopa4= github.com/mattn/go-runewidth v0.0.9/go.mod h1:H031xJmbD/WCDINGzjvQ9THkh0rPKHF+m2gUSrubnMI= -github.com/mattn/go-runewidth v0.0.19 h1:v++JhqYnZuu5jSKrk9RbgF5v4CGUjqRfBm05byFGLdw= -github.com/mattn/go-runewidth v0.0.19/go.mod h1:XBkDxAl56ILZc9knddidhrOlY5R/pDhgLpndooCuJAs= +github.com/mattn/go-runewidth v0.0.23 h1:7ykA0T0jkPpzSvMS5i9uoNn2Xy3R383f9HDx3RybWcw= 
+github.com/mattn/go-runewidth v0.0.23/go.mod h1:XBkDxAl56ILZc9knddidhrOlY5R/pDhgLpndooCuJAs= github.com/mdelapenya/tlscert v0.2.0 h1:7H81W6Z/4weDvZBNOfQte5GpIMo0lGYEeWbkGp5LJHI= github.com/mdelapenya/tlscert v0.2.0/go.mod h1:O4njj3ELLnJjGdkN7M/vIVCpZ+Cf0L6muqOG4tLSl8o= github.com/mfridman/interpolate v0.0.2 h1:pnuTK7MQIxxFz1Gr+rjSIx9u7qVjf5VOoM/u6BbAxPY= @@ -466,6 +512,8 @@ github.com/minio/sha256-simd v1.0.1 h1:6kaan5IFmwTNynnKKpDHe6FWHohJOHhCPchzK49dz github.com/minio/sha256-simd v1.0.1/go.mod h1:Pz6AKMiUdngCLpeTL/RJY1M9rUuPMYujV5xJjtbRSN8= github.com/mitchellh/cli v1.0.0/go.mod h1:hNIlj7HEI86fIcpObd7a0FcrxTWetlwJDGcceTlRvqc= github.com/mitchellh/go-homedir v1.0.0/go.mod h1:SfyaCUpYCn1Vlf4IUYiD9fPX4A5wJrkLzIz1N1q0pr0= +github.com/mitchellh/go-homedir v1.1.0 h1:lukF9ziXFxDFPkA1vsr5zpc1XuPDn/wFntq5mG+4E0Y= +github.com/mitchellh/go-homedir v1.1.0/go.mod h1:SfyaCUpYCn1Vlf4IUYiD9fPX4A5wJrkLzIz1N1q0pr0= github.com/mitchellh/go-testing-interface v1.0.0/go.mod h1:kRemZodwjscx+RGhAo8eIhFbs2+BFgRtFPeD/KE+zxI= github.com/mitchellh/gox v0.4.0/go.mod h1:Sd9lOJ0+aimLBi73mGofS1ycjY8lL3uZM3JPS42BGNg= github.com/mitchellh/iochan v1.0.0/go.mod h1:JwYml1nuB7xOzsp52dPpHFffvOCDupsG0QubkSMEySY= @@ -523,10 +571,18 @@ github.com/multiformats/go-varint v0.1.0 h1:i2wqFp4sdl3IcIxfAonHQV9qU5OsZ4Ts9IOo github.com/multiformats/go-varint v0.1.0/go.mod h1:5KVAVXegtfmNQQm/lCY+ATvDzvJJhSkUlGQV9wgObdI= github.com/munnerz/goautoneg v0.0.0-20191010083416-a7dc8b61c822 h1:C3w9PqII01/Oq1c1nUAm88MOHcQC9l5mIlSMApZMrHA= github.com/munnerz/goautoneg v0.0.0-20191010083416-a7dc8b61c822/go.mod h1:+n7T8mK8HuQTcFwEeznm/DIxMOiR9yIdICNftLE1DvQ= +github.com/nats-io/nats.go v1.51.0 h1:ByW84XTz6W03GSSsygsZcA+xgKK8vPGaa/FCAAEHnAI= +github.com/nats-io/nats.go v1.51.0/go.mod h1:26HypzazeOkyO3/mqd1zZd53STJN0EjCYF9Uy2ZOBno= +github.com/nats-io/nkeys v0.4.15 h1:JACV5jRVO9V856KOapQ7x+EY8Jo3qw1vJt/9Jpwzkk4= +github.com/nats-io/nkeys v0.4.15/go.mod h1:CpMchTXC9fxA5zrMo4KpySxNjiDVvr8ANOSZdiNfUrs= +github.com/nats-io/nuid 
v1.0.1 h1:5iA8DT8V7q8WK2EScv2padNa/rTESc1KdnPw4TC2paw= +github.com/nats-io/nuid v1.0.1/go.mod h1:19wcPz3Ph3q0Jbyiqsd0kePYG7A95tJPxeL+1OSON2c= github.com/ncruces/go-strftime v1.0.0 h1:HMFp8mLCTPp341M/ZnA4qaf7ZlsbTc+miZjCLOFAw7w= github.com/ncruces/go-strftime v1.0.0/go.mod h1:Fwc5htZGVVkseilnfgOVb9mKy6w1naJmn9CehxcKcls= github.com/neelance/astrewrite v0.0.0-20160511093645-99348263ae86/go.mod h1:kHJEU3ofeGjhHklVoIGuVj85JJwZ6kWPaJwCIxgnFmo= github.com/neelance/sourcemap v0.0.0-20200213170602-2833bce08e4c/go.mod h1:Qr6/a/Q4r9LP1IltGz7tA7iOK1WonHEYhu1HRBA7ZiM= +github.com/oklog/ulid/v2 v2.1.1 h1:suPZ4ARWLOJLegGFiZZ1dFAkqzhMjL3J1TzI+5wHz8s= +github.com/oklog/ulid/v2 v2.1.1/go.mod h1:rcEKHmBBKfef9DhnvX7y1HZBYxjXb0cP5ExxNsTT1QQ= github.com/olekukonko/tablewriter v0.0.5 h1:P2Ga83D34wi1o9J6Wh1mRuqd4mF/x/lgBS7N7AbDhec= github.com/olekukonko/tablewriter v0.0.5/go.mod h1:hPp6KlRPjbx+hW8ykQs1w3UBbZlj6HuIJcUGPhkA7kY= github.com/onsi/ginkgo v1.16.5 h1:8xi0RTUf59SOSfEtZMvwTvXYMzG4gV23XVHOZiXNtnE= @@ -542,6 +598,8 @@ github.com/pascaldekloe/goe v0.0.0-20180627143212-57f6aae5913c/go.mod h1:lzWF7FI github.com/pelletier/go-toml v1.9.3/go.mod h1:u1nR/EPcESfeI/szUZKdtJ0xRNbUoANCkoOuaOx1Y+c= github.com/pelletier/go-toml/v2 v2.2.4 h1:mye9XuhQ6gvn5h28+VilKrrPoQVanw5PMw/TB0t5Ec4= github.com/pelletier/go-toml/v2 v2.2.4/go.mod h1:2gIqNv+qfxSVS7cM2xJQKtLSTLUE9V8t9Stt+h56mCY= +github.com/pierrec/lz4/v4 v4.1.26 h1:GrpZw1gZttORinvzBdXPUXATeqlJjqUG/D87TKMnhjY= +github.com/pierrec/lz4/v4 v4.1.26/go.mod h1:EoQMVJgeeEOMsCqCzqFm2O0cJvljX2nGZjcRIPL34O4= github.com/pion/datachannel v1.6.0 h1:XecBlj+cvsxhAMZWFfFcPyUaDZtd7IJvrXqlXD/53i0= github.com/pion/datachannel v1.6.0/go.mod h1:ur+wzYF8mWdC+Mkis5Thosk+u/VOL287apDNEbFpsIk= github.com/pion/dtls/v2 v2.2.12 h1:KP7H5/c1EiVAAKUmXyCzPiQe5+bCJrpOeKg/L05dunk= @@ -608,6 +666,8 @@ github.com/quic-go/quic-go v0.50.1 h1:unsgjFIUqW8a2oopkY7YNONpV1gYND6Nt9hnt1PN94 github.com/quic-go/quic-go v0.50.1/go.mod h1:Vim6OmUvlYdwBhXP9ZVrtGmCMWa3wEqhq3NgYrI8b4E= 
github.com/quic-go/webtransport-go v0.8.1-0.20241018022711-4ac2c9250e66 h1:4WFk6u3sOT6pLa1kQ50ZVdm8BQFgJNA117cepZxtLIg= github.com/quic-go/webtransport-go v0.8.1-0.20241018022711-4ac2c9250e66/go.mod h1:Vp72IJajgeOL6ddqrAhmp7IM9zbTcgkQxD/YdxrVwMw= +github.com/rabbitmq/amqp091-go v1.10.0 h1:STpn5XsHlHGcecLmMFCtg7mqq0RnD+zFr4uzukfVhBw= +github.com/rabbitmq/amqp091-go v1.10.0/go.mod h1:Hy4jKW5kQART1u+JkDTF9YYOQUHXqMuhrgxOEeS7G4o= github.com/remyoudompheng/bigfft v0.0.0-20230129092748-24d4a6f8daec h1:W09IVJc94icq4NjY3clb7Lk8O1qJ8BdBEF8z0ibU0rE= github.com/remyoudompheng/bigfft v0.0.0-20230129092748-24d4a6f8daec/go.mod h1:qqbHyh8v60DhA7CoWK5oRCqLrMHRGoxYCSS9EjAz6Eo= github.com/rogpeppe/fastuuid v1.2.0/go.mod h1:jVj6XXZzXRy/MSR5jhDC/2q6DgLz+nrA6LYCDYWNEvQ= @@ -619,9 +679,13 @@ github.com/rs/xid v1.5.0/go.mod h1:trrq9SKmegXys3aeAKXMUTdJsYXVwGY3RLcfgqegfbg= github.com/russross/blackfriday/v2 v2.0.1/go.mod h1:+Rmxgy9KzJVeS9/2gXHxylqXiyQDYRxCVz55jmeOWTM= github.com/russross/blackfriday/v2 v2.1.0/go.mod h1:+Rmxgy9KzJVeS9/2gXHxylqXiyQDYRxCVz55jmeOWTM= github.com/ryanuber/columnize v0.0.0-20160712163229-9b3edd62028f/go.mod h1:sm1tb6uqfes/u+d4ooFouqFdy9/2g9QGwK3SQygK0Ts= +github.com/ryanuber/go-glob v1.0.0 h1:iQh3xXAumdQ+4Ufa5b25cRpC5TYKlno6hsv6Cb3pkBk= +github.com/ryanuber/go-glob v1.0.0/go.mod h1:807d1WSdnB0XRJzKNil9Om6lcp/3a0v4qIHxIXzX/Yc= github.com/sagikazarmark/locafero v0.11.0 h1:1iurJgmM9G3PA/I+wWYIOw/5SyBtxapeHDcg+AAIFXc= github.com/sagikazarmark/locafero v0.11.0/go.mod h1:nVIGvgyzw595SUSUE6tvCp3YYTeHs15MvlmU87WwIik= github.com/sean-/seed v0.0.0-20170313163322-e2103e2c3529/go.mod h1:DxrIzT+xaE7yg65j358z/aeFdxmN0P9QXhEzd20vsDc= +github.com/segmentio/kafka-go v0.4.50 h1:mcyC3tT5WeyWzrFbd6O374t+hmcu1NKt2Pu1L3QaXmc= +github.com/segmentio/kafka-go v0.4.50/go.mod h1:Y1gn60kzLEEaW28YshXyk2+VCUKbJ3Qr6DrnT3i4+9E= github.com/sethvargo/go-retry v0.3.0 h1:EEt31A35QhrcRZtrYFDTBg91cqZVnFL2navjDrah2SE= github.com/sethvargo/go-retry v0.3.0/go.mod 
h1:mNX17F0C/HguQMyMyJxcnU471gOZGxCLyYaFyAZraas= github.com/shirou/gopsutil/v4 v4.26.3 h1:2ESdQt90yU3oXF/CdOlRCJxrP+Am1aBYubTMTfxJ1qc= @@ -630,6 +694,7 @@ github.com/shurcooL/go v0.0.0-20200502201357-93f07166e636/go.mod h1:TDJrrUr11Vxr github.com/shurcooL/httpfs v0.0.0-20190707220628-8d4bc4ba7749/go.mod h1:ZY1cvUeJuFPAdZ/B6v7RHavJWZn2YPVFQ1OSXhCGOkg= github.com/shurcooL/sanitized_anchor_name v1.0.0/go.mod h1:1NzhyTcUVG4SuEtjjoZeVRXNmyL/1OwPU0+IJeTBvfc= github.com/shurcooL/vfsgen v0.0.0-20200824052919-0d455de96546/go.mod h1:TrYk7fJVaAttu97ZZKrO9UbRa8izdowaMIZcxYMbVaw= +github.com/sirupsen/logrus v1.7.0/go.mod h1:yWOB1SBYBC5VeMP7gHvWumXLIWorT60ONWic61uBYv0= github.com/sirupsen/logrus v1.8.1/go.mod h1:yWOB1SBYBC5VeMP7gHvWumXLIWorT60ONWic61uBYv0= github.com/sirupsen/logrus v1.9.4 h1:TsZE7l11zFCLZnZ+teH4Umoq5BhEIfIzfRDZ1Uzql2w= github.com/sirupsen/logrus v1.9.4/go.mod h1:ftWc9WdOfJ0a92nsE2jF5u5ZwH8Bv2zdeOC42RjbV2g= @@ -639,6 +704,8 @@ github.com/smartystreets/assertions v0.0.0-20180927180507-b2de0cb4f26d/go.mod h1 github.com/smartystreets/goconvey v1.6.4/go.mod h1:syvi0/a8iFYH4r/RixwvyeAJjdLS9QV7WQ/tjFTllLA= github.com/smartystreets/goconvey v1.8.1 h1:qGjIddxOk4grTu9JPOU31tVfq3cNdBlNa5sSznIX1xY= github.com/smartystreets/goconvey v1.8.1/go.mod h1:+/u4qLyY6x1jReYOp7GOM2FSt8aP9CzCZL03bI28W60= +github.com/smira/go-statsd v1.3.4 h1:kBYWcLSGT+qC6JVbvfz48kX7mQys32fjDOPrfmsSx2c= +github.com/smira/go-statsd v1.3.4/go.mod h1:RjdsESPgDODtg1VpVVf9MJrEW2Hw0wtRNbmB1CAhu6A= github.com/sourcegraph/conc v0.3.1-0.20240121214520-5f936abd7ae8 h1:+jumHNA0Wrelhe64i8F6HNlS8pkoyMv5sreGx2Ry5Rw= github.com/sourcegraph/conc v0.3.1-0.20240121214520-5f936abd7ae8/go.mod h1:3n1Cwaq1E1/1lhQhtRK2ts/ZwZEhjcQeJQ1RuC6Q/8U= github.com/spaolacci/murmur3 v1.1.0 h1:7c1g84S4BPRrfL5Xrdp6fOJ206sU9y293DDHaoy0bLI= @@ -669,6 +736,8 @@ github.com/storacha/guppy v0.7.0/go.mod h1:n5aeC6UKuTRVhxE6V90jhsbv8XYzNdeTfA8Gq github.com/storacha/indexing-service v1.12.2 h1:DrcIzvM36Ux7i0UmGoSZiU8lR8WjVIqsTULSE1kA+7I= 
github.com/storacha/indexing-service v1.12.2/go.mod h1:Yk+uHoTA6qaTE13Ptq6FArsR9hESOetzej9194KwjhM= github.com/stretchr/objx v0.1.0/go.mod h1:HFkY916IF+rwdDfMAkV7OtwuqBVzrE8GR6GFx+wExME= +github.com/stretchr/objx v0.4.0/go.mod h1:YvHI0jy2hoMjB+UWwv71VJQ9isScKT/TqJzVSSt89Yw= +github.com/stretchr/objx v0.5.0/go.mod h1:Yh+to48EsGEfYuaHDzXPcE3xhTkx73EhmCGUpEOglKo= github.com/stretchr/objx v0.5.3 h1:jmXUvGomnU1o3W/V5h2VEradbpJDwGrzugQQvL0POH4= github.com/stretchr/objx v0.5.3/go.mod h1:rDQraq+vQZU7Fde9LOZLr8Tax6zZvy4kuNKF+QYS+U0= github.com/stretchr/testify v1.2.2/go.mod h1:a8OnRcib4nhh0OaRAV+Yts87kKdq0PP7pXfy6kDkUVs= @@ -677,6 +746,9 @@ github.com/stretchr/testify v1.4.0/go.mod h1:j7eGeouHqKxXV5pUuKE4zz7dFj8WfuZ+81P github.com/stretchr/testify v1.5.1/go.mod h1:5W2xD1RspED5o8YsWQXVCued0rvSQ+mT+I5cxcmMvtA= github.com/stretchr/testify v1.6.1/go.mod h1:6Fq8oRcR53rry900zMqJjRRixrwX3KX962/h/Wwjteg= github.com/stretchr/testify v1.7.0/go.mod h1:6Fq8oRcR53rry900zMqJjRRixrwX3KX962/h/Wwjteg= +github.com/stretchr/testify v1.7.1/go.mod h1:6Fq8oRcR53rry900zMqJjRRixrwX3KX962/h/Wwjteg= +github.com/stretchr/testify v1.8.0/go.mod h1:yNjHg4UonilssWZ8iaSj1OCr/vHnekPRkoO+kdMU+MU= +github.com/stretchr/testify v1.8.1/go.mod h1:w2LPCIKwWwSfY2zedu0+kehJoqGctiVI29o6fzry7u4= github.com/stretchr/testify v1.11.1 h1:7s2iGBzp5EwR7/aIZr8ao5+dra3wiQyKjjFuvgVKu7U= github.com/stretchr/testify v1.11.1/go.mod h1:wZwfW3scLgRK+23gO65QZefKpKQRnfz6sD981Nm4B6U= github.com/subosito/gotenv v1.2.0/go.mod h1:N0PQaV/YGNqwC0u51sEeR/aUtSLEXKX9iv69rRypqCw= @@ -699,8 +771,12 @@ github.com/ucan-wg/go-ucan v0.0.0-20240916120445-37f52863156c/go.mod h1:IiRc1OKW github.com/urfave/cli v1.22.10/go.mod h1:Gos4lmkARVdJ6EkW0WaNv/tZAAMe9V7XWyB60NtXRu0= github.com/valyala/bytebufferpool v1.0.0 h1:GqA5TC/0021Y/b9FG4Oi9Mr3q7XYx6KllzawFIhcdPw= github.com/valyala/bytebufferpool v1.0.0/go.mod h1:6bBcMArwyJ5K/AmCkWv1jt77kVWyCJ6HpOuEn7z0Csc= +github.com/valyala/fasthttp v1.70.0 h1:LAhMGcWk13QZWm85+eg8ZBNbrq5mnkWFGbHMUJHIdXA= 
+github.com/valyala/fasthttp v1.70.0/go.mod h1:oDZEHHkJ/Buyklg6uURmYs19442zFSnCIfX3j1FY3pE= github.com/valyala/fasttemplate v1.2.2 h1:lxLXG0uE3Qnshl9QyaK6XJxMXlQZELvChBOCmQD0Loo= github.com/valyala/fasttemplate v1.2.2/go.mod h1:KHLXt3tVN2HBp8eijSv/kGJopbvo7S+qRAEEKiv+SiQ= +github.com/versity/versitygw v1.4.1 h1:eNh58r3bmcwET0QWdkNmZgoo/QOO3GysT0YGtpnCyYc= +github.com/versity/versitygw v1.4.1/go.mod h1:8xbEr+kFj/wT/Tf7AOP/biagykBN3iSB5zMhd35Hi0g= github.com/warpfork/go-testmark v0.12.1 h1:rMgCpJfwy1sJ50x0M0NgyphxYYPMOODIJHhsXyEHU0s= github.com/warpfork/go-testmark v0.12.1/go.mod h1:kHwy7wfvGSPh1rQJYKayD4AbtNaeyZdcGi9tNJTaa5Y= github.com/warpfork/go-wish v0.0.0-20220906213052-39a1cc7a02d0 h1:GDDkbFiaK8jsSDJfjId/PEGEShv6ugrt4kYsC5UIDaQ= @@ -709,6 +785,14 @@ github.com/whyrusleeping/cbor-gen v0.3.1 h1:82ioxmhEYut7LBVGhGq8xoRkXPLElVuh5mV6 github.com/whyrusleeping/cbor-gen v0.3.1/go.mod h1:pM99HXyEbSQHcosHc0iW7YFmwnscr+t9Te4ibko05so= github.com/wlynxg/anet v0.0.5 h1:J3VJGi1gvo0JwZ/P1/Yc/8p63SoW98B5dHkYDmpgvvU= github.com/wlynxg/anet v0.0.5/go.mod h1:eay5PRQr7fIVAMbTbchTnO9gG65Hg/uYGdc7mguHxoA= +github.com/xdg-go/pbkdf2 v1.0.0 h1:Su7DPu48wXMwC3bs7MCNG+z4FhcyEuz5dlvchbq0B0c= +github.com/xdg-go/pbkdf2 v1.0.0/go.mod h1:jrpuAogTd400dnrH08LKmI/xc1MbPOebTwRqcT5RDeI= +github.com/xdg-go/scram v1.1.2 h1:FHX5I5B4i4hKRVRBCFRxq1iQRej7WO3hhBuJf+UUySY= +github.com/xdg-go/scram v1.1.2/go.mod h1:RT/sEzTbU5y00aCK8UOx6R7YryM0iF1N2MOmC3kKLN4= +github.com/xdg-go/stringprep v1.0.4 h1:XLI/Ng3O1Atzq0oBs3TWm+5ZVgkq2aqdlvP9JtoZ6c8= +github.com/xdg-go/stringprep v1.0.4/go.mod h1:mPGuuIYwz7CmR2bT9j4GbQqutWS1zV24gijq1dTyGkM= +github.com/xyproto/randomstring v1.0.5 h1:YtlWPoRdgMu3NZtP45drfy1GKoojuR7hmRcnhZqKjWU= +github.com/xyproto/randomstring v1.0.5/go.mod h1:rgmS5DeNXLivK7YprL0pY+lTuhNQW3iGxZ18UQApw/E= github.com/yuin/goldmark v1.1.25/go.mod h1:3hX8gzYuyVAZsxl0MRgGTJEmQBFcNTphYh9decYSb74= github.com/yuin/goldmark v1.1.27/go.mod h1:3hX8gzYuyVAZsxl0MRgGTJEmQBFcNTphYh9decYSb74= 
github.com/yuin/goldmark v1.1.32/go.mod h1:3hX8gzYuyVAZsxl0MRgGTJEmQBFcNTphYh9decYSb74= @@ -768,8 +852,6 @@ go.uber.org/zap v1.16.0/go.mod h1:MA8QOfq0BHJwdXa996Y4dYkAqRKB8/1K1QMMZVaNZjQ= go.uber.org/zap v1.17.0/go.mod h1:MXVU+bhUf/A7Xi2HNOnopQOrmycQ5Ih87HtOu4q5SSo= go.uber.org/zap v1.27.0 h1:aJMhYGrd5QSmlpLMr2MftRKl7t8J8PTZPA732ud/XR8= go.uber.org/zap v1.27.0/go.mod h1:GB2qFLM7cTU87MWRP2mPIjqfIDnGu+VIO4V/SdhGo2E= -go.uber.org/zap/exp v0.3.0 h1:6JYzdifzYkGmTdRR59oYH+Ng7k49H9qVpWwNSsGJj3U= -go.uber.org/zap/exp v0.3.0/go.mod h1:5I384qq7XGxYyByIhHm6jg5CHkGY0nsTfbDLgDDlgJQ= go.yaml.in/yaml/v3 v3.0.4 h1:tfq32ie2Jv2UxXFdLJdh3jXuOzWiL1fo0bu/FbuKpbc= go.yaml.in/yaml/v3 v3.0.4/go.mod h1:DhzuOOF2ATzADvBadXxruRBLzYTpT36CKvDb3+aBEFg= golang.org/x/crypto v0.0.0-20181029021203-45a5f77698d3/go.mod h1:6SG95UA2DQfeDnfUPMdvaQW0Q7yPrPDi9nlGo2tz2b4= @@ -781,8 +863,8 @@ golang.org/x/crypto v0.0.0-20191011191535-87dc89f01550/go.mod h1:yigFU9vqHzYiE8U golang.org/x/crypto v0.0.0-20200622213623-75b288015ac9/go.mod h1:LzIPMQfyMNhhGPhUkYOs5KpL4U8rLKemX1yGLhDgUto= golang.org/x/crypto v0.0.0-20210711020723-a769d52b0f97/go.mod h1:GvvjBRRGRdwPK5ydBHafDWAxML/pGHZbMvKqRZ5+Abc= golang.org/x/crypto v0.0.0-20210921155107-089bfa567519/go.mod h1:GvvjBRRGRdwPK5ydBHafDWAxML/pGHZbMvKqRZ5+Abc= -golang.org/x/crypto v0.49.0 h1:+Ng2ULVvLHnJ/ZFEq4KdcDd/cfjrrjjNSXNzxg0Y4U4= -golang.org/x/crypto v0.49.0/go.mod h1:ErX4dUh2UM+CFYiXZRTcMpEcN8b/1gxEuv3nODoYtCA= +golang.org/x/crypto v0.50.0 h1:zO47/JPrL6vsNkINmLoo/PH1gcxpls50DNogFvB5ZGI= +golang.org/x/crypto v0.50.0/go.mod h1:3muZ7vA7PBCE6xgPX7nkzzjiUq87kRItoJQM1Yo8S+Q= golang.org/x/exp v0.0.0-20190121172915-509febef88a4/go.mod h1:CJ0aWSM057203Lf6IL+f9T1iT9GByDxfZKAQTCR3kQA= golang.org/x/exp v0.0.0-20190306152737-a1d7652674e8/go.mod h1:CJ0aWSM057203Lf6IL+f9T1iT9GByDxfZKAQTCR3kQA= golang.org/x/exp v0.0.0-20190510132918-efd6b22b2522/go.mod h1:ZjyILWgesfNpC6sMxTJOJm9Kp84zZh5NQWvqDGG3Qr8= @@ -864,8 +946,8 @@ golang.org/x/net v0.0.0-20210405180319-a5a99cb37ef4/go.mod 
h1:p54w0d4576C0XHj96b golang.org/x/net v0.0.0-20220722155237-a158d28d115b/go.mod h1:XRhObCWvk6IyKnWLug+ECip1KBveYUHfp+8e9klMJ9c= golang.org/x/net v0.6.0/go.mod h1:2Tu9+aMcznHK/AK1HMvgo6xiTLG5rD5rZLDS+rp2Bjs= golang.org/x/net v0.8.0/go.mod h1:QVkue5JL9kW//ek3r6jTKnTFis1tRmNAW2P1shuFdJc= -golang.org/x/net v0.52.0 h1:He/TN1l0e4mmR3QqHMT2Xab3Aj3L9qjbhRm78/6jrW0= -golang.org/x/net v0.52.0/go.mod h1:R1MAz7uMZxVMualyPXb+VaqGSa3LIaUqk0eEt3w36Sw= +golang.org/x/net v0.53.0 h1:d+qAbo5L0orcWAr0a9JweQpjXF19LMXJE8Ey7hwOdUA= +golang.org/x/net v0.53.0/go.mod h1:JvMuJH7rrdiCfbeHoo3fCQU24Lf5JJwT9W3sJFulfgs= golang.org/x/oauth2 v0.0.0-20180821212333-d2e6202438be/go.mod h1:N/0e6XlmueqKjAGxoOufVs8QHGRruUQn6yWY3a++T0U= golang.org/x/oauth2 v0.0.0-20190226205417-e64efc72b421/go.mod h1:gOpvHmFTYa4IltrdGE7lF6nIHvwfUNPOp7c8zoXwtLw= golang.org/x/oauth2 v0.0.0-20190604053449-0f29369cfe45/go.mod h1:gOpvHmFTYa4IltrdGE7lF6nIHvwfUNPOp7c8zoXwtLw= @@ -930,6 +1012,7 @@ golang.org/x/sys v0.0.0-20201201145000-ef89a241ccb3/go.mod h1:h1NjWce9XRLGQEsW7w golang.org/x/sys v0.0.0-20201204225414-ed752295db88/go.mod h1:h1NjWce9XRLGQEsW7wpKNCjG9DtNlClVuFLEZdDNbEs= golang.org/x/sys v0.0.0-20210104204734-6f8348627aad/go.mod h1:h1NjWce9XRLGQEsW7wpKNCjG9DtNlClVuFLEZdDNbEs= golang.org/x/sys v0.0.0-20210119212857-b64e53b001e4/go.mod h1:h1NjWce9XRLGQEsW7wpKNCjG9DtNlClVuFLEZdDNbEs= +golang.org/x/sys v0.0.0-20210124154548-22da62e12c0c/go.mod h1:h1NjWce9XRLGQEsW7wpKNCjG9DtNlClVuFLEZdDNbEs= golang.org/x/sys v0.0.0-20210220050731-9a76102bfb43/go.mod h1:h1NjWce9XRLGQEsW7wpKNCjG9DtNlClVuFLEZdDNbEs= golang.org/x/sys v0.0.0-20210305230114-8fe3ee5dd75b/go.mod h1:h1NjWce9XRLGQEsW7wpKNCjG9DtNlClVuFLEZdDNbEs= golang.org/x/sys v0.0.0-20210315160823-c6e025ad8005/go.mod h1:h1NjWce9XRLGQEsW7wpKNCjG9DtNlClVuFLEZdDNbEs= @@ -945,14 +1028,14 @@ golang.org/x/sys v0.0.0-20220722155257-8c9f86f7a55f/go.mod h1:oPkhp1MJrh7nUepCBc golang.org/x/sys v0.1.0/go.mod h1:oPkhp1MJrh7nUepCBck5+mAzfO9JrbApNNgaTdGDITg= golang.org/x/sys v0.5.0/go.mod 
h1:oPkhp1MJrh7nUepCBck5+mAzfO9JrbApNNgaTdGDITg= golang.org/x/sys v0.6.0/go.mod h1:oPkhp1MJrh7nUepCBck5+mAzfO9JrbApNNgaTdGDITg= -golang.org/x/sys v0.42.0 h1:omrd2nAlyT5ESRdCLYdm3+fMfNFE/+Rf4bDIQImRJeo= -golang.org/x/sys v0.42.0/go.mod h1:4GL1E5IUh+htKOUEOaiffhrAeqysfVGipDYzABqnCmw= +golang.org/x/sys v0.43.0 h1:Rlag2XtaFTxp19wS8MXlJwTvoh8ArU6ezoyFsMyCTNI= +golang.org/x/sys v0.43.0/go.mod h1:4GL1E5IUh+htKOUEOaiffhrAeqysfVGipDYzABqnCmw= golang.org/x/term v0.0.0-20201126162022-7de9c90e9dd1/go.mod h1:bj7SfCRtBDWHUb9snDiAeCFNEtKQo2Wmx5Cou7ajbmo= golang.org/x/term v0.0.0-20210927222741-03fcf44c2211/go.mod h1:jbD1KX2456YbFQfuXm/mYQcufACuNUgVhRMnK/tPxf8= golang.org/x/term v0.5.0/go.mod h1:jMB1sMXY+tzblOD4FWmEbocvup2/aLOaQEp7JmGp78k= golang.org/x/term v0.6.0/go.mod h1:m6U89DPEgQRMq3DNkDClhWw02AUbt2daBVO4cn4Hv9U= -golang.org/x/term v0.41.0 h1:QCgPso/Q3RTJx2Th4bDLqML4W6iJiaXFq2/ftQF13YU= -golang.org/x/term v0.41.0/go.mod h1:3pfBgksrReYfZ5lvYM0kSO0LIkAl4Yl2bXOkKP7Ec2A= +golang.org/x/term v0.42.0 h1:UiKe+zDFmJobeJ5ggPwOshJIVt6/Ft0rcfrXZDLWAWY= +golang.org/x/term v0.42.0/go.mod h1:Dq/D+snpsbazcBG5+F9Q1n2rXV8Ma+71xEjTRufARgY= golang.org/x/text v0.0.0-20170915032832-14c0d48ead0c/go.mod h1:NqM8EUOU14njkJ3fqMW+pc6Ldnwhi/IjpwHt7yyuwOQ= golang.org/x/text v0.3.0/go.mod h1:NqM8EUOU14njkJ3fqMW+pc6Ldnwhi/IjpwHt7yyuwOQ= golang.org/x/text v0.3.1-0.20180807135948-17ff2d5776d2/go.mod h1:NqM8EUOU14njkJ3fqMW+pc6Ldnwhi/IjpwHt7yyuwOQ= @@ -963,13 +1046,13 @@ golang.org/x/text v0.3.5/go.mod h1:5Zoc/QRtKVWzQhOtBMvqHzDpF6irO9z98xDceosuGiQ= golang.org/x/text v0.3.7/go.mod h1:u+2+/6zg+i71rQMx5EYifcz6MCKuco9NR6JIITiCfzQ= golang.org/x/text v0.7.0/go.mod h1:mrYo+phRRbMaCq/xk9113O4dZlRixOauAjOtrjsXDZ8= golang.org/x/text v0.8.0/go.mod h1:e1OnstbJyHTd6l/uOt8jFFHp6TRDWZR/bV3emEE/zU8= -golang.org/x/text v0.35.0 h1:JOVx6vVDFokkpaq1AEptVzLTpDe9KGpj5tR4/X+ybL8= -golang.org/x/text v0.35.0/go.mod h1:khi/HExzZJ2pGnjenulevKNX1W67CUy0AsXcNubPGCA= +golang.org/x/text v0.36.0 h1:JfKh3XmcRPqZPKevfXVpI1wXPTqbkE5f7JA92a55Yxg= 
+golang.org/x/text v0.36.0/go.mod h1:NIdBknypM8iqVmPiuco0Dh6P5Jcdk8lJL0CUebqK164= golang.org/x/time v0.0.0-20181108054448-85acf8d2951c/go.mod h1:tRJNPiyCQ0inRvYxbN9jk5I+vvW/OXSQhTDSoE431IQ= golang.org/x/time v0.0.0-20190308202827-9d24e82272b4/go.mod h1:tRJNPiyCQ0inRvYxbN9jk5I+vvW/OXSQhTDSoE431IQ= golang.org/x/time v0.0.0-20191024005414-555d28b269f0/go.mod h1:tRJNPiyCQ0inRvYxbN9jk5I+vvW/OXSQhTDSoE431IQ= -golang.org/x/time v0.14.0 h1:MRx4UaLrDotUKUdCIqzPC48t1Y9hANFKIRpNx+Te8PI= -golang.org/x/time v0.14.0/go.mod h1:eL/Oa2bBBK0TkX57Fyni+NgnyQQN4LitPmob2Hjnqw4= +golang.org/x/time v0.15.0 h1:bbrp8t3bGUeFOx08pvsMYRTCVSMk89u4tKbNOZbp88U= +golang.org/x/time v0.15.0/go.mod h1:Y4YMaQmXwGQZoFaVFk4YpCt4FLQMYKZe9oeV/f4MSno= golang.org/x/tools v0.0.0-20180917221912-90fa682c2a6e/go.mod h1:n7NCudcB/nEzxVGmLbDWY5pfWTLqBcC2KZ6jyYvM4mQ= golang.org/x/tools v0.0.0-20190114222345-bf090417da8b/go.mod h1:n7NCudcB/nEzxVGmLbDWY5pfWTLqBcC2KZ6jyYvM4mQ= golang.org/x/tools v0.0.0-20190226205152-f727befe758c/go.mod h1:9Yl7xja0Znq3iFh3HoIrodX9oNMXvdceNzlUR8zjMvY= @@ -1022,6 +1105,7 @@ golang.org/x/tools v0.0.0-20201208233053-a543418bbed2/go.mod h1:emZCQorbCU4vsT4f golang.org/x/tools v0.0.0-20210105154028-b0ab187a4818/go.mod h1:emZCQorbCU4vsT4fOWvOPXz4eW1wZW4PmDk9uLelYpA= golang.org/x/tools v0.0.0-20210106214847-113979e3529a/go.mod h1:emZCQorbCU4vsT4fOWvOPXz4eW1wZW4PmDk9uLelYpA= golang.org/x/tools v0.1.0/go.mod h1:xkSsbof2nBLbhDlRMhhhyNLN/zl3eTqcnHD5viDpcZ0= +golang.org/x/tools v0.1.1/go.mod h1:o0xws9oXOQQZyjljx8fwUC0k7L1pTE6eaCbjGeHmOkk= golang.org/x/tools v0.1.2/go.mod h1:o0xws9oXOQQZyjljx8fwUC0k7L1pTE6eaCbjGeHmOkk= golang.org/x/tools v0.1.5/go.mod h1:o0xws9oXOQQZyjljx8fwUC0k7L1pTE6eaCbjGeHmOkk= golang.org/x/tools v0.1.12/go.mod h1:hNGJHUnrk76NpqgfD5Aqm5Crs+Hm0VOH/i9J2+nxYbc= @@ -1173,34 +1257,14 @@ honnef.co/go/tools v0.0.1-2020.1.3/go.mod h1:X/FiERA/W4tHapMX5mGpAtMSVEeEUOyHaw9 honnef.co/go/tools v0.0.1-2020.1.4/go.mod h1:X/FiERA/W4tHapMX5mGpAtMSVEeEUOyHaw9vFzvIQ3k= lukechampine.com/blake3 
v1.4.1 h1:I3Smz7gso8w4/TunLKec6K2fn+kyKtDxr/xcQEN84Wg= lukechampine.com/blake3 v1.4.1/go.mod h1:QFosUxmjB8mnrWFSNwKmvxHpfY72bmD2tQ0kBMM3kwo= -modernc.org/cc/v4 v4.27.1 h1:9W30zRlYrefrDV2JE2O8VDtJ1yPGownxciz5rrbQZis= -modernc.org/cc/v4 v4.27.1/go.mod h1:uVtb5OGqUKpoLWhqwNQo/8LwvoiEBLvZXIQ/SmO6mL0= -modernc.org/ccgo/v4 v4.30.2 h1:4yPaaq9dXYXZ2V8s1UgrC3KIj580l2N4ClrLwnbv2so= -modernc.org/ccgo/v4 v4.30.2/go.mod h1:yZMnhWEdW0qw3EtCndG1+ldRrVGS+bIwyWmAWzS0XEw= -modernc.org/fileutil v1.3.40 h1:ZGMswMNc9JOCrcrakF1HrvmergNLAmxOPjizirpfqBA= -modernc.org/fileutil v1.3.40/go.mod h1:HxmghZSZVAz/LXcMNwZPA/DRrQZEVP9VX0V4LQGQFOc= -modernc.org/gc/v2 v2.6.5 h1:nyqdV8q46KvTpZlsw66kWqwXRHdjIlJOhG6kxiV/9xI= -modernc.org/gc/v2 v2.6.5/go.mod h1:YgIahr1ypgfe7chRuJi2gD7DBQiKSLMPgBQe9oIiito= -modernc.org/gc/v3 v3.1.2 h1:ZtDCnhonXSZexk/AYsegNRV1lJGgaNZJuKjJSWKyEqo= -modernc.org/gc/v3 v3.1.2/go.mod h1:HFK/6AGESC7Ex+EZJhJ2Gni6cTaYpSMmU/cT9RmlfYY= -modernc.org/goabi0 v0.2.0 h1:HvEowk7LxcPd0eq6mVOAEMai46V+i7Jrj13t4AzuNks= -modernc.org/goabi0 v0.2.0/go.mod h1:CEFRnnJhKvWT1c1JTI3Avm+tgOWbkOu5oPA8eH8LnMI= modernc.org/libc v1.68.0 h1:PJ5ikFOV5pwpW+VqCK1hKJuEWsonkIJhhIXyuF/91pQ= modernc.org/libc v1.68.0/go.mod h1:NnKCYeoYgsEqnY3PgvNgAeaJnso968ygU8Z0DxjoEc0= modernc.org/mathutil v1.7.1 h1:GCZVGXdaN8gTqB1Mf/usp1Y/hSqgI2vAGGP4jZMCxOU= modernc.org/mathutil v1.7.1/go.mod h1:4p5IwJITfppl0G4sUEDtCr4DthTaT47/N3aT6MhfgJg= modernc.org/memory v1.11.0 h1:o4QC8aMQzmcwCK3t3Ux/ZHmwFPzE6hf2Y5LbkRs+hbI= modernc.org/memory v1.11.0/go.mod h1:/JP4VbVC+K5sU2wZi9bHoq2MAkCnrt2r98UGeSK7Mjw= -modernc.org/opt v0.1.4 h1:2kNGMRiUjrp4LcaPuLY2PzUfqM/w9N23quVwhKt5Qm8= -modernc.org/opt v0.1.4/go.mod h1:03fq9lsNfvkYSfxrfUhZCWPk1lm4cq4N+Bh//bEtgns= -modernc.org/sortutil v1.2.1 h1:+xyoGf15mM3NMlPDnFqrteY07klSFxLElE2PVuWIJ7w= -modernc.org/sortutil v1.2.1/go.mod h1:7ZI3a3REbai7gzCLcotuw9AC4VZVpYMjDzETGsSMqJE= modernc.org/sqlite v1.46.1 h1:eFJ2ShBLIEnUWlLy12raN0Z1plqmFX9Qe3rjQTKt6sU= modernc.org/sqlite v1.46.1/go.mod 
h1:CzbrU2lSB1DKUusvwGz7rqEKIq+NUd8GWuBBZDs9/nA= -modernc.org/strutil v1.2.1 h1:UneZBkQA+DX2Rp35KcM69cSsNES9ly8mQWD71HKlOA0= -modernc.org/strutil v1.2.1/go.mod h1:EHkiggD70koQxjVdSBM3JKM7k6L0FbGE5eymy9i3B9A= -modernc.org/token v1.1.0 h1:Xl7Ap9dKaEs5kLoOQeQmPWevfnk/DM5qcLcYlA8ys6Y= -modernc.org/token v1.1.0/go.mod h1:UGzOrNV1mAFSEB63lOFHIpNRUVMvYTc6yu1SMY/XTDM= pgregory.net/rapid v1.2.0 h1:keKAYRcjm+e1F0oAuU5F5+YPAWcyxNNRK2wud503Gnk= pgregory.net/rapid v1.2.0/go.mod h1:PY5XlDGj0+V1FCq0o192FdRhpKHGTRIWBgqjDBTrq04= rsc.io/binaryregexp v0.2.0/go.mod h1:qTv7/COck+e2FymRvadv62gMdZztPaShugOCi3I+8D8= diff --git a/internal/config/config.go b/internal/config/config.go index 70cd97e..47b0a32 100644 --- a/internal/config/config.go +++ b/internal/config/config.go @@ -155,48 +155,34 @@ type MS3TConfig struct { Enabled bool `mapstructure:"enabled"` // Addr is the host:port to bind the S3 listener to. Addr string `mapstructure:"addr"` - // DataDir is where ms3t persists its SQLite blockstore + bucket - // registry and (when Forge is disabled) emits CARs to disk. + // DataDir is where ms3t persists its log segments, space key, and + // any other on-disk state. DataDir string `mapstructure:"data_dir"` // ChunkSize is the body chunk size used for new objects, in bytes. // 0 → ms3t default (1 MiB). ChunkSize int64 `mapstructure:"chunk_size"` - // BatchBytes is the buffered-CAR size at which the uploader - // flushes. 0 → ms3t default (64 MiB). - BatchBytes int64 `mapstructure:"batch_bytes"` - // BatchAge is the idle interval after which the uploader flushes. - // 0 → ms3t default (5s). - BatchAge string `mapstructure:"batch_age"` - - // Forge controls whether ms3t ships CARs to a Storacha Forge - // stack via guppy. When disabled (the default), CARs go to disk - // only under DataDir/cars. - Forge MS3TForgeConfig `mapstructure:"forge"` -} - -// MS3TForgeConfig holds the optional Forge upload integration. 
-// When MS3T.Forge.Enabled is true, every batched CAR is shipped to -// piri through sprue's own routing, piriclient, and indexerclient -// — no UCAN-over-HTTP loopback, no separate principal/delegation -// setup. -// -// ms3t generates and persists its own space keypair on first run. -// The space's DID is derived from that keypair, and ms3t acts as -// the root UCAN authority over its own space (so it can issue the -// retrieval delegations the indexer needs to validate writes). -type MS3TForgeConfig struct { - Enabled bool `mapstructure:"enabled"` - // SpaceKeyFile is the path to the persisted space keypair. - // Generated on first run if missing. Defaults to - // /space.key. - SpaceKeyFile string `mapstructure:"space_key_file"` - // NoCache routes all block reads (MST nodes, manifests, body - // chunks) through the indexing-service + piri instead of a local - // SQLite cache, AND makes writes synchronous to Forge (Batched - // is bypassed). Closes the read-after-write race; raises per-PUT - // latency to the cost of the Forge round trip. Requires - // Enabled = true. - NoCache bool `mapstructure:"no_cache"` + // SealBytes is the open-segment size at which the log seals and + // sends the segment to the background flusher. 0 → 64 MiB. + SealBytes int64 `mapstructure:"seal_bytes"` + // SealAge is the maximum time the log will keep an open segment + // before sealing it. Drives the seal cadence under low write + // volume. 0 → 5s. + SealAge string `mapstructure:"seal_age"` + // Retain is the number of most-recent sealed segments to keep on + // disk after a successful Forge flush. Older flushed segments are + // unlinked. Higher values trade disk for read locality. 0 → 6. + Retain int `mapstructure:"retain"` + + // Region is the AWS region advertised to S3 clients. Used by the + // versitygw protocol layer for sigv4 verification. + Region string `mapstructure:"region"` + // RootAccess is the access key id of the single-account IAM + // root user. 
Required when Enabled is true. + RootAccess string `mapstructure:"root_access"` + // RootSecret is the secret access key paired with RootAccess. + // Required when Enabled is true. Provide via env + // (SPRUE_MS3T_ROOT_SECRET) — do not commit to config files. + RootSecret string `mapstructure:"root_secret"` } type MailerConfig struct { @@ -266,8 +252,10 @@ func SetDefaults(v *viper.Viper) { v.SetDefault("ms3t.enabled", false) v.SetDefault("ms3t.addr", ":9000") v.SetDefault("ms3t.data_dir", "./ms3t-data") - v.SetDefault("ms3t.batch_age", "5s") - v.SetDefault("ms3t.forge.enabled", false) + v.SetDefault("ms3t.seal_bytes", 64<<20) + v.SetDefault("ms3t.seal_age", "5s") + v.SetDefault("ms3t.retain", 6) + v.SetDefault("ms3t.region", "us-east-1") } // BindEnvVars sets up environment variable binding with SPRUE_ prefix. diff --git a/internal/fx/app.go b/internal/fx/app.go index e6fb3a8..b49e155 100644 --- a/internal/fx/app.go +++ b/internal/fx/app.go @@ -9,6 +9,7 @@ import ( "github.com/storacha/sprue/internal/fx/store/aws" "github.com/storacha/sprue/internal/fx/store/memory" "github.com/storacha/sprue/internal/fx/store/postgres" + "github.com/storacha/sprue/pkg/ms3t" "go.uber.org/fx" ) @@ -24,7 +25,7 @@ var AppModule = func(cfg *config.Config) fx.Option { service.Module, handlers.Module, ServerModule, - MS3TModule, + ms3t.Module, } switch cfg.Storage.Type { case config.StorageTypeMemory: diff --git a/internal/fx/ms3t.go b/internal/fx/ms3t.go deleted file mode 100644 index 7f9cd89..0000000 --- a/internal/fx/ms3t.go +++ /dev/null @@ -1,314 +0,0 @@ -package fx - -import ( - "context" - "database/sql" - "errors" - "fmt" - "log/slog" - "net/http" - "os" - "path/filepath" - "time" - - cbor "github.com/ipfs/go-ipld-cbor" - "github.com/storacha/go-ucanto/did" - "github.com/storacha/go-ucanto/principal" - "go.uber.org/fx" - "go.uber.org/zap" - "go.uber.org/zap/exp/zapslog" - - "github.com/storacha/sprue/internal/config" - "github.com/storacha/sprue/pkg/identity" - 
"github.com/storacha/sprue/pkg/indexerclient" - "github.com/storacha/sprue/pkg/ms3t/blockstore" - "github.com/storacha/sprue/pkg/ms3t/bucket" - "github.com/storacha/sprue/pkg/ms3t/registry" - "github.com/storacha/sprue/pkg/ms3t/server" - "github.com/storacha/sprue/pkg/ms3t/uploader" - "github.com/storacha/sprue/pkg/piriclient" - "github.com/storacha/sprue/pkg/routing" - - _ "modernc.org/sqlite" -) - -// MS3TModule registers the embedded ms3t S3 listener. When -// config.MS3T.Enabled is false the module is a no-op, so it's safe -// to always include in the app graph. -var MS3TModule = fx.Module("ms3t", - fx.Invoke(RegisterMS3TLifecycle), -) - -// MS3TDeps bundles the sprue-internal services ms3t pulls when -// config.MS3T.Forge.Enabled is true. Marked optional so disabled -// deployments don't fail to construct (e.g., the indexer client -// short-circuits to nil when the indexer endpoint isn't set). -type MS3TDeps struct { - fx.In - - Identity *identity.Identity - Router *routing.Service - PiriProvider piriclient.Provider - IndexerClient *indexerclient.Client `optional:"true"` -} - -// RegisterMS3TLifecycle wires ms3t's bucket service, HTTP handler, -// and listener into the fx lifecycle. Construction failures (bad -// config, missing service for forge mode) are returned synchronously -// so fx can abort startup before any other module initializes. -func RegisterMS3TLifecycle( - lc fx.Lifecycle, - cfg *config.Config, - zlog *zap.Logger, - deps MS3TDeps, -) error { - mc := cfg.MS3T - if !mc.Enabled { - return nil - } - - if err := os.MkdirAll(mc.DataDir, 0o755); err != nil { - return fmt.Errorf("ms3t: mkdir data dir: %w", err) - } - - noCache := mc.Forge.Enabled && mc.Forge.NoCache - - // When Forge is enabled, load or generate ms3t's own space - // keypair. ms3t IS the space owner (root UCAN authority) so that - // self-issued space/content/retrieve delegations validate down - // the chain to piri's retrieval auth check. 
- var spaceSigner principal.Signer - if mc.Forge.Enabled { - keyPath := mc.Forge.SpaceKeyFile - if keyPath == "" { - keyPath = filepath.Join(mc.DataDir, "space.key") - } - s, err := uploader.LoadOrCreateSigner(keyPath) - if err != nil { - return fmt.Errorf("ms3t: space signer: %w", err) - } - spaceSigner = s - zlog.Info("ms3t space loaded", - zap.String("space_did", spaceSigner.DID().String()), - zap.String("key_file", keyPath), - ) - } - - // Build the blockstore. In no_cache mode this is a Forge-backed - // read-only store (every Get hits indexer + piri); SQLite is - // skipped entirely. Otherwise we open a SQLite file under - // data_dir. - var bs cbor.IpldBlockstore - var sqliteDB *sql.DB - - if noCache { - fb, err := blockstore.NewForge(blockstore.ForgeConfig{ - IndexerEndpoint: cfg.Indexer.Endpoint, - IndexerDID: cfg.Indexer.DID, - Spaces: []did.DID{spaceSigner.DID()}, - Signer: deps.Identity.Signer, - SpaceSigner: spaceSigner, - Logger: zlog, - }) - if err != nil { - return fmt.Errorf("ms3t: forge blockstore: %w", err) - } - bs = fb - } else { - dbPath := filepath.Join(mc.DataDir, "ms3t.db") - db, err := sql.Open("sqlite", dbPath+"?_pragma=journal_mode(WAL)&_pragma=foreign_keys(on)") - if err != nil { - return fmt.Errorf("ms3t: open sqlite: %w", err) - } - db.SetMaxOpenConns(1) - sb, err := blockstore.New(db) - if err != nil { - _ = db.Close() - return fmt.Errorf("ms3t: blockstore: %w", err) - } - bs = sb - sqliteDB = db - } - - // Registry always lives in SQLite; in no_cache mode it's the - // only thing in the SQLite file. Open a (different) DB so - // reusing one connection isn't a concern. 
- regDBPath := filepath.Join(mc.DataDir, "ms3t-registry.db") - if !noCache { - // reuse the same db for registry when SQLite blockstore is in use - regDBPath = "" - } - regDB, err := openRegistryDB(regDBPath, sqliteDB) - if err != nil { - if sqliteDB != nil { - _ = sqliteDB.Close() - } - return fmt.Errorf("ms3t: registry db: %w", err) - } - reg, err := registry.NewSQL(regDB) - if err != nil { - _ = regDB.Close() - if sqliteDB != nil && sqliteDB != regDB { - _ = sqliteDB.Close() - } - return fmt.Errorf("ms3t: registry: %w", err) - } - - carDir := filepath.Join(mc.DataDir, "cars") - innerUp, err := buildMS3TInnerUploader(mc, carDir, deps, spaceSigner, zlog) - if err != nil { - _ = regDB.Close() - if sqliteDB != nil && sqliteDB != regDB { - _ = sqliteDB.Close() - } - return fmt.Errorf("ms3t: uploader: %w", err) - } - - // Wrap in Batched unless no_cache is on. In no_cache mode every - // PUT blocks on the full Forge round trip, closing the - // read-after-write race. - var up uploader.Uploader - if noCache { - up = innerUp - } else { - batchAge, err := parseDurationOr(mc.BatchAge, 5*time.Second) - if err != nil { - _ = regDB.Close() - if sqliteDB != nil && sqliteDB != regDB { - _ = sqliteDB.Close() - } - return fmt.Errorf("ms3t: batch_age: %w", err) - } - batchBytes := mc.BatchBytes - if batchBytes <= 0 { - batchBytes = 64 << 20 - } - up = uploader.NewBatched(innerUp, uploader.BatchedOptions{ - MaxBytes: batchBytes, - MaxAge: batchAge, - }) - } - - chunkSize := mc.ChunkSize - if chunkSize <= 0 { - chunkSize = bucket.DefaultChunkSize - } - svc := bucket.New(bs, reg, bucket.Options{ - ChunkSize: chunkSize, - Uploader: up, - }) - - // Adapt sprue's zap logger into ms3t's slog interface so log - // output funnels through one pipeline. 
- slogger := slog.New(zapslog.NewHandler(zlog.Core(), zapslog.WithName("ms3t"))) - httpHandler := server.New(svc, slogger) - srv := &http.Server{Addr: mc.Addr, Handler: httpHandler} - - lc.Append(fx.Hook{ - OnStart: func(ctx context.Context) error { - if err := svc.Recover(ctx); err != nil { - return fmt.Errorf("ms3t: recover: %w", err) - } - zlog.Info("starting ms3t S3 listener", - zap.String("addr", mc.Addr), - zap.String("data_dir", mc.DataDir), - zap.Bool("forge", mc.Forge.Enabled), - zap.Bool("no_cache", noCache), - zap.Int64("chunk_size", chunkSize), - ) - go func() { - if err := srv.ListenAndServe(); err != nil && !errors.Is(err, http.ErrServerClosed) { - zlog.Error("ms3t listener error", zap.Error(err)) - } - }() - return nil - }, - OnStop: func(ctx context.Context) error { - zlog.Info("shutting down ms3t S3 listener") - shutdownCtx, cancel := context.WithTimeout(ctx, 30*time.Second) - defer cancel() - - var errs []error - if err := srv.Shutdown(shutdownCtx); err != nil { - errs = append(errs, fmt.Errorf("http shutdown: %w", err)) - } - if err := svc.Shutdown(shutdownCtx); err != nil { - errs = append(errs, fmt.Errorf("service shutdown: %w", err)) - } - if err := regDB.Close(); err != nil { - errs = append(errs, fmt.Errorf("registry db close: %w", err)) - } - if sqliteDB != nil && sqliteDB != regDB { - if err := sqliteDB.Close(); err != nil { - errs = append(errs, fmt.Errorf("blockstore db close: %w", err)) - } - } - if len(errs) > 0 { - return fmt.Errorf("ms3t shutdown: %v", errs) - } - return nil - }, - }) - return nil -} - -// openRegistryDB returns a *sql.DB for the registry. If reuse is -// non-nil, returns it (registry shares the SQLite file with the -// blockstore). Otherwise opens a fresh sqlite db at path. 
-func openRegistryDB(path string, reuse *sql.DB) (*sql.DB, error) { - if reuse != nil { - return reuse, nil - } - db, err := sql.Open("sqlite", path+"?_pragma=journal_mode(WAL)&_pragma=foreign_keys(on)") - if err != nil { - return nil, err - } - db.SetMaxOpenConns(1) - return db, nil -} - -// buildMS3TInnerUploader returns the inner uploader. With -// ms3t.forge.enabled = false the inner is uploader.Disk (writes CARs -// to a local directory). With it set to true, it's uploader.Internal -// — sprue's piriclient and indexerclient with sprue's identity as -// the signer and ms3t's self-generated space keypair as the space -// owner. -func buildMS3TInnerUploader( - mc config.MS3TConfig, - carDir string, - deps MS3TDeps, - spaceSigner principal.Signer, - zlog *zap.Logger, -) (uploader.Uploader, error) { - if !mc.Forge.Enabled { - return uploader.NewDisk(carDir) - } - if deps.IndexerClient == nil { - return nil, fmt.Errorf("ms3t.forge requires the indexer client; configure indexer.endpoint") - } - if spaceSigner == nil { - return nil, fmt.Errorf("ms3t.forge requires a space signer (internal error)") - } - - zlog.Info("ms3t internal uploader configured", - zap.String("space_did", spaceSigner.DID().String()), - zap.String("signer_did", deps.Identity.DID()), - zap.Bool("no_cache", mc.Forge.NoCache), - ) - - return uploader.NewInternal(uploader.InternalConfig{ - Router: deps.Router, - PiriProvider: deps.PiriProvider, - IndexerClient: deps.IndexerClient, - Signer: deps.Identity.Signer, - SpaceSigner: spaceSigner, - Logger: zlog, - }) -} - -func parseDurationOr(s string, dflt time.Duration) (time.Duration, error) { - if s == "" { - return dflt, nil - } - return time.ParseDuration(s) -} diff --git a/pkg/ms3t/architectural.md b/pkg/ms3t/architectural.md index 27db919..1678c26 100644 --- a/pkg/ms3t/architectural.md +++ b/pkg/ms3t/architectural.md @@ -398,6 +398,68 @@ We have not measured anything precisely. These are rough impressions. 
(when forge disabled, in the cache mode that isn't currently deployed). +## Aligning with the RFCs + +There's a parallel design effort for the per-object data layout that +predates this prototype: + +- `shard.rfc` (in this repo) — Forge S3 Facade sharding strategy +- [storacha/RFC #65](https://github.com/storacha/RFC/pull/65) — Filepack archive format +- [storacha/RFC #66](https://github.com/storacha/RFC/pull/66) — Virtual DAG in Sharded DAG Index + +Together these propose: shard at 256 MB; each shard is a Filepack +data archive (raw concatenated bytes, no CAR overhead); a UnixFS +File root links the shards in order; a v0.2 Sharded DAG Index +inlines that UnixFS root via its new `blocks` property. + +The MST-as-bucket idea is **orthogonal to all three RFCs** — they +address per-object data layout, not how a bucket is structured. So +the MST work in this PR is independent of whether we adopt the +RFCs' direction. + +The per-object layer of this prototype diverges from the RFCs: + +| | RFCs | this PR | +|---|---|---| +| shard format | Filepack (raw bytes) | raw IPLD blocks inside one CAR | +| per-object root | UnixFS File node | `ObjectManifest` (CBOR) | +| SDI version | v0.2 with inline `blocks` | v0.1 | +| chunk/shard size | 256 MB | 1 MiB | + +Aligning would mean replacing the body fields of `ObjectManifest` +with a single `cid.Cid` pointing at the UnixFS root, and producing +Filepack shards instead of raw blocks inside a CAR. The MST +machinery is unaffected. + +### ObjectManifest still has a place under the RFCs + +Even after aligning with the RFCs, we'd still want a small per-object +manifest block: + +```go +type ObjectManifest struct { + Content cid.Cid // → UnixFS File root + ContentType string // S3 needs this on GET + Created int64 // S3 needs this for Last-Modified + // user metadata, cache-control, etc. 
as needed +} +``` + +The S3 protocol metadata (Content-Type, Last-Modified, user +`x-amz-meta-*` headers) doesn't have a natural home in UnixFS or +the SDI. UnixFS-Plus extensibility is thin and not well-supported. +Inlining the manifest as a block in the SDI's `blocks` (alongside +the UnixFS root) is possible but mixes layers. + +Decision: keep ObjectManifest as a separate CBOR block in the same +CAR as the MST mutation, with the MST leaf pointing at the manifest +CID — the same shape we have today. Just smaller, with the body +fields replaced by a single Content link to the UnixFS root. + +If GET latency becomes a real concern, inlining the manifest block +in the per-object SDI is a one-line change and saves a network hop. +Defer until needed. + ## Future directions (not implemented) ### Direct passthrough diff --git a/pkg/ms3t/blockstore/buffered.go b/pkg/ms3t/blockstore/buffered.go deleted file mode 100644 index 1b15a0c..0000000 --- a/pkg/ms3t/blockstore/buffered.go +++ /dev/null @@ -1,111 +0,0 @@ -package blockstore - -import ( - "context" - "fmt" - "sync" - - "github.com/storacha/sprue/pkg/ms3t/uploader" - block "github.com/ipfs/go-block-format" - "github.com/ipfs/go-cid" - cbor "github.com/ipfs/go-ipld-cbor" -) - -// CARBuffer is a per-S3-op IpldBlockstore that captures every Put — body -// chunks, MST nodes, ObjectManifests — in memory. On Commit it submits -// the entire batch to the configured Uploader (which may flush -// immediately or buffer further) and then flushes the blocks to the -// underlying canonical store. -// -// Reads check the in-memory buffer first and fall through to the -// underlying store on miss. This lets the MST's GetPointer recompute -// path Put a node and immediately re-Read it during the same op. -// -// Single-shot per session: create a CARBuffer at the start of an S3 op, -// Put any number of blocks, then call Commit(root) exactly once on -// success or Discard on failure. 
-// -// Safe for concurrent reads but writes are serialized within one -// session — the bucket service holds a per-bucket mutex around the -// whole flow. -type CARBuffer struct { - underlying cbor.IpldBlockstore - uploader uploader.Uploader - - mu sync.RWMutex - blocks map[cid.Cid]block.Block - order []cid.Cid -} - -// NewCARBuffer constructs a per-op buffer backed by underlying for -// reads and Submitting to up at Commit. -func NewCARBuffer(underlying cbor.IpldBlockstore, up uploader.Uploader) *CARBuffer { - return &CARBuffer{ - underlying: underlying, - uploader: up, - blocks: map[cid.Cid]block.Block{}, - } -} - -func (b *CARBuffer) Get(ctx context.Context, c cid.Cid) (block.Block, error) { - b.mu.RLock() - blk, ok := b.blocks[c] - b.mu.RUnlock() - if ok { - return blk, nil - } - return b.underlying.Get(ctx, c) -} - -func (b *CARBuffer) Put(_ context.Context, blk block.Block) error { - b.mu.Lock() - defer b.mu.Unlock() - if _, exists := b.blocks[blk.Cid()]; !exists { - b.blocks[blk.Cid()] = blk - b.order = append(b.order, blk.Cid()) - } - return nil -} - -// Commit submits the buffered blocks to the Uploader rooted at root, -// then flushes them to the underlying canonical store. Empties the -// buffer on success. -func (b *CARBuffer) Commit(ctx context.Context, root cid.Cid) error { - b.mu.Lock() - defer b.mu.Unlock() - - if len(b.order) == 0 { - return nil - } - - blks := make([]block.Block, len(b.order)) - for i, c := range b.order { - blks[i] = b.blocks[c] - } - - if err := b.uploader.Submit(ctx, []cid.Cid{root}, blks); err != nil { - return fmt.Errorf("carbuffer: submit: %w", err) - } - - for _, blk := range blks { - if err := b.underlying.Put(ctx, blk); err != nil { - return fmt.Errorf("carbuffer: flush %s: %w", blk.Cid(), err) - } - } - - b.blocks = map[cid.Cid]block.Block{} - b.order = nil - return nil -} - -// Discard drops any buffered blocks without submitting or flushing -// them. 
Use this when the surrounding op has failed and in-flight -// blocks should be abandoned. -func (b *CARBuffer) Discard() { - b.mu.Lock() - defer b.mu.Unlock() - b.blocks = map[cid.Cid]block.Block{} - b.order = nil -} - -var _ cbor.IpldBlockstore = (*CARBuffer)(nil) diff --git a/pkg/ms3t/blockstore/forge.go b/pkg/ms3t/blockstore/forge.go index fb2e69e..c370520 100644 --- a/pkg/ms3t/blockstore/forge.go +++ b/pkg/ms3t/blockstore/forge.go @@ -11,10 +11,9 @@ import ( block "github.com/ipfs/go-block-format" "github.com/ipfs/go-cid" - cbor "github.com/ipfs/go-ipld-cbor" - captypes "github.com/storacha/go-libstoracha/capabilities/types" "github.com/storacha/go-libstoracha/capabilities/assert" contentcap "github.com/storacha/go-libstoracha/capabilities/space/content" + captypes "github.com/storacha/go-libstoracha/capabilities/types" "github.com/storacha/go-libstoracha/failure" rclient "github.com/storacha/go-ucanto/client/retrieval" "github.com/storacha/go-ucanto/core/dag/blockstore" @@ -29,6 +28,10 @@ import ( "go.uber.org/zap" ) +// ErrNotFound is returned by Get when the indexing-service has no +// location commitment for the requested CID. +var ErrNotFound = errors.New("blockstore: not found") + // Forge is a read-only IpldBlockstore that resolves CIDs through the // Storacha indexing-service and fetches the underlying bytes via // authorized UCAN-wrapped GETs against piri storage nodes. @@ -50,6 +53,8 @@ type Forge struct { logger *zap.Logger } +var _ BlockReader = (*Forge)(nil) + // ForgeConfig wires sprue's existing services into a read-only Forge // blockstore. type ForgeConfig struct { @@ -130,11 +135,11 @@ func NewForge(cfg ForgeConfig) (*Forge, error) { }, nil } -// Get resolves the CID through the indexer and retrieves the +// GetBlock resolves the CID through the indexer and retrieves the // underlying byte slice from piri via a UCAN-authorized // `space/content/retrieve` invocation. 
The request is scoped to // the inner block's offset/length within the containing CAR shard. -func (f *Forge) Get(ctx context.Context, c cid.Cid) (block.Block, error) { +func (f *Forge) GetBlock(ctx context.Context, c cid.Cid) (block.Block, error) { locations, err := f.locator.Locate(ctx, f.spaces, c.Hash()) if err != nil { var nf locator.NotFoundError @@ -190,9 +195,9 @@ func (f *Forge) Get(ctx context.Context, c cid.Cid) (block.Block, error) { rangeEnd := rangeStart + loc.Position.Length - 1 inv, err := contentcap.Retrieve.Invoke( - f.signer, // issuer = sprue - storageProvider, // audience = piri - space.String(), // with = space + f.signer, // issuer = sprue + storageProvider, // audience = piri + space.String(), // with = space contentcap.RetrieveCaveats{ Blob: contentcap.BlobDigest{Digest: caveats.Content.Hash()}, Range: contentcap.Range{Start: rangeStart, End: rangeEnd}, @@ -247,14 +252,6 @@ func (f *Forge) Get(ctx context.Context, c cid.Cid) (block.Block, error) { return block.NewBlockWithCid(body, c) } -// Put is a no-op. CARBuffer.Commit calls Put on its underlying -// blockstore for every freshly-Submitted block; in no_cache mode we -// don't want to persist anything locally because the uploader has -// already shipped the data to Forge. -func (f *Forge) Put(_ context.Context, _ block.Block) error { - return nil -} - // newAuthorizeRetrieval returns the AuthorizeRetrievalFunc the // IndexLocator calls before each indexer query. 
The space signer // (root authority) directly authorizes the indexer to fetch any @@ -284,5 +281,3 @@ func newAuthorizeRetrieval(spaceSigner principal.Signer, indexerDID did.DID) loc ) } } - -var _ cbor.IpldBlockstore = (*Forge)(nil) diff --git a/pkg/ms3t/blockstore/layered.go b/pkg/ms3t/blockstore/layered.go new file mode 100644 index 0000000..e59955e --- /dev/null +++ b/pkg/ms3t/blockstore/layered.go @@ -0,0 +1,81 @@ +package blockstore + +import ( + "context" + "errors" + + block "github.com/ipfs/go-block-format" + "github.com/ipfs/go-cid" +) + +// Layered is the production ReadStore: a read-only seam that +// consults the local LSM log first (skipped when the log is nil), +// then a base blockstore (typically *Forge — indexing-service + +// piri). +// +// It exposes both halves of ReadStore from a single underlying +// traversal: +// +// - GetBlock returns raw blocks (body chunks). +// - Get fetches the same blocks and CBOR-decodes them (manifests, +// MST nodes), via an internal Store wrapped around our own +// GetBlock so the log → base ordering is preserved. +// +// Layered has no Put: real writes flow through bucketop.Tx → +// OpStaging → Log.AppendBatch. +type Layered struct { + log Log + base BlockReader + + // cstSelf is a CBOR view backed by Layered's own GetBlock. The + // adapter exposes Layered as a BaseStore so CborStore can wrap + // it; the CBOR decoder's block fetches come back through + // GetBlock and reuse the log → base fallthrough. + cstSelf Store +} + +// NewLayered wires a log store in front of a base blockstore. +func NewLayered(log Log, base BlockReader) *Layered { + l := &Layered{log: log, base: base} + l.cstSelf = CborStore(layeredAsBlockstore{l}) + return l +} + +// Get fetches a CBOR-encoded value at c and decodes it into out. +// Same read order as GetBlock (log → base) — the decoder +// fetches via GetBlock under the hood.
+func (l *Layered) Get(ctx context.Context, c cid.Cid, out any) error { + return l.cstSelf.Get(ctx, c, out) +} + +// GetBlock fetches a raw block: log (skipped when nil) → base. Only an ErrNotFound from the log falls through to base; any other log error is returned as-is. +func (l *Layered) GetBlock(ctx context.Context, c cid.Cid) (blk block.Block, retErr error) { + if l.log != nil { + b, err := l.log.Get(ctx, c) + if err == nil { + return b, nil + } + if !errors.Is(err, ErrNotFound) { + return nil, err + } + } + return l.base.GetBlock(ctx, c) +} + +// layeredAsBlockstore lifts Layered into a BaseStore for the +// CborStore wrapper. Internal-only — exists so the CBOR decoder +// reuses Layered's log → base fallthrough order rather than going +// around it. +type layeredAsBlockstore struct{ inner *Layered } + +func (a layeredAsBlockstore) Get(ctx context.Context, c cid.Cid) (block.Block, error) { + return a.inner.GetBlock(ctx, c) +} + +// Put is unused: Layered is read-only, but BaseStore (= cbor +// IpldBlockstore) requires it. The CBOR codec only ever invokes +// Get on this adapter, so this stays a no-op. +func (a layeredAsBlockstore) Put(_ context.Context, _ block.Block) error { return nil } + +// Compile-time assertion: Layered is the production ReadStore. +var _ ReadStore = (*Layered)(nil) diff --git a/pkg/ms3t/blockstore/log.go b/pkg/ms3t/blockstore/log.go new file mode 100644 index 0000000..b2f5c60 --- /dev/null +++ b/pkg/ms3t/blockstore/log.go @@ -0,0 +1,54 @@ +package blockstore + +import ( + "context" + + block "github.com/ipfs/go-block-format" + "github.com/ipfs/go-cid" +) + +// OpRoot ties a single batch of block writes to the bucket Root +// they collectively materialize. Every AppendBatch on a Log +// records exactly one OpRoot; the flush pipeline replays these +// to advance per-bucket forge_root_cid as segments ship. +type OpRoot struct { + // Bucket is the bucket whose Root this batch advances. + Bucket string + // Root is the new MST root the batch produces.
+ Root cid.Cid +} + +// BlockLoc points at a block's payload bytes inside a CAR file — +// the byte offset of the frame and the frame length. Logs +// populate one entry per block at append time; consumers (most +// notably the flush path that builds a ShardedDagIndexView) read +// the entries to avoid rescanning the file. +type BlockLoc struct { + Offset uint64 + Length uint64 +} + +// Log is the journaling tier — an append-only block store with +// three levels of durability: +// +// - Hot: the open segment. AppendBatch fsyncs the batch into +// the segment's CAR + ops sidecar before returning, so a +// successful AppendBatch is durable on local disk before any +// acked write becomes visible to clients. +// - Warm: sealed segments retained on local disk. Reads hit +// them via Get newest-first; Append never touches them. +// - Cold: segments flushed off-host (to Forge in production). +// Out of scope for the Log interface — the implementation +// manages the flush pipeline outside this contract. +// +// Get is the seam Layered uses to consult the journal before +// falling through to the network base — it returns ErrNotFound +// when no local segment holds the requested CID. Close drains +// the flush pipeline at process shutdown. +// +// Implemented by *logstore.Store. +type Log interface { + AppendBatch(ctx context.Context, blocks []block.Block, opRoot OpRoot) error + Get(ctx context.Context, c cid.Cid) (block.Block, error) + Close(ctx context.Context) error +} diff --git a/pkg/ms3t/blockstore/sqlite.go b/pkg/ms3t/blockstore/sqlite.go deleted file mode 100644 index a1ee74b..0000000 --- a/pkg/ms3t/blockstore/sqlite.go +++ /dev/null @@ -1,69 +0,0 @@ -// Package blockstore provides a SQLite-backed implementation of the -// go-ipld-cbor IpldBlockstore interface, used to persist MST nodes and -// object manifests as content-addressed blocks. 
-package blockstore - -import ( - "context" - "database/sql" - "errors" - "fmt" - - block "github.com/ipfs/go-block-format" - "github.com/ipfs/go-cid" - cbor "github.com/ipfs/go-ipld-cbor" -) - -// Schema is the DDL for the blocks table. Vanilla SQL so it works on SQLite, -// Postgres, etc. -const Schema = ` -CREATE TABLE IF NOT EXISTS blocks ( - cid BLOB PRIMARY KEY, - data BLOB NOT NULL -); -` - -// Store is a SQLite-backed IPLD blockstore. -type Store struct { - db *sql.DB -} - -// New wraps an open *sql.DB and ensures the schema exists. -func New(db *sql.DB) (*Store, error) { - if _, err := db.Exec(Schema); err != nil { - return nil, fmt.Errorf("blockstore: ensure schema: %w", err) - } - return &Store{db: db}, nil -} - -// Get fetches a block by CID. Returns ErrNotFound if absent. -func (s *Store) Get(ctx context.Context, c cid.Cid) (block.Block, error) { - var data []byte - err := s.db.QueryRowContext(ctx, - `SELECT data FROM blocks WHERE cid = ?`, - c.Bytes()).Scan(&data) - if errors.Is(err, sql.ErrNoRows) { - return nil, ErrNotFound - } - if err != nil { - return nil, fmt.Errorf("blockstore: get %s: %w", c, err) - } - return block.NewBlockWithCid(data, c) -} - -// Put writes a block. Idempotent: re-inserting the same CID is a no-op. -func (s *Store) Put(ctx context.Context, b block.Block) error { - _, err := s.db.ExecContext(ctx, - `INSERT INTO blocks (cid, data) VALUES (?, ?) ON CONFLICT (cid) DO NOTHING`, - b.Cid().Bytes(), b.RawData()) - if err != nil { - return fmt.Errorf("blockstore: put %s: %w", b.Cid(), err) - } - return nil -} - -// ErrNotFound is returned by Get when the requested CID is absent. -var ErrNotFound = errors.New("blockstore: block not found") - -// Compile-time assertion: *Store implements cbor.IpldBlockstore. 
-var _ cbor.IpldBlockstore = (*Store)(nil) diff --git a/pkg/ms3t/blockstore/staging.go b/pkg/ms3t/blockstore/staging.go new file mode 100644 index 0000000..7c350c9 --- /dev/null +++ b/pkg/ms3t/blockstore/staging.go @@ -0,0 +1,137 @@ +package blockstore + +import ( + "context" + "errors" + "fmt" + "sync" + + block "github.com/ipfs/go-block-format" + "github.com/ipfs/go-cid" +) + +// OpStaging is a per-S3-op IpldBlockstore that captures every Put — +// body chunks, MST nodes, ObjectManifests — in memory. On Commit it +// hands the entire ordered batch to the log store in one +// fsynced AppendBatch call, after which the new bucket Root may be +// safely advanced via the registry CAS. +// +// Reads check the in-memory buffer first and fall through to the +// underlying read store on miss. This lets MST.GetPointer recompute +// path Put a node and immediately re-Read it during the same op. +// +// Single-shot per session: create at the start of an S3 op, Put any +// number of blocks, then call Commit(root) on success or Discard on +// failure. Failed ops never touch the log because nothing is written +// until Commit. +// +// TODO(perf): the in-memory `blocks` map bounds the transaction's +// memory footprint at the size of the entire payload until Commit. +// For a multi-GB PutObject this means the full body — every chunk, +// every MST node, the manifest — sits in process memory until the +// log accepts the batch. +// +// An alternative OpStaging implementation could spool to a temp +// file (CAR-shaped, with an in-memory cid → (offset, length) index +// for read-your-writes) instead of an unbounded map, capping the +// per-transaction footprint to roughly one chunk plus the index. +// The interface (Get / Put / Commit / Discard) does not need to +// change — only the storage backend behind these methods. 
+// +// Discard would unlink the temp file; Commit could hand the file +// off to Log.AppendBatch (or a future SubmitCAR-style entry point +// that takes the path directly) to avoid materializing the batch +// as a Go slice in the hot path. +type OpStaging struct { + underlying ReadStore + log Log + bucket string + + mu sync.RWMutex + // blocks holds every Put for the lifetime of the transaction. + // See the TODO(perf) on OpStaging — this is the field a + // file-backed implementation would replace. + blocks map[string]block.Block // keyed by string(cid.Bytes()) + order []cid.Cid +} + +// NewOpStaging constructs a per-op staging buffer. underlying is the +// read fallback (typically *Layered); log is the durable write +// target; bucket is the bucket whose root this op will advance. +func NewOpStaging(underlying ReadStore, log Log, bucket string) *OpStaging { + return &OpStaging{ + underlying: underlying, + log: log, + bucket: bucket, + blocks: map[string]block.Block{}, + } +} + +func (b *OpStaging) Get(ctx context.Context, c cid.Cid) (block.Block, error) { + b.mu.RLock() + blk, ok := b.blocks[string(c.Bytes())] + b.mu.RUnlock() + if ok { + return blk, nil + } + return b.underlying.GetBlock(ctx, c) +} + +func (b *OpStaging) Put(_ context.Context, blk block.Block) error { + b.mu.Lock() + defer b.mu.Unlock() + key := string(blk.Cid().Bytes()) + if _, exists := b.blocks[key]; !exists { + b.blocks[key] = blk + b.order = append(b.order, blk.Cid()) + } + return nil +} + +// Commit hands every staged block + (bucket, root) to the log in one +// AppendBatch. After Commit returns nil, the blocks AND the op-root +// are durable on disk; the caller may advance the bucket's published +// Root. +// +// An empty blocks slice is legal: an MST mutation can produce a +// new root that points at a node already materialized in a prior +// segment (e.g., trimTop after Delete unwraps to an existing +// subtree). 
The bucket Root still needs to advance, so AppendBatch +// is called with an empty payload and the OpRoot record alone. +func (b *OpStaging) Commit(ctx context.Context, root cid.Cid) error { + b.mu.Lock() + defer b.mu.Unlock() + + if !root.Defined() { + return errors.New("opstaging: commit with undefined root") + } + + blks := make([]block.Block, len(b.order)) + for i, c := range b.order { + blks[i] = b.blocks[string(c.Bytes())] + } + if err := b.log.AppendBatch(ctx, blks, OpRoot{Bucket: b.bucket, Root: root}); err != nil { + return fmt.Errorf("opstaging: append: %w", err) + } + + b.blocks = map[string]block.Block{} + b.order = nil + return nil +} + +// Discard drops any staged blocks without writing them. Use when the +// surrounding op has failed and the in-flight batch should be +// abandoned. +func (b *OpStaging) Discard() { + b.mu.Lock() + defer b.mu.Unlock() + b.blocks = map[string]block.Block{} + b.order = nil +} + +// OpStaging is passed to CborStore in bucketop.Tx construction, so +// it must satisfy BaseStore (the IPFS-standard Get/Put-on-blocks +// shape). The two halves are: Get → check in-memory map then fall +// through to the underlying ReadStore's GetBlock; Put → append to +// the in-memory map. +var _ BaseStore = (*OpStaging)(nil) diff --git a/pkg/ms3t/blockstore/staging_test.go b/pkg/ms3t/blockstore/staging_test.go new file mode 100644 index 0000000..5c99402 --- /dev/null +++ b/pkg/ms3t/blockstore/staging_test.go @@ -0,0 +1,211 @@ +package blockstore_test + +import ( + "context" + "errors" + "testing" + "time" + + block "github.com/ipfs/go-block-format" + "github.com/ipfs/go-cid" + "github.com/multiformats/go-multihash" + "go.uber.org/zap/zaptest" + + "github.com/storacha/sprue/pkg/ms3t/blockstore" + "github.com/storacha/sprue/pkg/ms3t/logstore" +) + +// In-memory Meta — minimal subset duplicated here to avoid pulling +// the logstore test fake out of its package. 
+type fakeMeta struct { + seq uint64 + rows map[uint64]*logstore.SegmentMeta + roots []blockstore.OpRoot +} + +func newFakeMeta() *fakeMeta { return &fakeMeta{rows: map[uint64]*logstore.SegmentMeta{}} } + +func (f *fakeMeta) NextSegmentSeq(_ context.Context) (uint64, error) { f.seq++; return f.seq, nil } +func (f *fakeMeta) InsertSegmentOpen(_ context.Context, seq uint64) error { + f.rows[seq] = &logstore.SegmentMeta{Seq: seq, State: logstore.StateOpen} + return nil +} +func (f *fakeMeta) MarkSegmentSealed(_ context.Context, seq uint64, sealedAt int64, sizeBytes int64, sha256 []byte, opRoots []blockstore.OpRoot) error { + r, ok := f.rows[seq] + if !ok || r.State != logstore.StateOpen { + return nil + } + r.State = logstore.StateSealed + r.OpRoots = append([]blockstore.OpRoot(nil), opRoots...) + f.roots = append(f.roots, opRoots...) + return nil +} +func (f *fakeMeta) MarkSegmentFlushed(_ context.Context, seq uint64, _ int64, _ []blockstore.OpRoot) error { + if r, ok := f.rows[seq]; ok { + r.State = logstore.StateFlushed + } + return nil +} +func (f *fakeMeta) DeleteSegment(_ context.Context, seq uint64) error { + delete(f.rows, seq) + return nil +} +func (f *fakeMeta) ListUnflushedSegments(_ context.Context) ([]logstore.SegmentMeta, error) { + var out []logstore.SegmentMeta + for _, r := range f.rows { + if r.State == logstore.StateOpen || r.State == logstore.StateSealed { + out = append(out, *r) + } + } + return out, nil +} +func (f *fakeMeta) RehydrateSegment(_ context.Context, m logstore.SegmentMeta) error { + cp := m + f.rows[m.Seq] = &cp + return nil +} + +// noopBase satisfies blockstore.BlockReader but always returns +// errUnknownBase so we can detect when a GetBlock falls through +// past the log layer. 
+type noopBase struct{} + +var errUnknownBase = errors.New("base: unknown") + +func (noopBase) GetBlock(_ context.Context, _ cid.Cid) (block.Block, error) { + return nil, errUnknownBase +} + +func makeBlock(t *testing.T, payload []byte) block.Block { + t.Helper() + mh, err := multihash.Sum(payload, multihash.SHA2_256, -1) + if err != nil { + t.Fatalf("mh: %v", err) + } + c := cid.NewCidV1(cid.Raw, mh) + blk, err := block.NewBlockWithCid(payload, c) + if err != nil { + t.Fatalf("blk: %v", err) + } + return blk +} + +func makeRoot(t *testing.T, name string) cid.Cid { + t.Helper() + mh, err := multihash.Sum([]byte("r:"+name), multihash.SHA2_256, -1) + if err != nil { + t.Fatalf("mh: %v", err) + } + return cid.NewCidV1(cid.DagCBOR, mh) +} + +func TestLayeredAndStagingHappyPath(t *testing.T) { + dir := t.TempDir() + meta := newFakeMeta() + logger := zaptest.NewLogger(t) + + log, err := logstore.Open(context.Background(), logstore.Config{ + Dir: dir, + Meta: meta, + SealBytes: 1 << 30, + SealAge: 1 * time.Hour, + Retain: 6, + Flush: func(ctx context.Context, seg *logstore.Segment) error { + return meta.MarkSegmentFlushed(ctx, seg.Seq(), time.Now().Unix(), seg.OpRoots()) + }, + Logger: logger, + }) + if err != nil { + t.Fatalf("logstore Open: %v", err) + } + t.Cleanup(func() { _ = log.Close(context.Background()) }) + + bs := blockstore.NewLayered(log, noopBase{}) + + // Stage two blocks for bucket "alpha", commit, then Get them back + // via the layered store. 
+ stage := blockstore.NewOpStaging(bs, log, "alpha") + a := makeBlock(t, []byte("alpha-1")) + b := makeBlock(t, []byte("alpha-2")) + for _, blk := range []block.Block{a, b} { + if err := stage.Put(context.Background(), blk); err != nil { + t.Fatalf("stage.Put: %v", err) + } + } + if err := stage.Commit(context.Background(), makeRoot(t, "alpha")); err != nil { + t.Fatalf("Commit: %v", err) + } + + for _, blk := range []block.Block{a, b} { + got, err := bs.GetBlock(context.Background(), blk.Cid()) + if err != nil { + t.Fatalf("layered.Get %s: %v", blk.Cid(), err) + } + if string(got.RawData()) != string(blk.RawData()) { + t.Fatalf("layered.Get %s mismatch: got %q want %q", blk.Cid(), got.RawData(), blk.RawData()) + } + } +} + +func TestLayeredFallsThroughToBaseOnMiss(t *testing.T) { + dir := t.TempDir() + meta := newFakeMeta() + logger := zaptest.NewLogger(t) + + log, err := logstore.Open(context.Background(), logstore.Config{ + Dir: dir, + Meta: meta, + SealBytes: 1 << 30, + SealAge: 1 * time.Hour, + Retain: 6, + Flush: func(ctx context.Context, seg *logstore.Segment) error { + return meta.MarkSegmentFlushed(ctx, seg.Seq(), time.Now().Unix(), seg.OpRoots()) + }, + Logger: logger, + }) + if err != nil { + t.Fatalf("logstore Open: %v", err) + } + t.Cleanup(func() { _ = log.Close(context.Background()) }) + + bs := blockstore.NewLayered(log, noopBase{}) + missing := makeBlock(t, []byte("nope")).Cid() + _, err = bs.GetBlock(context.Background(), missing) + if !errors.Is(err, errUnknownBase) { + t.Fatalf("expected base sentinel, got %v", err) + } +} + +func TestStagingDiscardLeavesLogUntouched(t *testing.T) { + dir := t.TempDir() + meta := newFakeMeta() + logger := zaptest.NewLogger(t) + + log, err := logstore.Open(context.Background(), logstore.Config{ + Dir: dir, + Meta: meta, + SealBytes: 1 << 30, + SealAge: 1 * time.Hour, + Retain: 6, + Flush: func(ctx context.Context, seg *logstore.Segment) error { + return meta.MarkSegmentFlushed(ctx, seg.Seq(), time.Now().Unix(), 
seg.OpRoots()) + }, + Logger: logger, + }) + if err != nil { + t.Fatalf("logstore Open: %v", err) + } + t.Cleanup(func() { _ = log.Close(context.Background()) }) + + bs := blockstore.NewLayered(log, noopBase{}) + stage := blockstore.NewOpStaging(bs, log, "alpha") + blk := makeBlock(t, []byte("never-committed")) + if err := stage.Put(context.Background(), blk); err != nil { + t.Fatalf("Put: %v", err) + } + stage.Discard() + + if _, err := log.Get(context.Background(), blk.Cid()); !errors.Is(err, blockstore.ErrNotFound) { + t.Fatalf("Discard should leave log empty, got %v", err) + } +} diff --git a/pkg/ms3t/blockstore/store.go b/pkg/ms3t/blockstore/store.go new file mode 100644 index 0000000..5cfcd06 --- /dev/null +++ b/pkg/ms3t/blockstore/store.go @@ -0,0 +1,123 @@ +// Package blockstore is the home for ms3t's block I/O abstractions. +// It declares the contracts (Reader, Writer, Store, BlockReader, +// BlockWriter, ReadStore, BaseStore, Log) and the in-process +// implementations of the read tier (Layered), the transactional +// tier (OpStaging), and the network base (Forge). The on-disk LSM +// implementation of Log lives in pkg/ms3t/logstore. +// +// Tiered architecture: +// +// WRITE PATH +// client → OpStaging → (Commit) → Log → (Flush) → BaseStore (Forge) +// ↑ ↑ +// buffered until Commit; hot (open) + +// reads see own writes warm (sealed local) + +// cold (off-host) +// +// READ PATH +// client → Layered (cache → Log → BaseStore) +// +// Conventions: +// +// - Reader / Writer / Store: CBOR-typed I/O, mirroring the shape +// of cbor.IpldStore. Method names are Get / Put. +// - BlockReader / BlockWriter: raw-block I/O. Method names are +// GetBlock / PutBlock so a single type can expose both halves +// without method-name collision against the CBOR-typed Get/Put. +// - ReadStore = Reader + BlockReader: the read seam s3frontend +// drives. Layered is the production implementation. +// - Log: the journaling tier — see log.go. 
+// - BaseStore: alias for cbor.IpldBlockstore. This tier +// keeps the IPFS-standard naming convention so anything +// implementing the cbor IpldBlockstore interface (e.g. +// third-party IPLD blockstores) drops in without an adapter. Forge is read-only and is consumed as a BlockReader instead. +// - OpStaging: per-transaction store. Get/Put buffer in memory; +// Commit hands the entire batch to a Log via AppendBatch and +// returns once the journal has fsynced; Discard rolls back. +// +// CborStore is the helper that wraps a BaseStore into a Store with +// the multihash fixed to SHA2_256, so encoded blocks address-equal +// across the codebase regardless of where in the layer stack they +// were materialized. +package blockstore + +import ( + "context" + + block "github.com/ipfs/go-block-format" + "github.com/ipfs/go-cid" + cbor "github.com/ipfs/go-ipld-cbor" + mh "github.com/multiformats/go-multihash" +) + +// Reader fetches a CBOR-encoded value at c into out. Same shape as +// cbor.IpldStore.Get; mst.LoadMST and any code path that walks the +// MST without materializing it accept a Reader. +type Reader interface { + Get(ctx context.Context, c cid.Cid, out any) error +} + +// Writer writes a CBOR-encoded value, returning its CID. Same shape +// as cbor.IpldStore.Put. +type Writer interface { + Put(ctx context.Context, v any) (cid.Cid, error) +} + +// Store is Reader + Writer — the CBOR-typed I/O surface (manifests, +// MST nodes). Equivalent in shape to cbor.IpldStore but defined +// here so consumers don't have to import cbor. +type Store interface { + Reader + Writer +} + +// BlockReader fetches a raw block. Used by chunker.OpenBody for +// streaming body chunks. Same shape as cbor.IpldBlockstore.Get but +// renamed to GetBlock so a single type can expose both a CBOR-typed +// Get (Reader) and a raw-block GetBlock without method-name +// collision. +type BlockReader interface { + GetBlock(ctx context.Context, c cid.Cid) (block.Block, error) +} + +// BlockWriter writes a raw block.
Used by chunker.PutBody for body +// chunks. Same shape as cbor.IpldBlockstore.Put but renamed to +// PutBlock for the same reason as BlockReader. +type BlockWriter interface { + PutBlock(ctx context.Context, blk block.Block) error +} + +// ReadStore is the read-only seam the s3frontend.Backend uses for +// both CBOR-decoded reads (manifest, MST nodes) and raw block reads +// (body chunks). Layered is the production implementation. +type ReadStore interface { + Reader + BlockReader +} + +// WriteStore is the write seam a body codec uses: CBOR-typed Put +// (for format-specific index blocks) plus raw-block PutBlock (for +// chunk bytes). bucketop.Tx satisfies it. +type WriteStore interface { + Writer + BlockWriter +} + +// BaseStore is the raw-block interface in the IPFS-standard shape. Aliased to +// cbor.IpldBlockstore so anything implementing the IPFS-standard +// convention (e.g. third-party IPLD blockstores) drops in +// without an adapter. In this package OpStaging and Layered's internal adapter satisfy it; Forge is read-only and implements BlockReader instead. ms3t's higher-layer interfaces (BlockReader, +// BlockWriter, Store) use the GetBlock / PutBlock / typed Get / Put +// naming convention; only this layer and CborStore work in the IPFS +// convention. +type BaseStore = cbor.IpldBlockstore + +// CborStore wraps a BaseStore in a Store, fixing the multihash to +// SHA2_256 so encoded blocks address-equal across the codebase. +// Used by Layered to expose itself as a CBOR view, and by bucketop +// to wrap an OpStaging into the per-tx CBOR view.
+func CborStore(bs BaseStore) Store { + cst := cbor.NewCborStore(bs) + cst.DefaultMultihash = mh.SHA2_256 + return cst +} diff --git a/pkg/ms3t/blockstore/walk.go b/pkg/ms3t/blockstore/walk.go deleted file mode 100644 index 2d42fbf..0000000 --- a/pkg/ms3t/blockstore/walk.go +++ /dev/null @@ -1,63 +0,0 @@ -package blockstore - -import ( - "bytes" - "context" - "fmt" - - block "github.com/ipfs/go-block-format" - "github.com/ipfs/go-cid" - cbor "github.com/ipfs/go-ipld-cbor" - cbg "github.com/whyrusleeping/cbor-gen" -) - -// WalkReachable returns every block reachable from root in the given -// blockstore via IPLD links. DAG-CBOR blocks are scanned for child CIDs; -// raw blocks are leaves. -// -// Cycles are detected and not revisited. Block order is BFS by -// discovery — useful for CAR-friendly streaming (root first, then its -// direct children, etc.). -// -// Used by recovery: walking from a bucket's HEAD collects every -// structural block plus every body chunk reachable from any current -// ObjectManifest, which is exactly the set we want to ship to Forge. -func WalkReachable(ctx context.Context, bs cbor.IpldBlockstore, root cid.Cid) ([]block.Block, error) { - if !root.Defined() { - return nil, nil - } - - visited := map[cid.Cid]struct{}{} - var out []block.Block - queue := []cid.Cid{root} - - for len(queue) > 0 { - c := queue[0] - queue = queue[1:] - if _, seen := visited[c]; seen { - continue - } - visited[c] = struct{}{} - - blk, err := bs.Get(ctx, c) - if err != nil { - return nil, fmt.Errorf("walk %s: %w", c, err) - } - out = append(out, blk) - - // Only DAG-CBOR blocks have IPLD links to follow. Raw blocks - // (codec 0x55) are body chunks — leaves of the DAG. 
- if c.Prefix().Codec != cid.DagCBOR { - continue - } - err = cbg.ScanForLinks(bytes.NewReader(blk.RawData()), func(child cid.Cid) { - if _, seen := visited[child]; !seen { - queue = append(queue, child) - } - }) - if err != nil { - return nil, fmt.Errorf("scan links %s: %w", c, err) - } - } - return out, nil -} diff --git a/pkg/ms3t/bucket/bucket.go b/pkg/ms3t/bucket/bucket.go deleted file mode 100644 index 513545e..0000000 --- a/pkg/ms3t/bucket/bucket.go +++ /dev/null @@ -1,518 +0,0 @@ -// Package bucket implements S3-style CRUD operations on top of the forked -// MST (one tree per bucket) and an IPLD blockstore that holds both the -// structural blocks (MST nodes, ObjectManifests) and the raw body chunks. -package bucket - -import ( - "context" - "errors" - "fmt" - "io" - "strings" - "sync" - "time" - - "github.com/ipfs/go-cid" - cbor "github.com/ipfs/go-ipld-cbor" - - "github.com/storacha/sprue/pkg/ms3t/blockstore" - "github.com/storacha/sprue/pkg/ms3t/mst" - "github.com/storacha/sprue/pkg/ms3t/registry" - "github.com/storacha/sprue/pkg/ms3t/uploader" -) - -// Service is the entry point for bucket operations. -type Service struct { - bs cbor.IpldBlockstore - cst cbor.IpldStore // long-lived, read-only over bs (for List/Head) - reg registry.Registry - uploader uploader.Uploader - - chunkSize int64 - - mu sync.Mutex - locks map[string]*sync.Mutex -} - -// Options configures a Service. Zero-valued options take sensible defaults. -type Options struct { - // ChunkSize is the body chunk size for new objects. 0 → DefaultChunkSize. - ChunkSize int64 - // Uploader receives a CAR per mutation (PutObject, DeleteObject) - // containing the structural blocks created by that op. nil → - // uploader.Noop (CARs are dropped on the floor). - Uploader uploader.Uploader -} - -// New wires the dependencies into a Service. 
The blockstore can be -// any cbor.IpldBlockstore implementation; sprue's fx wiring chooses -// either a SQLite-backed local store (default) or a Forge-backed -// pass-through store (no_cache mode). -func New(bs cbor.IpldBlockstore, reg registry.Registry, opt Options) *Service { - cs := opt.ChunkSize - if cs <= 0 { - cs = DefaultChunkSize - } - up := opt.Uploader - if up == nil { - up = uploader.Noop{} - } - return &Service{ - bs: bs, - cst: mst.CborStore(bs), - reg: reg, - uploader: up, - chunkSize: cs, - locks: map[string]*sync.Mutex{}, - } -} - -func (s *Service) bucketLock(name string) *sync.Mutex { - s.mu.Lock() - defer s.mu.Unlock() - if m, ok := s.locks[name]; ok { - return m - } - m := &sync.Mutex{} - s.locks[name] = m - return m -} - -var ( - ErrBucketExists = registry.ErrExists - ErrBucketNotFound = registry.ErrNotFound - ErrObjectNotFound = errors.New("bucket: object not found") - ErrInvalidKey = errors.New("bucket: invalid object key") - ErrInvalidBucket = errors.New("bucket: invalid bucket name") - ErrBucketNotEmpty = errors.New("bucket: bucket not empty") - ErrInvalidRange = errors.New("bucket: invalid range") -) - -// === Bucket lifecycle === - -func (s *Service) CreateBucket(ctx context.Context, name string) error { - if !validBucketName(name) { - return ErrInvalidBucket - } - return s.reg.Create(ctx, name, time.Now().Unix()) -} - -func (s *Service) ListBuckets(ctx context.Context) ([]*registry.State, error) { - return s.reg.List(ctx) -} - -// === Forge sync (recovery + shutdown) === - -// Recover ensures every bucket's current root has been shipped to the -// Uploader. For each bucket, if the persisted ForgeRoot does not equal -// the current Root, walks the entire DAG reachable from Root, submits -// the blocks, flushes the Uploader, and advances ForgeRoot. -// -// Intended to be called once at startup, before the HTTP listener -// begins serving. Idempotent: if everything is already in sync, it's -// a fast scan and a no-op flush. 
-func (s *Service) Recover(ctx context.Context) error { - states, err := s.reg.List(ctx) - if err != nil { - return fmt.Errorf("recover: list buckets: %w", err) - } - - var dirty []*registry.State - for _, st := range states { - if !st.Root.Defined() { - continue - } - if st.ForgeRoot.Defined() && st.ForgeRoot.Equals(st.Root) { - continue - } - blocks, err := blockstore.WalkReachable(ctx, s.bs, st.Root) - if err != nil { - return fmt.Errorf("recover %q: walk: %w", st.Name, err) - } - if len(blocks) == 0 { - continue - } - if err := s.uploader.Submit(ctx, []cid.Cid{st.Root}, blocks); err != nil { - return fmt.Errorf("recover %q: submit: %w", st.Name, err) - } - dirty = append(dirty, st) - } - - if len(dirty) == 0 { - return nil - } - if err := s.uploader.Flush(ctx); err != nil { - return fmt.Errorf("recover: flush: %w", err) - } - for _, st := range dirty { - if err := s.reg.SetForgeRoot(ctx, st.Name, st.Root); err != nil { - return fmt.Errorf("recover %q: set forge root: %w", st.Name, err) - } - } - return nil -} - -// Shutdown cleanly drains the Uploader and advances ForgeRoot to the -// current Root for every bucket. After Shutdown returns successfully, -// a subsequent Recover at the next startup is a no-op. 
-func (s *Service) Shutdown(ctx context.Context) error { - if err := s.uploader.Close(ctx); err != nil { - return fmt.Errorf("shutdown: close uploader: %w", err) - } - states, err := s.reg.List(ctx) - if err != nil { - return fmt.Errorf("shutdown: list buckets: %w", err) - } - for _, st := range states { - if !st.Root.Defined() { - continue - } - if st.ForgeRoot.Defined() && st.ForgeRoot.Equals(st.Root) { - continue - } - if err := s.reg.SetForgeRoot(ctx, st.Name, st.Root); err != nil { - return fmt.Errorf("shutdown %q: set forge root: %w", st.Name, err) - } - } - return nil -} - -func (s *Service) DeleteBucket(ctx context.Context, name string) error { - lock := s.bucketLock(name) - lock.Lock() - defer lock.Unlock() - - st, err := s.reg.Get(ctx, name) - if err != nil { - return err - } - if st.Root.Defined() { - t := mst.LoadMST(s.cst, st.Root) - var seen bool - walkErr := t.WalkLeavesFromNocache(ctx, "", func(string, cid.Cid) error { - seen = true - return mst.ErrStopWalk - }) - if walkErr != nil { - return fmt.Errorf("bucket: scan empty: %w", walkErr) - } - if seen { - return ErrBucketNotEmpty - } - } - return s.reg.Delete(ctx, name) -} - -// === Object operations === - -// PutObject stores body under (bucket, key), creating or replacing as -// needed. Body bytes are chunked into raw IPLD blocks (written directly -// to the underlying blockstore). Structural blocks (manifest + mutated -// MST nodes) are captured into a per-op CARBuffer and emitted as a -// single CAR via the configured Uploader at commit time. -func (s *Service) PutObject(ctx context.Context, bucket, key string, body io.Reader, contentType string) (*ObjectManifest, error) { - if !mst.IsValidKey(key) { - return nil, ErrInvalidKey - } - - lock := s.bucketLock(bucket) - lock.Lock() - defer lock.Unlock() - - st, err := s.reg.Get(ctx, bucket) - if err != nil { - return nil, err - } - - // All blocks for this PUT — body chunks, manifest, MST mutation path — - // flow through the same per-op CARBuffer. 
The Uploader receives one - // Submit per S3 op containing every block reachable from the new root, - // individually addressable inside the resulting CAR via the indexer. - buf := blockstore.NewCARBuffer(s.bs, s.uploader) - - bodyRec, err := putBody(ctx, buf, body, s.chunkSize) - if err != nil { - buf.Discard() - return nil, fmt.Errorf("bucket: chunk body: %w", err) - } - - if contentType == "" { - contentType = "application/octet-stream" - } - mf := &ObjectManifest{ - Key: key, - ContentType: contentType, - Created: time.Now().Unix(), - Body: bodyRec, - } - - opCst := mst.CborStore(buf) - - mfCid, err := opCst.Put(ctx, mf) - if err != nil { - buf.Discard() - return nil, fmt.Errorf("bucket: manifest put: %w", err) - } - - t := loadOrEmpty(opCst, st.Root) - t2, err := t.Add(ctx, key, mfCid, -1) - if errors.Is(err, mst.ErrAlreadyExists) { - t2, err = t.Update(ctx, key, mfCid) - } - if err != nil { - buf.Discard() - return nil, fmt.Errorf("bucket: mst write: %w", err) - } - - newRoot, err := t2.GetPointer(ctx) - if err != nil { - buf.Discard() - return nil, fmt.Errorf("bucket: mst pointer: %w", err) - } - - if err := buf.Commit(ctx, newRoot); err != nil { - return nil, fmt.Errorf("bucket: car commit: %w", err) - } - - if err := s.reg.CASRoot(ctx, bucket, st.Root, newRoot); err != nil { - return nil, fmt.Errorf("bucket: advance root: %w", err) - } - return mf, nil -} - -// GetObject opens the body and returns the manifest. Caller must Close. -// If rng is non-nil, returns a reader over the requested byte range. 
-func (s *Service) GetObject(ctx context.Context, bucket, key string, rng *Range) (io.ReadCloser, *ObjectManifest, error) { - mf, err := s.HeadObject(ctx, bucket, key) - if err != nil { - return nil, nil, err - } - if rng == nil { - return openBody(ctx, s.bs, mf.Body), mf, nil - } - if err := rng.resolve(mf.Body.Size); err != nil { - return nil, mf, err - } - return openBodyRange(ctx, s.bs, mf.Body, rng.Start, rng.End), mf, nil -} - -// HeadObject returns just the manifest. -func (s *Service) HeadObject(ctx context.Context, bucket, key string) (*ObjectManifest, error) { - st, err := s.reg.Get(ctx, bucket) - if err != nil { - return nil, err - } - if !st.Root.Defined() { - return nil, ErrObjectNotFound - } - t := mst.LoadMST(s.cst, st.Root) - mfCid, err := t.Get(ctx, key) - if errors.Is(err, mst.ErrNotFound) { - return nil, ErrObjectNotFound - } - if err != nil { - return nil, fmt.Errorf("bucket: mst get: %w", err) - } - var mf ObjectManifest - if err := s.cst.Get(ctx, mfCid, &mf); err != nil { - return nil, fmt.Errorf("bucket: manifest get: %w", err) - } - return &mf, nil -} - -// DeleteObject removes a key from the bucket. Missing keys return nil -// (matching S3's idempotent DELETE semantics). Body chunks are NOT -// deleted from the blockstore; GC is a future, separate pass over live -// manifests. 
-func (s *Service) DeleteObject(ctx context.Context, bucket, key string) error { - lock := s.bucketLock(bucket) - lock.Lock() - defer lock.Unlock() - - st, err := s.reg.Get(ctx, bucket) - if err != nil { - return err - } - if !st.Root.Defined() { - return nil - } - - buf := blockstore.NewCARBuffer(s.bs, s.uploader) - opCst := mst.CborStore(buf) - - t := mst.LoadMST(opCst, st.Root) - t2, err := t.Delete(ctx, key) - if errors.Is(err, mst.ErrNotFound) { - buf.Discard() - return nil - } - if err != nil { - buf.Discard() - return fmt.Errorf("bucket: mst delete: %w", err) - } - - newRoot, err := t2.GetPointer(ctx) - if err != nil { - buf.Discard() - return fmt.Errorf("bucket: mst pointer: %w", err) - } - if err := buf.Commit(ctx, newRoot); err != nil { - return fmt.Errorf("bucket: car commit: %w", err) - } - if err := s.reg.CASRoot(ctx, bucket, st.Root, newRoot); err != nil { - return fmt.Errorf("bucket: advance root: %w", err) - } - return nil -} - -// === Range support === - -// Range describes an inclusive byte range, matching HTTP Range semantics. -// -// To support the open-ended ("bytes=N-") and suffix ("bytes=-N") forms -// without forcing the HTTP layer to do a separate HEAD before the GET, -// callers may set sentinel values: -// - Start = -1 means "the last End bytes" (suffix form) -// - End = -1 means "from Start to the end of the object" -// -// resolve() is called by the service once the body size is known. -type Range struct { - Start int64 - End int64 -} - -// resolve fills in any sentinel values (-1) using size and validates the -// resulting range. Returns ErrInvalidRange if the range is unsatisfiable. 
-func (r *Range) resolve(size int64) error { - if size <= 0 { - return ErrInvalidRange - } - switch { - case r.Start < 0 && r.End >= 0: - // suffix: last End bytes - if r.End == 0 { - return ErrInvalidRange - } - if r.End > size { - r.End = size - } - r.Start = size - r.End - r.End = size - 1 - case r.Start >= 0 && r.End < 0: - // open-ended - r.End = size - 1 - } - if r.Start < 0 || r.End < r.Start || r.End >= size { - return ErrInvalidRange - } - return nil -} - -// === Listing === - -type ListResult struct { - Objects []*ObjectManifest - CommonPrefixes []string - Truncated bool - NextToken string -} - -type ListOptions struct { - Prefix string - Delimiter string - StartAfter string - MaxKeys int -} - -const defaultMaxKeys = 1000 - -func (s *Service) List(ctx context.Context, bucket string, opt ListOptions) (*ListResult, error) { - if opt.MaxKeys <= 0 { - opt.MaxKeys = defaultMaxKeys - } - - st, err := s.reg.Get(ctx, bucket) - if err != nil { - return nil, err - } - res := &ListResult{} - if !st.Root.Defined() { - return res, nil - } - - t := mst.LoadMST(s.cst, st.Root) - - from := opt.Prefix - if opt.StartAfter != "" && opt.StartAfter > from { - from = opt.StartAfter + "\x01" - } - - seenPrefix := map[string]struct{}{} - walkErr := t.WalkLeavesFromNocache(ctx, from, func(k string, mfCid cid.Cid) error { - if opt.Prefix != "" && !strings.HasPrefix(k, opt.Prefix) { - return mst.ErrStopWalk - } - - if opt.Delimiter != "" { - tail := k[len(opt.Prefix):] - if i := strings.Index(tail, opt.Delimiter); i >= 0 { - cp := opt.Prefix + tail[:i+len(opt.Delimiter)] - if _, dup := seenPrefix[cp]; !dup { - seenPrefix[cp] = struct{}{} - res.CommonPrefixes = append(res.CommonPrefixes, cp) - if len(res.Objects)+len(res.CommonPrefixes) >= opt.MaxKeys { - res.Truncated = true - res.NextToken = cp - return mst.ErrStopWalk - } - } - return nil - } - } - - var mf ObjectManifest - if err := s.cst.Get(ctx, mfCid, &mf); err != nil { - return fmt.Errorf("manifest get %s: %w", mfCid, err) - 
} - res.Objects = append(res.Objects, &mf) - - if len(res.Objects)+len(res.CommonPrefixes) >= opt.MaxKeys { - res.Truncated = true - res.NextToken = k - return mst.ErrStopWalk - } - return nil - }) - if walkErr != nil { - return nil, fmt.Errorf("bucket: walk: %w", walkErr) - } - return res, nil -} - -// === Internal helpers === - -func loadOrEmpty(cst cbor.IpldStore, root cid.Cid) *mst.MerkleSearchTree { - if root.Defined() { - return mst.LoadMST(cst, root) - } - return mst.NewEmptyMST(cst) -} - -func validBucketName(s string) bool { - if len(s) < 3 || len(s) > 63 { - return false - } - for i, r := range s { - switch { - case r >= 'a' && r <= 'z': - case r >= '0' && r <= '9': - case r == '-' || r == '.': - if i == 0 { - return false - } - default: - return false - } - } - return true -} diff --git a/pkg/ms3t/bucket/cbor_gen.go b/pkg/ms3t/bucket/cbor_gen.go index 3b35d4f..ffbf8a6 100644 --- a/pkg/ms3t/bucket/cbor_gen.go +++ b/pkg/ms3t/bucket/cbor_gen.go @@ -238,7 +238,7 @@ func (t *Body) MarshalCBOR(w io.Writer) error { return err } - // t.Chunks ([]cid.Cid) (slice) + // t.Content (cid.Cid) (struct) if len("c") > 1000000 { return xerrors.Errorf("Value in field \"c\" was too long") } @@ -250,19 +250,31 @@ func (t *Body) MarshalCBOR(w io.Writer) error { return err } - if len(t.Chunks) > 8192 { - return xerrors.Errorf("Slice value in field t.Chunks was too long") + if err := cbg.WriteCid(cw, t.Content); err != nil { + return xerrors.Errorf("failed to write cid field t.Content: %w", err) } - if err := cw.WriteMajorTypeHeader(cbg.MajArray, uint64(len(t.Chunks))); err != nil { + // t.Format (string) (string) + if len("f") > 1000000 { + return xerrors.Errorf("Value in field \"f\" was too long") + } + + if err := cw.WriteMajorTypeHeader(cbg.MajTextString, uint64(len("f"))); err != nil { + return err + } + if _, err := cw.WriteString(string("f")); err != nil { return err } - for _, v := range t.Chunks { - if err := cbg.WriteCid(cw, v); err != nil { - return 
xerrors.Errorf("failed to write cid field v: %w", err) - } + if len(t.Format) > 1000000 { + return xerrors.Errorf("Value in field t.Format was too long") + } + if err := cw.WriteMajorTypeHeader(cbg.MajTextString, uint64(len(t.Format))); err != nil { + return err + } + if _, err := cw.WriteString(string(t.Format)); err != nil { + return err } // t.SHA256 ([]uint8) (slice) @@ -311,6 +323,173 @@ func (t *Body) MarshalCBOR(w io.Writer) error { } } + return nil +} + +func (t *Body) UnmarshalCBOR(r io.Reader) (err error) { + *t = Body{} + + cr := cbg.NewCborReader(r) + + maj, extra, err := cr.ReadHeader() + if err != nil { + return err + } + defer func() { + if err == io.EOF { + err = io.ErrUnexpectedEOF + } + }() + + if maj != cbg.MajMap { + return fmt.Errorf("cbor input should be of type map") + } + + if extra > cbg.MaxLength { + return fmt.Errorf("Body: map struct too large (%d)", extra) + } + + n := extra + + nameBuf := make([]byte, 1) + for i := uint64(0); i < n; i++ { + nameLen, ok, err := cbg.ReadFullStringIntoBuf(cr, nameBuf, 1000000) + if err != nil { + return err + } + + if !ok { + // Field doesn't exist on this type, so ignore it + if err := cbg.ScanForLinks(cr, func(cid.Cid) {}); err != nil { + return err + } + continue + } + + switch string(nameBuf[:nameLen]) { + // t.Content (cid.Cid) (struct) + case "c": + + { + + c, err := cbg.ReadCid(cr) + if err != nil { + return xerrors.Errorf("failed to read cid field t.Content: %w", err) + } + + t.Content = c + + } + // t.Format (string) (string) + case "f": + + { + sval, err := cbg.ReadStringWithMax(cr, 1000000) + if err != nil { + return err + } + + t.Format = string(sval) + } + // t.SHA256 ([]uint8) (slice) + case "h": + + maj, extra, err = cr.ReadHeader() + if err != nil { + return err + } + + if extra > 2097152 { + return fmt.Errorf("t.SHA256: byte array too large (%d)", extra) + } + if maj != cbg.MajByteString { + return fmt.Errorf("expected byte array") + } + + if extra > 0 { + t.SHA256 = make([]uint8, extra) 
+ } + + if _, err := io.ReadFull(cr, t.SHA256); err != nil { + return err + } + + // t.Size (int64) (int64) + case "s": + { + maj, extra, err := cr.ReadHeader() + if err != nil { + return err + } + var extraI int64 + switch maj { + case cbg.MajUnsignedInt: + extraI = int64(extra) + if extraI < 0 { + return fmt.Errorf("int64 positive overflow") + } + case cbg.MajNegativeInt: + extraI = int64(extra) + if extraI < 0 { + return fmt.Errorf("int64 negative overflow") + } + extraI = -1 - extraI + default: + return fmt.Errorf("wrong type for int64 field: %d", maj) + } + + t.Size = int64(extraI) + } + + default: + // Field doesn't exist on this type, so ignore it + if err := cbg.ScanForLinks(r, func(cid.Cid) {}); err != nil { + return err + } + } + } + + return nil +} +func (t *FixedChunkerIndex) MarshalCBOR(w io.Writer) error { + if t == nil { + _, err := w.Write(cbg.CborNull) + return err + } + + cw := cbg.NewCborWriter(w) + + if _, err := cw.Write([]byte{162}); err != nil { + return err + } + + // t.Chunks ([]cid.Cid) (slice) + if len("c") > 1000000 { + return xerrors.Errorf("Value in field \"c\" was too long") + } + + if err := cw.WriteMajorTypeHeader(cbg.MajTextString, uint64(len("c"))); err != nil { + return err + } + if _, err := cw.WriteString(string("c")); err != nil { + return err + } + + if len(t.Chunks) > 8192 { + return xerrors.Errorf("Slice value in field t.Chunks was too long") + } + + if err := cw.WriteMajorTypeHeader(cbg.MajArray, uint64(len(t.Chunks))); err != nil { + return err + } + for _, v := range t.Chunks { + + if err := cbg.WriteCid(cw, v); err != nil { + return xerrors.Errorf("failed to write cid field v: %w", err) + } + + } + // t.ChunkSize (int64) (int64) if len("cs") > 1000000 { return xerrors.Errorf("Value in field \"cs\" was too long") @@ -336,8 +515,8 @@ func (t *Body) MarshalCBOR(w io.Writer) error { return nil } -func (t *Body) UnmarshalCBOR(r io.Reader) (err error) { - *t = Body{} +func (t *FixedChunkerIndex) UnmarshalCBOR(r io.Reader) 
(err error) { + *t = FixedChunkerIndex{} cr := cbg.NewCborReader(r) @@ -356,7 +535,7 @@ func (t *Body) UnmarshalCBOR(r io.Reader) (err error) { } if extra > cbg.MaxLength { - return fmt.Errorf("Body: map struct too large (%d)", extra) + return fmt.Errorf("FixedChunkerIndex: map struct too large (%d)", extra) } n := extra @@ -419,55 +598,6 @@ func (t *Body) UnmarshalCBOR(r io.Reader) (err error) { } } - // t.SHA256 ([]uint8) (slice) - case "h": - - maj, extra, err = cr.ReadHeader() - if err != nil { - return err - } - - if extra > 2097152 { - return fmt.Errorf("t.SHA256: byte array too large (%d)", extra) - } - if maj != cbg.MajByteString { - return fmt.Errorf("expected byte array") - } - - if extra > 0 { - t.SHA256 = make([]uint8, extra) - } - - if _, err := io.ReadFull(cr, t.SHA256); err != nil { - return err - } - - // t.Size (int64) (int64) - case "s": - { - maj, extra, err := cr.ReadHeader() - if err != nil { - return err - } - var extraI int64 - switch maj { - case cbg.MajUnsignedInt: - extraI = int64(extra) - if extraI < 0 { - return fmt.Errorf("int64 positive overflow") - } - case cbg.MajNegativeInt: - extraI = int64(extra) - if extraI < 0 { - return fmt.Errorf("int64 negative overflow") - } - extraI = -1 - extraI - default: - return fmt.Errorf("wrong type for int64 field: %d", maj) - } - - t.Size = int64(extraI) - } // t.ChunkSize (int64) (int64) case "cs": { diff --git a/pkg/ms3t/bucket/chunker.go b/pkg/ms3t/bucket/chunker.go index 25a096e..46ab0ef 100644 --- a/pkg/ms3t/bucket/chunker.go +++ b/pkg/ms3t/bucket/chunker.go @@ -9,8 +9,9 @@ import ( block "github.com/ipfs/go-block-format" "github.com/ipfs/go-cid" - cbor "github.com/ipfs/go-ipld-cbor" mh "github.com/multiformats/go-multihash" + + "github.com/storacha/sprue/pkg/ms3t/blockstore" ) // DefaultChunkSize is the chunk size used when callers don't supply one. 
@@ -28,11 +29,61 @@ var rawBlockPrefix = cid.Prefix{ MhLength: -1, } -// putBody reads body bytes from r, splits at chunkSize, writes each chunk -// as a raw IPLD block to bs, and returns a Body record. The body's full -// sha256 is computed once during chunking and stored on the Body for use -// as the ETag. -func putBody(ctx context.Context, bs cbor.IpldBlockstore, r io.Reader, chunkSize int64) (Body, error) { +// BodyWriter writes the bytes from r as a sequence of blocks (raw +// chunks plus whatever index/DAG blocks the codec needs) to w and +// returns a Body record describing how to reconstruct the bytes. +// +// w accepts both raw block writes (PutBlock for chunk bytes) and +// CBOR-typed writes (Put for format-specific index blocks). +// bucketop.Tx satisfies it. +type BodyWriter interface { + Chunk(ctx context.Context, w blockstore.WriteStore, r io.Reader) (Body, error) +} + +// BodyReader streams bytes back out of a Body. Format identifies +// the codec the writer produced; consumers route a Body to the +// matching BodyReader by that string. +// +// bs accepts both raw block reads (GetBlock for chunk bytes) and +// CBOR-typed reads (Get for index blocks). blockstore.Layered +// satisfies it. +type BodyReader interface { + // Format returns the Body.Format value this reader handles. + Format() string + // Open returns a stream over the full body. + Open(ctx context.Context, bs blockstore.ReadStore, body Body) io.ReadCloser + // OpenRange returns a stream over [start, end] inclusive. + OpenRange(ctx context.Context, bs blockstore.ReadStore, body Body, start, end int64) io.ReadCloser +} + +// BodyCodec is the canonical pair: a single concrete impl satisfies +// both halves so a Body produced by Chunk can always be read back +// via Open / OpenRange of the same codec instance. 
+type BodyCodec interface { + BodyWriter + BodyReader +} + +// FixedChunker is the default codec: fixed-size raw chunks indexed +// by a FixedChunkerIndex CBOR document at Body.Content. Implements +// BodyCodec. +type FixedChunker struct { + // ChunkSize is the body chunk size in bytes. 0 → DefaultChunkSize. + ChunkSize int64 +} + +// Compile-time assertion: FixedChunker is the canonical BodyCodec. +var _ BodyCodec = (*FixedChunker)(nil) + +// Format returns FormatFixed. +func (c *FixedChunker) Format() string { return FormatFixed } + +// Chunk reads body bytes from r, splits them at ChunkSize, writes +// each chunk as a raw block, then writes a FixedChunkerIndex CBOR +// block listing the chunks in order. The Body returned points +// Content at the index block. +func (c *FixedChunker) Chunk(ctx context.Context, w blockstore.WriteStore, r io.Reader) (Body, error) { + chunkSize := c.ChunkSize if chunkSize <= 0 { chunkSize = DefaultChunkSize } @@ -47,11 +98,11 @@ func putBody(ctx context.Context, bs cbor.IpldBlockstore, r io.Reader, chunkSize if n > 0 { chunk := buf[:n] bodyHasher.Write(chunk) - c, perr := putRawBlock(ctx, bs, chunk) + cidv, perr := putRawBlock(ctx, w, chunk) if perr != nil { return Body{}, fmt.Errorf("put chunk: %w", perr) } - chunks = append(chunks, c) + chunks = append(chunks, cidv) total += int64(n) } if err == nil { @@ -63,15 +114,40 @@ func putBody(ctx context.Context, bs cbor.IpldBlockstore, r io.Reader, chunkSize return Body{}, fmt.Errorf("read body: %w", err) } + idx := &FixedChunkerIndex{ChunkSize: chunkSize, Chunks: chunks} + indexCID, err := w.Put(ctx, idx) + if err != nil { + return Body{}, fmt.Errorf("put fixed index: %w", err) + } + return Body{ - Size: total, - ChunkSize: chunkSize, - Chunks: chunks, - SHA256: bodyHasher.Sum(nil), + Size: total, + SHA256: bodyHasher.Sum(nil), + Content: indexCID, + Format: FormatFixed, }, nil } -func putRawBlock(ctx context.Context, bs cbor.IpldBlockstore, data []byte) (cid.Cid, error) { +// Open 
returns a reader over the full body. +func (c *FixedChunker) Open(ctx context.Context, bs blockstore.ReadStore, body Body) io.ReadCloser { + return &fixedBodyReader{ctx: ctx, bs: bs, body: body, end: body.Size - 1} +} + +// OpenRange returns a reader over [start, end] inclusive of the +// body. Caller must ensure 0 <= start <= end <= Size-1. +func (c *FixedChunker) OpenRange(ctx context.Context, bs blockstore.ReadStore, body Body, start, end int64) io.ReadCloser { + return &fixedBodyReader{ + ctx: ctx, + bs: bs, + body: body, + start: start, + end: end, + needsSeek: true, + pos: start, + } +} + +func putRawBlock(ctx context.Context, w blockstore.BlockWriter, data []byte) (cid.Cid, error) { c, err := rawBlockPrefix.Sum(data) if err != nil { return cid.Undef, err @@ -80,56 +156,55 @@ func putRawBlock(ctx context.Context, bs cbor.IpldBlockstore, data []byte) (cid. if err != nil { return cid.Undef, err } - if err := bs.Put(ctx, blk); err != nil { + if err := w.PutBlock(ctx, blk); err != nil { return cid.Undef, err } return c, nil } -// openBody returns a reader over the full body. -func openBody(ctx context.Context, bs cbor.IpldBlockstore, body Body) io.ReadCloser { - return &bodyReader{ctx: ctx, bs: bs, body: body, end: body.Size - 1} -} - -// openBodyRange returns a reader over [start, end] inclusive of the body. -// Caller must ensure 0 <= start <= end <= Size-1. -func openBodyRange(ctx context.Context, bs cbor.IpldBlockstore, body Body, start, end int64) io.ReadCloser { - cs := body.ChunkSize - startChunk := int(start / cs) - startOffset := start % cs - return &bodyReader{ - ctx: ctx, - bs: bs, - body: body, - nextChunk: startChunk, - startOff: startOffset, - pos: start, - end: end, - havePartial: true, - } -} - -// bodyReader streams chunks lazily. It supports both whole-body and ranged -// reads via the same loop — only the initial offset and the inclusive end -// position differ. 
-type bodyReader struct { +// fixedBodyReader streams chunks lazily for FixedChunker bodies. It +// fetches the index block on first read, then walks chunks. Both +// whole-body and ranged reads use the same loop — only the initial +// offset and end position differ. +type fixedBodyReader struct { ctx context.Context - bs cbor.IpldBlockstore + bs blockstore.ReadStore body Body - nextChunk int - startOff int64 // offset into the first chunk we read - havePartial bool // whether startOff still applies to the next chunk read + // idx is fetched lazily on first Read. + idx *FixedChunkerIndex - cur []byte // currently materialized chunk bytes - curOff int // read position within cur + start int64 // first byte to return (0 for whole-body) + end int64 // last byte to return (inclusive) + pos int64 // current absolute byte position + needsSeek bool // whether we still owe an initial seek into the start chunk - pos int64 // current absolute byte position (next byte to return) - end int64 // last byte to return (inclusive) - err error + nextChunk int // index into idx.Chunks of the next block to fetch + cur []byte // currently materialized chunk bytes + curOff int // read position within cur + err error } -func (br *bodyReader) Read(p []byte) (int, error) { +func (br *fixedBodyReader) ensureIndex() error { + if br.idx != nil { + return nil + } + var idx FixedChunkerIndex + if err := br.bs.Get(br.ctx, br.body.Content, &idx); err != nil { + return fmt.Errorf("fetch fixed index %s: %w", br.body.Content, err) + } + br.idx = &idx + if br.needsSeek { + // The constructor for ranged reads stored the absolute start + // offset; translate it to (chunk index, in-chunk offset) now + // that we know ChunkSize. 
+ br.nextChunk = int(br.start / idx.ChunkSize) + br.curOff = int(br.start % idx.ChunkSize) + } + return nil +} + +func (br *fixedBodyReader) Read(p []byte) (int, error) { if br.err != nil { return 0, br.err } @@ -137,24 +212,29 @@ func (br *bodyReader) Read(p []byte) (int, error) { br.err = io.EOF return 0, io.EOF } + if err := br.ensureIndex(); err != nil { + br.err = err + return 0, err + } - if br.cur == nil || br.curOff >= len(br.cur) { - if br.nextChunk >= len(br.body.Chunks) { + if br.cur == nil || (br.curOff >= len(br.cur) && !br.needsSeek) { + if br.nextChunk >= len(br.idx.Chunks) { br.err = io.EOF return 0, io.EOF } - blk, err := br.bs.Get(br.ctx, br.body.Chunks[br.nextChunk]) + blk, err := br.bs.GetBlock(br.ctx, br.idx.Chunks[br.nextChunk]) if err != nil { br.err = fmt.Errorf("read chunk %d: %w", br.nextChunk, err) return 0, br.err } br.cur = blk.RawData() - br.curOff = 0 - br.nextChunk++ - if br.havePartial { - br.curOff = int(br.startOff) - br.havePartial = false + // On a ranged read the first chunk is partial — curOff was + // pre-set in ensureIndex; consume it here and clear the flag. + if !br.needsSeek { + br.curOff = 0 } + br.needsSeek = false + br.nextChunk++ } // Don't read past the inclusive end position. @@ -174,4 +254,4 @@ func (br *bodyReader) Read(p []byte) (int, error) { return n, nil } -func (br *bodyReader) Close() error { return nil } +func (br *fixedBodyReader) Close() error { return nil } diff --git a/pkg/ms3t/bucket/manifest.go b/pkg/ms3t/bucket/manifest.go index 27756f0..925ad7b 100644 --- a/pkg/ms3t/bucket/manifest.go +++ b/pkg/ms3t/bucket/manifest.go @@ -2,11 +2,11 @@ package bucket import "github.com/ipfs/go-cid" -// ObjectManifest is the per-object metadata record stored as a CBOR block -// in the IPLD blockstore. The MST leaf for an object key points at this -// record's CID. The body bytes themselves live as raw IPLD blocks (codec -// 0x55) addressed by sha256 multihash; this manifest holds the ordered -// list of chunk CIDs. 
+// ObjectManifest is the per-object metadata record stored as a CBOR +// block in the IPLD blockstore. The MST leaf for an object key +// points at this record's CID. Body identifies the body DAG; the +// shape of that DAG is determined by Body.Format and read back via +// the matching BodyCodec. type ObjectManifest struct { Key string `cborgen:"k"` ContentType string `cborgen:"ct"` @@ -14,13 +14,30 @@ type ObjectManifest struct { Body Body `cborgen:"b"` } -// Body describes how the object's bytes are split into content-addressed -// chunks. ChunkSize is fixed across the object's chunks; the last chunk -// may be shorter than ChunkSize. Range arithmetic is direct: byte N lives -// in chunk index N/ChunkSize at offset N%ChunkSize. +// Body identifies the bytes of an object via a CID and a format +// tag. Format routes the Body to the right BodyCodec implementation +// at read time; Content is the root of whatever block DAG that +// codec produced. Size and SHA256 are codec-agnostic — the total +// number of body bytes and the sha256 of the full body, respectively +// (the latter is the source for the S3 ETag wire format). type Body struct { - Size int64 `cborgen:"s"` + Size int64 `cborgen:"s"` + SHA256 []byte `cborgen:"h"` + Content cid.Cid `cborgen:"c"` + Format string `cborgen:"f"` +} + +// FormatFixed is the Body.Format value used by FixedChunker — a +// flat array of fixed-size raw blocks indexed by a FixedChunkerIndex +// CBOR document at Body.Content. +const FormatFixed = "fixed-v1" + +// FixedChunkerIndex is the body-DAG root for FormatFixed: an +// ordered list of chunk CIDs plus the per-chunk size. The reader +// fetches the index block from Body.Content, then streams the +// chunks. Range arithmetic is direct: byte N lives in chunk +// index N/ChunkSize at offset N%ChunkSize. 
+type FixedChunkerIndex struct { ChunkSize int64 `cborgen:"cs"` Chunks []cid.Cid `cborgen:"c"` - SHA256 []byte `cborgen:"h"` // full-body sha256, for ETag } diff --git a/pkg/ms3t/bucketop/bucketop.go b/pkg/ms3t/bucketop/bucketop.go new file mode 100644 index 0000000..fe0eec0 --- /dev/null +++ b/pkg/ms3t/bucketop/bucketop.go @@ -0,0 +1,340 @@ +// Package bucketop provides the per-bucket write-transaction +// primitive for ms3t. Each Tx snapshots the bucket's published Root +// from the registry, exposes a per-op staging buffer that +// write-throughs to the LSM log, and on Commit fsyncs the buffer +// into one log.AppendBatch and CAS-advances the bucket Root from +// the snapshotted value to the caller-supplied newRoot — atomically +// from the caller's perspective. +// +// The package owns the four-way wiring (registry, log, layered read +// tier, MST CBOR view) that S3 verb implementations would otherwise +// compose by hand for every PUT/DELETE. It also owns the per-bucket +// lock map so concurrent transactions against the same bucket +// serialize within a single process and the CAS in Commit always +// sees a fresh snapshot. +// +// Read paths bypass bucketop. They only need the read-side +// blockstore directly; tx-style ceremony would be pure overhead. +package bucketop + +import ( + "context" + "errors" + "fmt" + "strings" + "sync" + + block "github.com/ipfs/go-block-format" + "github.com/ipfs/go-cid" + + "github.com/storacha/sprue/pkg/ms3t/blockstore" + "github.com/storacha/sprue/pkg/ms3t/mst" + "github.com/storacha/sprue/pkg/ms3t/registry" +) + +// ErrBucketNotFound is returned by Begin when the bucket doesn't +// exist in the registry. Callers map this to NoSuchBucket at the +// protocol layer. +var ErrBucketNotFound = errors.New("bucketop: bucket not found") + +// Deps wires the Coordinator to its three dependencies. 
Every field +// is an interface so tests can supply in-memory equivalents without +// standing up Postgres, an on-disk log, or a network blockstore. +type Deps struct { + // Reg tracks per-bucket Root. Begin reads State; Commit + // CAS-advances Root from the snapshot to newRoot. + Reg registry.Registry + + // Log is the durability boundary. Tx.Commit calls + // log.AppendBatch with the per-tx blocks plus an op-root + // record of (bucket, newRoot). + Log blockstore.Log + + // Reads is the read tier the staging buffer falls through to + // on miss during the transaction. + Reads blockstore.ReadStore +} + +// Coordinator manages per-bucket transactions. One per ms3t backend. +// Its job is to hand out Tx instances that share the same Deps, +// serialize concurrent transactions per bucket, and own the log's +// shutdown. +type Coordinator struct { + deps Deps + + mu sync.Mutex + locks map[string]*sync.Mutex +} + +// NewCoordinator returns a Coordinator wired to the given deps. +func NewCoordinator(deps Deps) *Coordinator { + return &Coordinator{ + deps: deps, + locks: map[string]*sync.Mutex{}, + } +} + +// Begin starts a write transaction against bucket. Steps: +// 1. Acquire the per-bucket lock. +// 2. Snapshot the bucket's State from the registry. If the bucket +// doesn't exist, release the lock and return ErrBucketNotFound. +// 3. Allocate a per-op staging buffer plus a CBOR view over it. +// +// Caller MUST defer tx.Discard() and call tx.Commit on success. +// Both Commit and Discard are idempotent against the lock — either +// one releases it; calling the other afterwards is a no-op. +// +// The bucket name is cloned defensively: protocol layers like +// versitygw/fiber return string headers that alias the request +// buffer (valid only inside the handler), and we persist the +// bucket name in Tx.bucket → OpRoot.Bucket → segment.opRoots, +// which the async flush path reads after the handler returns. 
+func (c *Coordinator) Begin(ctx context.Context, bucket string) (*Tx, error) { + bucket = strings.Clone(bucket) + release := c.Lock(bucket) + + state, err := c.deps.Reg.Get(ctx, bucket) + if err != nil { + release() + if errors.Is(err, registry.ErrNotFound) { + return nil, ErrBucketNotFound + } + return nil, fmt.Errorf("bucketop: get bucket %q: %w", bucket, err) + } + + staging := blockstore.NewOpStaging(c.deps.Reads, c.deps.Log, bucket) + return &Tx{ + deps: c.deps, + bucket: bucket, + state: state, + staging: staging, + cst: blockstore.CborStore(staging), + release: release, + }, nil +} + +// Lock acquires the per-bucket lock without starting a transaction +// and returns a release func the caller MUST defer. Used by +// non-write operations that still need to serialize against +// concurrent writes — DeleteBucket, for example, walks the MST to +// confirm the bucket is empty and then deletes the registry row; +// without serialization a concurrent PUT could squeeze in between. +// +// Most callers should prefer WithLock, which removes the +// defer-or-leak hazard. +func (c *Coordinator) Lock(bucket string) func() { + lock := c.lockFor(bucket) + lock.Lock() + return lock.Unlock +} + +// MutateFn is the closure passed to WithTx. It receives the +// transaction's bucket-state snapshot and the per-op staging +// view, and returns the MST root the transaction should advance +// to. +// +// - Returning (newRoot, nil) with newRoot.Defined() commits the +// transaction: log.AppendBatch fsyncs the staging buffer and +// reg.CASRoot advances the bucket Root. +// - Returning (cid.Undef, nil) signals "no-op success": the +// staging buffer is discarded with no log append and no Root +// advance. Used by S3 DELETE-on-missing-key, which is +// idempotent: the protocol wants a 200 even though the tree +// didn't change. +// - Returning (_, non-nil err) discards and propagates err. 
+type MutateFn func(ctx context.Context, tx *Tx) (newRoot cid.Cid, err error) + +// WithTx runs fn against a fresh transaction. Begin/Commit/Discard +// happen automatically based on what fn returns; the caller can +// neither leak the bucket lock by forgetting Discard nor leak +// in-flight bytes by forgetting Commit. +// +// Errors mapped to the caller: +// - ErrBucketNotFound from Begin propagates verbatim (fn is not +// invoked). +// - registry.ErrConflict from the inner CASRoot propagates +// wrapped (only reachable in cross-process races; the +// in-process bucket lock prevents it within one Coordinator). +// - Any error fn returns propagates verbatim. +func (c *Coordinator) WithTx(ctx context.Context, bucket string, fn MutateFn) error { + tx, err := c.Begin(ctx, bucket) + if err != nil { + return err + } + + newRoot, fnErr := fn(ctx, tx) + if fnErr != nil { + tx.Discard() + return fnErr + } + if !newRoot.Defined() { + tx.Discard() + return nil + } + return tx.Commit(ctx, newRoot) +} + +// LockFn is the closure passed to WithLock. It runs while the +// per-bucket lock is held; the lock is released as soon as fn +// returns, regardless of whether fn errored. +type LockFn func(ctx context.Context) error + +// WithLock runs fn while holding the per-bucket lock. Counterpart +// to WithTx for non-mutating bucket-level operations +// (DeleteBucket's empty-check + delete; future bucket-policy +// updates). +func (c *Coordinator) WithLock(ctx context.Context, bucket string, fn LockFn) error { + release := c.Lock(bucket) + defer release() + return fn(ctx) +} + +func (c *Coordinator) lockFor(bucket string) *sync.Mutex { + c.mu.Lock() + defer c.mu.Unlock() + if m, ok := c.locks[bucket]; ok { + return m + } + m := &sync.Mutex{} + c.locks[bucket] = m + return m +} + +// Close shuts down the underlying log: seals the open segment, +// drains the flush queue, and updates per-bucket forge_root_cid +// for every op_root contained in flushed segments. 
After Close +// returns cleanly, every acked write is durable in Forge or +// scheduled to ship. Close is one-shot at process shutdown; +// subsequent Begin/Lock calls are not safe. +func (c *Coordinator) Close(ctx context.Context) error { + return c.deps.Log.Close(ctx) +} + +// Tx is a single-bucket write transaction. It exposes four I/O +// methods (Get/Put for CBOR, GetBlock/PutBlock for raw bytes) so +// callers don't have to reach for the underlying blockstore / +// IpldStore views — the four interface assertions below pin the +// contracts the rest of pkg/ms3t relies on. +type Tx struct { + deps Deps + bucket string + state *registry.State + staging *blockstore.OpStaging + cst blockstore.Store + + // release is the bucket-lock release closure. Set by Begin; + // nil-ed by finalize() so Commit and Discard mutually agree + // that the lock has been released exactly once. + release func() +} + +// Compile-time assertions: Tx is the canonical handle through which +// the rest of pkg/ms3t reaches into the per-op staging buffer, so +// it must satisfy each of the contracts at the call sites. +var ( + _ blockstore.Store = (*Tx)(nil) // Get, Put → manifest CBOR + MST.GetPointer + _ blockstore.Reader = (*Tx)(nil) // Get → mst.LoadMST / NewEmptyMST + _ blockstore.BlockReader = (*Tx)(nil) // GetBlock → OpenBody / OpenBodyRange + _ blockstore.BlockWriter = (*Tx)(nil) // PutBlock → PutBody +) + +// State returns the bucket's State as snapshotted at Begin. The +// reported Root is the value Commit will CAS against. +func (tx *Tx) State() *registry.State { return tx.state } + +// Get fetches a CBOR-encoded value at c into out. Tx satisfies +// blockstore.Store (Get + Put) and blockstore.Reader (Get) so it +// can be passed directly to mst.LoadMST and +// MerkleSearchTree.GetPointer. +func (tx *Tx) Get(ctx context.Context, c cid.Cid, out any) error { + return tx.cst.Get(ctx, c, out) +} + +// Put CBOR-encodes v into the per-op staging buffer and returns +// its CID. 
Tx satisfies blockstore.Store via Get + Put. +func (tx *Tx) Put(ctx context.Context, v any) (cid.Cid, error) { + return tx.cst.Put(ctx, v) +} + +// GetBlock fetches a raw block from the per-tx view: staging +// buffer first, then the layered read store. Satisfies +// bucket.BlockReader so OpenBody / OpenBodyRange can read from +// freshly-staged chunks during the same op (rare but consistent). +func (tx *Tx) GetBlock(ctx context.Context, c cid.Cid) (block.Block, error) { + return tx.staging.Get(ctx, c) +} + +// PutBlock writes a raw block into the per-op staging buffer. +// Satisfies bucket.BlockWriter so PutBody can stream chunks +// directly through the Tx without the caller threading a separate +// blockstore. +func (tx *Tx) PutBlock(ctx context.Context, blk block.Block) error { + return tx.staging.Put(ctx, blk) +} + +// LoadTree returns the bucket's MST loaded from State().Root, or a +// fresh empty MST if the bucket has no objects yet. Mutations on +// the returned tree flow into the per-op staging buffer because +// the tree is loaded with Tx as its store (Tx satisfies blockstore.Reader, +// and MST writes only happen at GetPointer time, which takes its +// writer as an explicit argument). +func (tx *Tx) LoadTree() *mst.MerkleSearchTree { + if tx.state.Root.Defined() { + return mst.LoadMST(tx, tx.state.Root) + } + return mst.NewEmptyMST(tx) +} + +// Commit finalizes the transaction: +// 1. log.AppendBatch fsyncs the staging buffer into the open log +// segment with an op-root of (bucket, newRoot). +// 2. registry.CASRoot advances the bucket Root from State().Root +// to newRoot. +// 3. The bucket lock is released. +// +// Returns registry.ErrConflict if another writer raced ahead of us +// (only possible across processes — within a single process the +// per-bucket lock prevents it). On any error the lock is still +// released; defer-Discard becomes a no-op. 
+// +// Failure mode worth knowing: if step 1 succeeds but step 2 fails +// (transient Postgres error, context cancellation between the two +// calls), the op_root is durable in the log even though the bucket +// Root never advanced. The flusher will eventually see this op_root +// and — today — blindly advance forge_root_cid to it, leaving +// forge_root_cid pointing at an orphan Root the bucket never +// published. See the TODO in pkg/ms3t/registry/segments.go's +// MarkSegmentFlushed for the planned conditional-update fix. +func (tx *Tx) Commit(ctx context.Context, newRoot cid.Cid) error { + if tx.release == nil { + return errors.New("bucketop: tx already finalized") + } + defer tx.finalize() + + if err := tx.staging.Commit(ctx, newRoot); err != nil { + return fmt.Errorf("bucketop: append: %w", err) + } + if err := tx.deps.Reg.CASRoot(ctx, tx.bucket, tx.state.Root, newRoot); err != nil { + return fmt.Errorf("bucketop: advance root: %w", err) + } + return nil +} + +// Discard rolls back the staging buffer (drops staged blocks +// without writing) and releases the bucket lock. Idempotent — safe +// to defer at the top of every operation regardless of whether +// Commit eventually runs. +func (tx *Tx) Discard() { + if tx.release == nil { + return + } + tx.staging.Discard() + tx.finalize() +} + +func (tx *Tx) finalize() { + if tx.release != nil { + tx.release() + tx.release = nil + } +} diff --git a/pkg/ms3t/cars/encoder.go b/pkg/ms3t/cars/encoder.go index 11bdcdc..03952c5 100644 --- a/pkg/ms3t/cars/encoder.go +++ b/pkg/ms3t/cars/encoder.go @@ -46,6 +46,47 @@ func Write(w io.Writer, roots []cid.Cid, blocks []block.Block) error { return err } +// WriteHeader writes only the CAR v1 header (root array + version) +// and returns the number of bytes written. Used by callers that +// build a CAR incrementally — e.g. an append-only log segment that +// emits one header at open and many block frames over time. 
+func WriteHeader(w io.Writer, roots []cid.Cid) (int64, error) { + if len(roots) == 0 { + return 0, fmt.Errorf("cars: at least one root required") + } + cw := &countingWriter{w: w} + headerBytes, err := encodeHeader(roots) + if err != nil { + return 0, fmt.Errorf("cars: encode header: %w", err) + } + if err := writeUvarint(cw, uint64(len(headerBytes))); err != nil { + return cw.n, fmt.Errorf("cars: write header len: %w", err) + } + if _, err := cw.Write(headerBytes); err != nil { + return cw.n, fmt.Errorf("cars: write header: %w", err) + } + return cw.n, nil +} + +// WriteBlocksAt writes only block frames (no header) at fileOffset +// and returns the absolute byte positions of each block's payload +// within the file. Use this to extend an already-open CAR built by +// WriteHeader. fileOffset must equal the current end-of-file size of +// the underlying writer; positions returned reflect that origin so +// they can be used as ReadAt offsets directly. +func WriteBlocksAt(w io.Writer, fileOffset int64, blocks []block.Block) ([]BlockPosition, error) { + cw := &countingWriter{w: w, n: fileOffset} + positions := make([]BlockPosition, 0, len(blocks)) + for i, blk := range blocks { + pos, err := writeBlock(cw, blk) + if err != nil { + return positions, fmt.Errorf("cars: write block %d (%s): %w", i, blk.Cid(), err) + } + positions = append(positions, pos) + } + return positions, nil +} + // WriteWithPositions is like Write, but additionally returns the byte // position of each block's payload within the encoded CAR. 
Used by the // Forge uploader to build a `blobindex.ShardedDagIndexView` mapping diff --git a/pkg/ms3t/cars/reader.go b/pkg/ms3t/cars/reader.go new file mode 100644 index 0000000..1d746e1 --- /dev/null +++ b/pkg/ms3t/cars/reader.go @@ -0,0 +1,161 @@ +package cars + +import ( + "bufio" + "encoding/binary" + "errors" + "fmt" + "io" + "os" + + block "github.com/ipfs/go-block-format" + "github.com/ipfs/go-cid" +) + +// ErrTorn is returned by ScanFile when the trailing bytes of the CAR +// look like an incomplete frame (truncated varint, mismatched frame +// length, or short read on payload). Callers can use the LastGoodEnd +// field of the returned ScanResult to truncate the file back to the +// last fully-fsynced batch boundary. +var ErrTorn = errors.New("cars: torn trailing frame") + +// Frame is one block read from a CAR file along with its on-disk +// position. Offset/Length describe the payload bytes (post-CID +// prefix), matching the convention used by BlockPosition / Write. +type Frame struct { + Block block.Block + Offset uint64 + Length uint64 +} + +// ScanResult is the outcome of ScanFile. +type ScanResult struct { + // Frames are every block read in file order. + Frames []Frame + // LastGoodEnd is the byte offset just past the last fully-read + // frame. If the file is intact, equals the file size; if a torn + // frame was detected, equals the start of that torn frame so + // callers can truncate to it. + LastGoodEnd int64 + // HeaderEnd is the byte offset just past the CAR v1 header (i.e., + // the offset of the first frame). + HeaderEnd int64 +} + +// ScanFile reads a CAR v1 file from path and returns every fully +// readable block + its on-disk position. If the file ends in a torn +// frame, ScanFile returns the frames it could read along with +// ErrTorn and LastGoodEnd pointing at the start of the torn frame. 
+// +// This is the recovery primitive: callers can `os.Truncate(path, +// LastGoodEnd)` to drop a torn tail, then re-derive the in-memory +// index from Frames. +func ScanFile(path string) (*ScanResult, error) { + f, err := os.Open(path) + if err != nil { + return nil, fmt.Errorf("cars: open %s: %w", path, err) + } + defer f.Close() + st, err := f.Stat() + if err != nil { + return nil, fmt.Errorf("cars: stat %s: %w", path, err) + } + size := st.Size() + + br := bufio.NewReader(f) + headerLen, err := binary.ReadUvarint(br) + if err != nil { + return nil, fmt.Errorf("cars: read header len: %w", err) + } + headerVarintBytes := uvarintLen(headerLen) + if _, err := br.Discard(int(headerLen)); err != nil { + return nil, fmt.Errorf("cars: skip header: %w", err) + } + headerEnd := int64(headerVarintBytes) + int64(headerLen) + + res := &ScanResult{HeaderEnd: headerEnd, LastGoodEnd: headerEnd} + pos := headerEnd + + for pos < size { + frameStart := pos + frameLen, varSize, terr := readUvarint(br) + if terr != nil { + if errors.Is(terr, io.EOF) || errors.Is(terr, io.ErrUnexpectedEOF) { + return res, fmt.Errorf("%w at offset %d", ErrTorn, frameStart) + } + return nil, fmt.Errorf("cars: read frame len at %d: %w", frameStart, terr) + } + // Bound check: frame must fit in remaining bytes. + if int64(frameLen)+int64(varSize)+frameStart > size { + res.LastGoodEnd = frameStart + return res, fmt.Errorf("%w at offset %d (frame len %d exceeds file)", ErrTorn, frameStart, frameLen) + } + + // Read frame body: CID prefix + block bytes. 
+ body := make([]byte, frameLen) + if _, err := io.ReadFull(br, body); err != nil { + res.LastGoodEnd = frameStart + return res, fmt.Errorf("%w at offset %d: %w", ErrTorn, frameStart, err) + } + c, cidLen, err := cidFromBytes(body) + if err != nil { + return nil, fmt.Errorf("cars: parse cid at offset %d: %w", frameStart, err) + } + payload := body[cidLen:] + blk, err := block.NewBlockWithCid(payload, c) + if err != nil { + return nil, fmt.Errorf("cars: new block at offset %d: %w", frameStart, err) + } + + dataOffset := uint64(frameStart) + uint64(varSize) + uint64(cidLen) + res.Frames = append(res.Frames, Frame{ + Block: blk, + Offset: dataOffset, + Length: uint64(len(payload)), + }) + pos = frameStart + int64(varSize) + int64(frameLen) + res.LastGoodEnd = pos + } + return res, nil +} + +// uvarintLen returns the encoded byte length of n. +func uvarintLen(n uint64) int { + var buf [binary.MaxVarintLen64]byte + return binary.PutUvarint(buf[:], n) +} + +// readUvarint pulls a varint from br and reports how many bytes it +// consumed. Wraps the bufio reader's ReadByte so we can count. 
+func readUvarint(br *bufio.Reader) (uint64, int, error) { + var ( + x uint64 + s uint + n int + ) + for { + b, err := br.ReadByte() + if err != nil { + return 0, n, err + } + n++ + if b < 0x80 { + if n > binary.MaxVarintLen64 || (n == binary.MaxVarintLen64 && b > 1) { + return 0, n, fmt.Errorf("cars: uvarint overflow") + } + return x | uint64(b)< maxSeq { + maxSeq = seq + } + row, hasRow := dbBySeq[seq] + + switch { + case hasRow && row.State == StateOpen: + seg, err := rebuildOpenFromDisk(s.cfg.Dir, seq, s.logger) + if err != nil { + return fmt.Errorf("logstore: rebuild open seg %d: %w", seq, err) + } + if recoveredOpen != nil { + return fmt.Errorf("logstore: more than one open segment on disk (seqs %d and %d)", + recoveredOpen.seq, seq) + } + recoveredOpen = seg + + case hasRow && row.State == StateSealed: + seg, err := loadSealedFromIdx(s.cfg.Dir, seq, s.logger) + if err != nil { + return fmt.Errorf("logstore: load sealed seg %d: %w", seq, err) + } + sealedRecovered = append(sealedRecovered, loaded{seg: seg}) + + case flushedOnDisk[seq]: + seg, err := loadFlushedFromIdx(s.cfg.Dir, seq, 0, s.logger) + if err != nil { + return fmt.Errorf("logstore: load flushed seg %d: %w", seq, err) + } + flushedRecovered = append(flushedRecovered, loaded{seg: seg}) + + default: + // File on disk but no DB row and no idx — treat as a + // previously-open segment that crashed before sealing. Rebuild + // as open and let the force-seal path in Open() finalize it. + seg, err := rebuildOpenFromDisk(s.cfg.Dir, seq, s.logger) + if err != nil { + return fmt.Errorf("logstore: rebuild orphan seg %d: %w", seq, err) + } + // Seed the DB row in 'open' so the seal transition's + // "from open" UPDATE matches. 
+ if err := s.cfg.Meta.InsertSegmentOpen(ctx, seq); err != nil { + return fmt.Errorf("logstore: insert orphan row %d: %w", seq, err) + } + if recoveredOpen != nil { + return fmt.Errorf("logstore: orphan + open conflict (seqs %d and %d)", + recoveredOpen.seq, seq) + } + recoveredOpen = seg + } + } + + // DB rows without a corresponding .car file → log + clean up. + for seq, row := range dbBySeq { + if _, ok := carSeqs[seq]; ok { + continue + } + s.logger.Error("logstore: DB segment row without on-disk file; deleting row", + zap.Uint64("seq", seq), zap.String("state", row.State.String())) + if err := s.cfg.Meta.DeleteSegment(ctx, seq); err != nil { + return fmt.Errorf("logstore: delete orphan row %d: %w", seq, err) + } + } + + // Sort recovered sealed/flushed segments newest-first by seq. + sort.Slice(sealedRecovered, func(i, j int) bool { + return sealedRecovered[i].seg.Seq() > sealedRecovered[j].seg.Seq() + }) + sort.Slice(flushedRecovered, func(i, j int) bool { + return flushedRecovered[i].seg.Seq() > flushedRecovered[j].seg.Seq() + }) + + // Combine into the sealed slice (newest-first overall). + all := make([]*Segment, 0, len(sealedRecovered)+len(flushedRecovered)) + for _, l := range sealedRecovered { + all = append(all, l.seg) + } + for _, l := range flushedRecovered { + all = append(all, l.seg) + } + sort.SliceStable(all, func(i, j int) bool { return all[i].Seq() > all[j].Seq() }) + s.sealed = all + + // Re-enqueue sealed segments (not flushed) for the flusher. 
+ for _, seg := range s.sealed { + if seg.State() == StateSealed { + select { + case s.flushQ <- seg: + default: + s.logger.Warn("logstore: flush queue full at recovery; will retry on tick", + zap.Uint64("seq", seg.Seq())) + } + } + } + + s.open = recoveredOpen + if recoveredOpen != nil && recoveredOpen.Seq() > maxSeq { + maxSeq = recoveredOpen.Seq() + } + s.nextSeq = maxSeq + 1 + + return nil +} diff --git a/pkg/ms3t/logstore/segment.go b/pkg/ms3t/logstore/segment.go new file mode 100644 index 0000000..5af0b71 --- /dev/null +++ b/pkg/ms3t/logstore/segment.go @@ -0,0 +1,826 @@ +package logstore + +import ( + "context" + "crypto/sha256" + "encoding/binary" + "encoding/json" + "errors" + "fmt" + "io" + "os" + "path/filepath" + "sync" + "time" + + block "github.com/ipfs/go-block-format" + "github.com/ipfs/go-cid" + "go.uber.org/zap" + + "github.com/storacha/sprue/pkg/ms3t/blockstore" + "github.com/storacha/sprue/pkg/ms3t/cars" +) + +// placeholderRoot is the placeholder CAR header root. Each segment +// is multi-rooted by intent; the per-op roots live in the .ops +// sidecar (and in-memory OpRoots), not the CAR header. +var placeholderRoot = cid.NewCidV1(cid.Raw, []byte{0x00, 0x00}) + +// Segment is one log file. Open segments accept appends; sealed +// segments are read-only. +// +// Concurrency model: Append is serialized by Store.appMu. Reads +// (Lookup + ReadAt against fdRO) and seal/finalize use Segment-level +// locks so they don't block appenders unnecessarily. +type Segment struct { + seq uint64 + dir string + logger *zap.Logger + + // stateMu guards state, sealedAt, sha256, opRoots, sizeBytes, + // index, seen, fdRW, and fdRO. RLock for reads (lookups, opRoots + // access); Lock for mutating any of the above. + stateMu sync.RWMutex + + state State + sealedAt int64 + sha256 []byte + + sizeBytes int64 + // index maps each block's CID to its on-disk byte position + // inside the segment's CAR. 
Updated on append (after a successful + // fsync) and rebuilt on recovery from either the .idx sidecar or + // a fresh CAR scan. + index map[cid.Cid]blockstore.BlockLoc + // seen is the dedup gate consulted by append. CIDs that have + // already landed in this segment are skipped before + // cars.WriteBlocksAt is called, so duplicate bytes are never + // written to disk and never shipped to Forge. Always kept in + // sync with index's key set. + seen *cid.Set + opRoots []blockstore.OpRoot + + // fdRW is the append/read file descriptor for an open segment. + // Closed at seal. + fdRW *os.File + // opsFD is the append-only ops sidecar (open segment only). + // Closed at seal. + opsFD *os.File + // fdRO is the read-only descriptor used to serve Get after seal + // (and before, when the open fdRW exists). For open segments we + // use fdRW for reads via ReadAt; fdRO is opened lazily at seal + // time so reads after seal don't need to reopen on every Get. + fdRO *os.File +} + +// Seq returns the segment's identifier. +func (s *Segment) Seq() uint64 { return s.seq } + +// State reports the current lifecycle state. +func (s *Segment) State() State { + s.stateMu.RLock() + defer s.stateMu.RUnlock() + return s.state +} + +// Size reports the current on-disk byte size of the CAR file. +func (s *Segment) Size() int64 { + s.stateMu.RLock() + defer s.stateMu.RUnlock() + return s.sizeBytes +} + +// SHA256 returns the seal-time sha256 of the CAR file. Empty for +// open segments. +func (s *Segment) SHA256() []byte { + s.stateMu.RLock() + defer s.stateMu.RUnlock() + out := make([]byte, len(s.sha256)) + copy(out, s.sha256) + return out +} + +// SealedAt returns the seal-time unix-seconds timestamp. Zero for +// open segments. +func (s *Segment) SealedAt() int64 { + s.stateMu.RLock() + defer s.stateMu.RUnlock() + return s.sealedAt +} + +// OpRoots returns a copy of the per-batch (bucket, root) records. +// Safe to call from any goroutine. 
+func (s *Segment) OpRoots() []blockstore.OpRoot { + s.stateMu.RLock() + defer s.stateMu.RUnlock() + out := make([]blockstore.OpRoot, len(s.opRoots)) + copy(out, s.opRoots) + return out +} + +// BlockPositions returns a copy of the cid → on-disk-position +// table for the segment's CAR. Populated at append time and +// rebuilt on recovery from either the .idx sidecar or a fresh CAR +// scan. Used by the flush path to build a ShardedDagIndexView +// without rescanning the file. Safe to call from any goroutine. +func (s *Segment) BlockPositions() map[cid.Cid]blockstore.BlockLoc { + s.stateMu.RLock() + defer s.stateMu.RUnlock() + out := make(map[cid.Cid]blockstore.BlockLoc, len(s.index)) + for c, loc := range s.index { + out[c] = loc + } + return out +} + +// CARPath returns the absolute path to the segment's CAR file. +func (s *Segment) CARPath() string { return filepath.Join(s.dir, carName(s.seq)) } + +// OpsPath returns the absolute path to the segment's ops sidecar. +func (s *Segment) OpsPath() string { return filepath.Join(s.dir, opsName(s.seq)) } + +// IdxPath returns the absolute path to the segment's idx sidecar. +func (s *Segment) IdxPath() string { return filepath.Join(s.dir, idxName(s.seq)) } + +func carName(seq uint64) string { return fmt.Sprintf("seg-%020d.car", seq) } +func opsName(seq uint64) string { return fmt.Sprintf("seg-%020d.ops", seq) } +func idxName(seq uint64) string { return fmt.Sprintf("seg-%020d.idx", seq) } + +// createOpenSegment creates a brand-new segment in the open state: +// initializes the CAR file with a header, opens the ops sidecar, +// and records the row in Meta. 
+func createOpenSegment(ctx context.Context, dir string, seq uint64, meta Meta, logger *zap.Logger) (*Segment, error) { + carPath := filepath.Join(dir, carName(seq)) + opsPath := filepath.Join(dir, opsName(seq)) + + carFile, err := os.OpenFile(carPath, os.O_RDWR|os.O_CREATE|os.O_TRUNC, 0o644) + if err != nil { + return nil, fmt.Errorf("logstore: open car %d: %w", seq, err) + } + hdrLen, err := cars.WriteHeader(carFile, []cid.Cid{placeholderRoot}) + if err != nil { + _ = carFile.Close() + _ = os.Remove(carPath) + return nil, fmt.Errorf("logstore: write header %d: %w", seq, err) + } + if err := carFile.Sync(); err != nil { + _ = carFile.Close() + _ = os.Remove(carPath) + return nil, fmt.Errorf("logstore: sync header %d: %w", seq, err) + } + + opsFile, err := os.OpenFile(opsPath, os.O_RDWR|os.O_CREATE|os.O_TRUNC, 0o644) + if err != nil { + _ = carFile.Close() + _ = os.Remove(carPath) + return nil, fmt.Errorf("logstore: open ops %d: %w", seq, err) + } + + if err := meta.InsertSegmentOpen(ctx, seq); err != nil { + _ = carFile.Close() + _ = opsFile.Close() + _ = os.Remove(carPath) + _ = os.Remove(opsPath) + return nil, err + } + + return &Segment{ + seq: seq, + dir: dir, + logger: logger, + state: StateOpen, + sizeBytes: hdrLen, + index: map[cid.Cid]blockstore.BlockLoc{}, + seen: cid.NewSet(), + fdRW: carFile, + opsFD: opsFile, + }, nil +} + +// append writes the given blocks + opRoot to disk and updates the +// in-memory index. fsyncs both files before returning. Caller must +// hold Store.appMu. +// +// Block-level dedup: every block is checked against s.seen before +// writing. CIDs already present in this segment are skipped, so a +// duplicate body chunk or MST node landing across two PUTs in the +// same segment never hits the CAR file twice and never ships to +// Forge twice. The op-root record is appended unconditionally — +// even an all-duplicate batch still represents a real bucket-Root +// advance and must be replayed by the flusher. 
+func (s *Segment) append(blocks []block.Block, opRoot blockstore.OpRoot) error { + s.stateMu.Lock() + defer s.stateMu.Unlock() + + if s.state != StateOpen || s.fdRW == nil { + return errors.New("logstore: segment not open for append") + } + + // Filter out CIDs we've already written into this segment. We + // don't mutate s.seen yet — only after the file write succeeds — + // so a fsync error doesn't poison the dedup state. + fresh := make([]block.Block, 0, len(blocks)) + for _, blk := range blocks { + if s.seen.Has(blk.Cid()) { + continue + } + fresh = append(fresh, blk) + } + + var positions []cars.BlockPosition + if len(fresh) > 0 { + var err error + positions, err = cars.WriteBlocksAt(s.fdRW, s.sizeBytes, fresh) + if err != nil { + return fmt.Errorf("logstore: append blocks seg %d: %w", s.seq, err) + } + } + + // Append the op-root record to the ops sidecar regardless of + // whether any new bytes were written to the CAR. + opsRec, err := encodeOpRecord(opRoot) + if err != nil { + return fmt.Errorf("logstore: encode oprec seg %d: %w", s.seq, err) + } + if _, err := s.opsFD.Write(opsRec); err != nil { + return fmt.Errorf("logstore: write ops seg %d: %w", s.seq, err) + } + + // fsync both files in parallel. The CAR fsync is a fast no-op + // when len(fresh) == 0 (nothing written since the last sync) but + // we issue it anyway to keep the durability contract uniform. + var wg sync.WaitGroup + var carErr, opsErr error + wg.Add(2) + go func() { + defer wg.Done() + carErr = s.fdRW.Sync() + }() + go func() { + defer wg.Done() + opsErr = s.opsFD.Sync() + }() + wg.Wait() + if carErr != nil { + return fmt.Errorf("logstore: fsync car seg %d: %w", s.seq, carErr) + } + if opsErr != nil { + return fmt.Errorf("logstore: fsync ops seg %d: %w", s.seq, opsErr) + } + + // Commit the dedup state and the position table together. 
+ for i, blk := range fresh { + s.seen.Add(blk.Cid()) + s.index[blk.Cid()] = blockstore.BlockLoc{Offset: positions[i].Offset, Length: positions[i].Length} + } + if n := len(positions); n > 0 { + end := int64(positions[n-1].Offset) + int64(positions[n-1].Length) + if end > s.sizeBytes { + s.sizeBytes = end + } + } + s.opRoots = append(s.opRoots, opRoot) + return nil +} + +// seal closes the open fds, hashes the CAR, writes the .idx sidecar, +// and updates Meta. After this returns, the segment is in +// StateSealed and safe to be flushed. +func (s *Segment) seal(ctx context.Context, meta Meta) error { + s.stateMu.Lock() + defer s.stateMu.Unlock() + + if s.state != StateOpen { + // Idempotent: already sealed. + return nil + } + + // Final fsync before close (defensive — append already fsyncs). + if err := s.fdRW.Sync(); err != nil { + return fmt.Errorf("logstore: pre-seal fsync car %d: %w", s.seq, err) + } + if err := s.opsFD.Sync(); err != nil { + return fmt.Errorf("logstore: pre-seal fsync ops %d: %w", s.seq, err) + } + if err := s.fdRW.Close(); err != nil { + return fmt.Errorf("logstore: close car %d: %w", s.seq, err) + } + s.fdRW = nil + if err := s.opsFD.Close(); err != nil { + return fmt.Errorf("logstore: close ops %d: %w", s.seq, err) + } + s.opsFD = nil + + // Compute CAR sha256 by streaming the file. + sum, err := hashFile(s.CARPath()) + if err != nil { + return fmt.Errorf("logstore: hash %d: %w", s.seq, err) + } + s.sha256 = sum + s.sealedAt = time.Now().Unix() + s.state = StateSealed + + // Write idx sidecar (atomic via tmp+rename). + if err := s.writeIdxLocked(); err != nil { + return fmt.Errorf("logstore: write idx %d: %w", s.seq, err) + } + + // Persist sealed state in Postgres. + if err := meta.MarkSegmentSealed(ctx, s.seq, s.sealedAt, s.sizeBytes, s.sha256, s.opRoots); err != nil { + return fmt.Errorf("logstore: mark sealed %d: %w", s.seq, err) + } + + // Open the read-only fd that will serve Get from now on. 
+ roFD, err := os.Open(s.CARPath()) + if err != nil { + return fmt.Errorf("logstore: open ro car %d: %w", s.seq, err) + } + s.fdRO = roFD + + return nil +} + +// retire closes any open fd and unlinks the segment's files. Safe to +// call after MarkFlushed; the caller must guarantee no other +// goroutine still holds a reference for reads. +func (s *Segment) retire() error { + s.stateMu.Lock() + defer s.stateMu.Unlock() + + if s.fdRO != nil { + _ = s.fdRO.Close() + s.fdRO = nil + } + if s.fdRW != nil { + _ = s.fdRW.Close() + s.fdRW = nil + } + if s.opsFD != nil { + _ = s.opsFD.Close() + s.opsFD = nil + } + + for _, name := range []string{s.CARPath(), s.OpsPath(), s.IdxPath()} { + if err := os.Remove(name); err != nil && !errors.Is(err, os.ErrNotExist) { + return fmt.Errorf("logstore: unlink %s: %w", name, err) + } + } + return nil +} + +// get returns the block at the given CID, or blockstore.ErrNotFound. Safe for +// concurrent callers. +func (s *Segment) get(_ context.Context, c cid.Cid) (block.Block, error) { + s.stateMu.RLock() + loc, ok := s.index[c] + fd := s.fdRO + if fd == nil { + fd = s.fdRW + } + s.stateMu.RUnlock() + if !ok { + return nil, blockstore.ErrNotFound + } + if fd == nil { + return nil, fmt.Errorf("logstore: segment %d has no read fd", s.seq) + } + buf := make([]byte, loc.Length) + if _, err := fd.ReadAt(buf, int64(loc.Offset)); err != nil { + return nil, fmt.Errorf("logstore: read seg %d offset %d: %w", s.seq, loc.Offset, err) + } + return block.NewBlockWithCid(buf, c) +} + +// writeIdxLocked persists the idx sidecar. Caller must hold stateMu +// in write mode and have already populated sha256/sealedAt. 
+func (s *Segment) writeIdxLocked() error { + type idxBlock struct { + CID string `json:"cid"` + Offset uint64 `json:"offset"` + Length uint64 `json:"length"` + } + type idxOpRoot struct { + Bucket string `json:"bucket"` + Root string `json:"root"` + } + type idxFile struct { + Seq uint64 `json:"seq"` + SizeBytes int64 `json:"size_bytes"` + SHA256 string `json:"sha256_hex"` + SealedAt int64 `json:"sealed_at"` + Blocks []idxBlock `json:"blocks"` + OpRoots []idxOpRoot `json:"op_roots"` + } + + blocks := make([]idxBlock, 0, len(s.index)) + for c, loc := range s.index { + blocks = append(blocks, idxBlock{ + CID: c.String(), + Offset: loc.Offset, + Length: loc.Length, + }) + } + opRoots := make([]idxOpRoot, len(s.opRoots)) + for i, opr := range s.opRoots { + opRoots[i] = idxOpRoot{Bucket: opr.Bucket, Root: opr.Root.String()} + } + + body := idxFile{ + Seq: s.seq, + SizeBytes: s.sizeBytes, + SHA256: fmt.Sprintf("%x", s.sha256), + SealedAt: s.sealedAt, + Blocks: blocks, + OpRoots: opRoots, + } + data, err := json.MarshalIndent(body, "", " ") + if err != nil { + return err + } + tmp := s.IdxPath() + ".tmp" + if err := os.WriteFile(tmp, data, 0o644); err != nil { + return err + } + return os.Rename(tmp, s.IdxPath()) +} + +// loadSealedFromIdx hydrates a Segment in the StateSealed state from +// its on-disk .idx sidecar. Used at startup. Returns (nil, error) on +// any malformed sidecar; the caller can fall back to a CAR scan. 
+func loadSealedFromIdx(dir string, seq uint64, logger *zap.Logger) (*Segment, error) { + idxPath := filepath.Join(dir, idxName(seq)) + data, err := os.ReadFile(idxPath) + if err != nil { + return nil, fmt.Errorf("logstore: read idx %d: %w", seq, err) + } + var raw struct { + Seq uint64 `json:"seq"` + SizeBytes int64 `json:"size_bytes"` + SHA256 string `json:"sha256_hex"` + SealedAt int64 `json:"sealed_at"` + Blocks []struct { + CID string `json:"cid"` + Offset uint64 `json:"offset"` + Length uint64 `json:"length"` + } `json:"blocks"` + OpRoots []struct { + Bucket string `json:"bucket"` + Root string `json:"root"` + } `json:"op_roots"` + } + if err := json.Unmarshal(data, &raw); err != nil { + return nil, fmt.Errorf("logstore: parse idx %d: %w", seq, err) + } + if raw.Seq != seq { + return nil, fmt.Errorf("logstore: idx seq %d does not match filename %d", raw.Seq, seq) + } + idx := make(map[cid.Cid]blockstore.BlockLoc, len(raw.Blocks)) + seen := cid.NewSet() + for _, b := range raw.Blocks { + c, err := cid.Decode(b.CID) + if err != nil { + return nil, fmt.Errorf("logstore: idx bad cid %q: %w", b.CID, err) + } + idx[c] = blockstore.BlockLoc{Offset: b.Offset, Length: b.Length} + seen.Add(c) + } + ops := make([]blockstore.OpRoot, len(raw.OpRoots)) + for i, o := range raw.OpRoots { + c, err := cid.Decode(o.Root) + if err != nil { + return nil, fmt.Errorf("logstore: idx bad root %q: %w", o.Root, err) + } + ops[i] = blockstore.OpRoot{Bucket: o.Bucket, Root: c} + } + sha, err := hexDecode(raw.SHA256) + if err != nil { + return nil, fmt.Errorf("logstore: idx bad sha %q: %w", raw.SHA256, err) + } + + carFD, err := os.Open(filepath.Join(dir, carName(seq))) + if err != nil { + return nil, fmt.Errorf("logstore: open sealed car %d: %w", seq, err) + } + return &Segment{ + seq: seq, + dir: dir, + logger: logger, + state: StateSealed, + sealedAt: raw.SealedAt, + sha256: sha, + sizeBytes: raw.SizeBytes, + index: idx, + seen: seen, + opRoots: ops, + fdRO: carFD, + }, nil +} + +// 
loadFlushedFromIdx is loadSealedFromIdx but yields StateFlushed. +// Used to pick up retained segments at startup. +func loadFlushedFromIdx(dir string, seq uint64, flushedAt int64, logger *zap.Logger) (*Segment, error) { + seg, err := loadSealedFromIdx(dir, seq, logger) + if err != nil { + return nil, err + } + seg.state = StateFlushed + _ = flushedAt // kept for future use; not stored on Segment today. + return seg, nil +} + +// rebuildOpenFromDisk takes a torn or sidecar-less open segment on +// disk (the segment was open at crash time) and reconstructs an +// in-memory Segment ready to be sealed. It scans the CAR (truncating +// any torn last frame) and replays the .ops file. +// +// The returned segment is in StateOpen with its fds repositioned at +// EOF; the caller is expected to immediately call seal() to retire +// it cleanly. We do not resume appending to a recovered open +// segment — every restart starts a fresh segment for the next ops. +func rebuildOpenFromDisk(dir string, seq uint64, logger *zap.Logger) (*Segment, error) { + carPath := filepath.Join(dir, carName(seq)) + scan, err := cars.ScanFile(carPath) + if err != nil && !errors.Is(err, cars.ErrTorn) { + return nil, fmt.Errorf("logstore: scan recovered car %d: %w", seq, err) + } + if errors.Is(err, cars.ErrTorn) { + if terr := os.Truncate(carPath, scan.LastGoodEnd); terr != nil { + return nil, fmt.Errorf("logstore: truncate torn car %d: %w", seq, terr) + } + logger.Warn("logstore: truncated torn trailing frame in segment", + zap.Uint64("seq", seq), + zap.Int64("truncated_at", scan.LastGoodEnd)) + } + + idx := make(map[cid.Cid]blockstore.BlockLoc, len(scan.Frames)) + seen := cid.NewSet() + var size int64 = scan.LastGoodEnd + for _, f := range scan.Frames { + c := f.Block.Cid() + idx[c] = blockstore.BlockLoc{Offset: f.Offset, Length: f.Length} + seen.Add(c) + } + + opsPath := filepath.Join(dir, opsName(seq)) + ops, err := readAllOps(opsPath) + if err != nil { + return nil, fmt.Errorf("logstore: read 
ops %d: %w", seq, err) + } + + carFD, err := os.OpenFile(carPath, os.O_RDWR, 0o644) + if err != nil { + return nil, fmt.Errorf("logstore: reopen car %d: %w", seq, err) + } + if _, err := carFD.Seek(size, io.SeekStart); err != nil { + _ = carFD.Close() + return nil, fmt.Errorf("logstore: seek car %d: %w", seq, err) + } + opsFD, err := os.OpenFile(opsPath, os.O_RDWR|os.O_CREATE, 0o644) + if err != nil { + _ = carFD.Close() + return nil, fmt.Errorf("logstore: reopen ops %d: %w", seq, err) + } + if _, err := opsFD.Seek(0, io.SeekEnd); err != nil { + _ = carFD.Close() + _ = opsFD.Close() + return nil, fmt.Errorf("logstore: seek ops %d: %w", seq, err) + } + return &Segment{ + seq: seq, + dir: dir, + logger: logger, + state: StateOpen, + sizeBytes: size, + index: idx, + seen: seen, + opRoots: ops, + fdRW: carFD, + opsFD: opsFD, + }, nil +} + +// === ops sidecar codec === +// +// Each record is a 4-byte big-endian length prefix followed by a +// minimal CBOR-encoded payload: a 2-element array +// [bucket: text, root: cid bytes]. We use array form rather than a +// map to keep the encoding compact and order-independent of map +// iteration. + +const opRecMaxSize = 1 << 20 // 1 MiB ceiling per record (defensive) + +func encodeOpRecord(opr blockstore.OpRoot) ([]byte, error) { + if !opr.Root.Defined() { + return nil, errors.New("logstore: opRoot.Root must be defined") + } + if len(opr.Bucket) > 1<<16 { + return nil, errors.New("logstore: bucket name too long") + } + bucketBytes := []byte(opr.Bucket) + rootBytes := opr.Root.Bytes() + + // Manual CBOR: array(2) + text(bucket) + bytes(root). + body := make([]byte, 0, 16+len(bucketBytes)+len(rootBytes)) + body = appendCborHead(body, 4 /*MajArray*/, 2) + body = appendCborHead(body, 3 /*MajTextString*/, uint64(len(bucketBytes))) + body = append(body, bucketBytes...) + body = appendCborHead(body, 2 /*MajByteString*/, uint64(len(rootBytes))) + body = append(body, rootBytes...) 
+ + buf := make([]byte, 4+len(body)) + binary.BigEndian.PutUint32(buf[:4], uint32(len(body))) + copy(buf[4:], body) + return buf, nil +} + +func readAllOps(path string) ([]blockstore.OpRoot, error) { + data, err := os.ReadFile(path) + if err != nil { + if errors.Is(err, os.ErrNotExist) { + return nil, nil + } + return nil, err + } + var out []blockstore.OpRoot + for off := 0; off < len(data); { + if len(data)-off < 4 { + break // torn trailing prefix — drop + } + length := int(binary.BigEndian.Uint32(data[off : off+4])) + if length <= 0 || length > opRecMaxSize || off+4+length > len(data) { + break // torn trailing record — drop + } + body := data[off+4 : off+4+length] + opr, err := decodeOpRecord(body) + if err != nil { + return nil, fmt.Errorf("logstore: ops record at %d: %w", off, err) + } + out = append(out, opr) + off += 4 + length + } + return out, nil +} + +func decodeOpRecord(body []byte) (blockstore.OpRoot, error) { + r := newCborReader(body) + maj, count, err := r.readHead() + if err != nil { + return blockstore.OpRoot{}, err + } + if maj != 4 || count != 2 { + return blockstore.OpRoot{}, fmt.Errorf("expected array(2), got %d/%d", maj, count) + } + bm, blen, err := r.readHead() + if err != nil { + return blockstore.OpRoot{}, err + } + if bm != 3 { + return blockstore.OpRoot{}, fmt.Errorf("expected text bucket, got maj %d", bm) + } + bucket, err := r.readBytes(int(blen)) + if err != nil { + return blockstore.OpRoot{}, err + } + rm, rlen, err := r.readHead() + if err != nil { + return blockstore.OpRoot{}, err + } + if rm != 2 { + return blockstore.OpRoot{}, fmt.Errorf("expected bytes root, got maj %d", rm) + } + rootBytes, err := r.readBytes(int(rlen)) + if err != nil { + return blockstore.OpRoot{}, err + } + c, err := cid.Cast(rootBytes) + if err != nil { + return blockstore.OpRoot{}, err + } + return blockstore.OpRoot{Bucket: string(bucket), Root: c}, nil +} + +// hashFile returns the sha256 of the file at path. 
+func hashFile(path string) ([]byte, error) { + f, err := os.Open(path) + if err != nil { + return nil, err + } + defer f.Close() + h := sha256.New() + if _, err := io.Copy(h, f); err != nil { + return nil, err + } + return h.Sum(nil), nil +} + +func hexDecode(s string) ([]byte, error) { + if len(s)%2 != 0 { + return nil, fmt.Errorf("odd length") + } + out := make([]byte, len(s)/2) + for i := 0; i < len(out); i++ { + hi, ok1 := unhex(s[2*i]) + lo, ok2 := unhex(s[2*i+1]) + if !ok1 || !ok2 { + return nil, fmt.Errorf("bad hex char") + } + out[i] = hi<<4 | lo + } + return out, nil +} + +func unhex(b byte) (byte, bool) { + switch { + case b >= '0' && b <= '9': + return b - '0', true + case b >= 'a' && b <= 'f': + return b - 'a' + 10, true + case b >= 'A' && b <= 'F': + return b - 'A' + 10, true + } + return 0, false +} + +// === minimal CBOR head encoding/decoding === + +func appendCborHead(buf []byte, maj uint8, val uint64) []byte { + switch { + case val < 24: + return append(buf, byte(maj<<5)|byte(val)) + case val < 1<<8: + return append(buf, byte(maj<<5)|24, byte(val)) + case val < 1<<16: + return append(buf, byte(maj<<5)|25, byte(val>>8), byte(val)) + case val < 1<<32: + return append(buf, byte(maj<<5)|26, + byte(val>>24), byte(val>>16), byte(val>>8), byte(val)) + default: + return append(buf, byte(maj<<5)|27, + byte(val>>56), byte(val>>48), byte(val>>40), byte(val>>32), + byte(val>>24), byte(val>>16), byte(val>>8), byte(val)) + } +} + +type cborReader struct { + buf []byte + pos int +} + +func newCborReader(b []byte) *cborReader { return &cborReader{buf: b} } + +func (r *cborReader) readHead() (uint8, uint64, error) { + if r.pos >= len(r.buf) { + return 0, 0, io.EOF + } + first := r.buf[r.pos] + r.pos++ + maj := first >> 5 + low := first & 0x1f + switch { + case low < 24: + return maj, uint64(low), nil + case low == 24: + if r.pos+1 > len(r.buf) { + return 0, 0, io.ErrUnexpectedEOF + } + v := uint64(r.buf[r.pos]) + r.pos++ + return maj, v, nil + case low == 25: + 
if r.pos+2 > len(r.buf) { + return 0, 0, io.ErrUnexpectedEOF + } + v := uint64(r.buf[r.pos])<<8 | uint64(r.buf[r.pos+1]) + r.pos += 2 + return maj, v, nil + case low == 26: + if r.pos+4 > len(r.buf) { + return 0, 0, io.ErrUnexpectedEOF + } + v := uint64(r.buf[r.pos])<<24 | uint64(r.buf[r.pos+1])<<16 | + uint64(r.buf[r.pos+2])<<8 | uint64(r.buf[r.pos+3]) + r.pos += 4 + return maj, v, nil + case low == 27: + if r.pos+8 > len(r.buf) { + return 0, 0, io.ErrUnexpectedEOF + } + v := uint64(r.buf[r.pos])<<56 | uint64(r.buf[r.pos+1])<<48 | + uint64(r.buf[r.pos+2])<<40 | uint64(r.buf[r.pos+3])<<32 | + uint64(r.buf[r.pos+4])<<24 | uint64(r.buf[r.pos+5])<<16 | + uint64(r.buf[r.pos+6])<<8 | uint64(r.buf[r.pos+7]) + r.pos += 8 + return maj, v, nil + default: + return 0, 0, fmt.Errorf("invalid cbor head 0x%x", first) + } +} + +func (r *cborReader) readBytes(n int) ([]byte, error) { + if r.pos+n > len(r.buf) { + return nil, io.ErrUnexpectedEOF + } + b := r.buf[r.pos : r.pos+n] + r.pos += n + return b, nil +} diff --git a/pkg/ms3t/logstore/store.go b/pkg/ms3t/logstore/store.go new file mode 100644 index 0000000..d387fbe --- /dev/null +++ b/pkg/ms3t/logstore/store.go @@ -0,0 +1,430 @@ +package logstore + +import ( + "context" + "errors" + "fmt" + "os" + "sync" + "time" + + block "github.com/ipfs/go-block-format" + "github.com/ipfs/go-cid" + "go.uber.org/zap" + + "github.com/storacha/sprue/pkg/ms3t/blockstore" +) + +// Compile-time assertion that *Store satisfies blockstore.Log. +// blockstore.Log is the consumer-facing contract (AppendBatch / +// Get / Close); *Store is the production LSM implementation that +// backs it. +var _ blockstore.Log = (*Store)(nil) + +// Store is the LSM-style log: one open segment accepting appends, +// plus N sealed segments (some flushed, some pending flush) that +// serve reads in front of the network blockstore. +// +// Concurrency: +// - catMu (RWMutex) guards open + sealed slice + nextSeq. 
Writers +// hold Lock briefly during seal/retire/new-open swaps. Readers +// hold RLock to take a stable snapshot of the segment list, then +// do file I/O outside the lock. +// - appMu (Mutex) serializes appenders against each other so the +// open-segment append fd has a single writer. +type Store struct { + cfg Config + logger *zap.Logger + + catMu sync.RWMutex + open *Segment + sealed []*Segment // newest-first; includes flushed-and-retained + nextSeq uint64 + + appMu sync.Mutex + + flushQ chan *Segment + closing chan struct{} + wg sync.WaitGroup + + openedAt time.Time + + // sealReq is a coalesced "seal the open segment now" channel. + // AppendBatch sends after exceeding SealBytes; the seal-ticker + // sends on every tick if the open segment has been open longer + // than SealAge. + sealReq chan struct{} +} + +// Open initializes a Store: scans Dir, reconciles with cfg.Meta, +// re-enqueues unflushed segments for the flusher, force-seals any +// previously-open segment, and starts a fresh open segment ready to +// accept appends. +func Open(ctx context.Context, cfg Config) (*Store, error) { + if err := cfg.validate(); err != nil { + return nil, err + } + cfg.defaults() + + if err := os.MkdirAll(cfg.Dir, 0o755); err != nil { + return nil, fmt.Errorf("logstore: mkdir %s: %w", cfg.Dir, err) + } + + s := &Store{ + cfg: cfg, + logger: cfg.Logger, + flushQ: make(chan *Segment, 64), + closing: make(chan struct{}), + sealReq: make(chan struct{}, 1), + } + + if err := s.recover(ctx); err != nil { + return nil, err + } + + // Force-seal a recovered open segment (if any) so a fresh open is + // always brand-new on each process startup. This avoids the + // complications of resuming append into a partially-written file. + if s.open != nil { + if err := s.open.seal(ctx, cfg.Meta); err != nil { + return nil, fmt.Errorf("logstore: force-seal recovered open segment: %w", err) + } + s.sealed = append([]*Segment{s.open}, s.sealed...) 
+ select { + case s.flushQ <- s.open: + default: + s.logger.Warn("logstore: flush queue full at recovery; segment will retry on next tick") + } + s.open = nil + } + + s.wg.Add(2) + go s.flushLoop() + go s.sealTickerLoop() + + return s, nil +} + +// AppendBatch persists `blocks` to the open segment along with an +// op-root record identifying the (bucket, root) this batch's S3 +// op produced. fsyncs CAR + ops sidecar before returning. After +// AppendBatch returns nil, both blocks and op-root are durable; the +// caller may safely advance the bucket's published Root. +// +// An empty blocks slice is legal — an MST mutation can produce a +// new root that points at a node already materialized in a prior +// segment (e.g., trimTop after Delete unwraps to an existing +// subtree). In that case only the OpRoot record is written; +// nothing new lands in the CAR. +func (s *Store) AppendBatch(ctx context.Context, blocks []block.Block, opRoot blockstore.OpRoot) error { + if !opRoot.Root.Defined() { + return errors.New("logstore: AppendBatch: opRoot.Root must be defined") + } + + s.appMu.Lock() + defer s.appMu.Unlock() + + open, err := s.ensureOpenLockedAppMu(ctx) + if err != nil { + return err + } + if err := open.append(blocks, opRoot); err != nil { + return err + } + + // Trigger seal if size threshold hit. Non-blocking signal — the + // actual seal happens off this goroutine to keep AppendBatch + // latency bounded by fsync. + if open.Size() >= s.cfg.SealBytes { + s.requestSeal() + } + return nil +} + +// Get returns the block from the local log if any segment contains +// it, or ErrNotFound otherwise. Searches open first, then sealed +// newest-first. 
+func (s *Store) Get(ctx context.Context, c cid.Cid) (block.Block, error) { + s.catMu.RLock() + open := s.open + sealed := make([]*Segment, len(s.sealed)) + copy(sealed, s.sealed) + s.catMu.RUnlock() + + if open != nil { + if blk, err := open.get(ctx, c); err == nil { + return blk, nil + } else if !errors.Is(err, blockstore.ErrNotFound) { + return nil, err + } + } + for _, seg := range sealed { + blk, err := seg.get(ctx, c) + if err == nil { + return blk, nil + } + if !errors.Is(err, blockstore.ErrNotFound) { + return nil, err + } + } + return nil, blockstore.ErrNotFound +} + +// Close seals the open segment, drains the flush queue, and stops +// background goroutines. Safe to call once. +func (s *Store) Close(ctx context.Context) error { + s.catMu.Lock() + already := s.closing == nil + if !already { + select { + case <-s.closing: + already = true + default: + } + } + if !already { + close(s.closing) + } + s.catMu.Unlock() + if already { + return nil + } + + // Force-seal the open segment so anything still buffered makes it + // into the flush queue. + s.appMu.Lock() + s.catMu.Lock() + open := s.open + s.open = nil + s.catMu.Unlock() + if open != nil { + if err := open.seal(ctx, s.cfg.Meta); err != nil { + s.logger.Error("logstore: seal at close", zap.Error(err)) + } else { + s.catMu.Lock() + s.sealed = append([]*Segment{open}, s.sealed...) + s.catMu.Unlock() + select { + case s.flushQ <- open: + case <-ctx.Done(): + } + } + } + s.appMu.Unlock() + + close(s.flushQ) + s.wg.Wait() + return nil +} + +// requestSeal coalesces seal triggers — the channel has buffer 1 so +// repeated triggers between two ticks of the seal goroutine are +// folded into one. +func (s *Store) requestSeal() { + select { + case s.sealReq <- struct{}{}: + default: + } +} + +// ensureOpenLockedAppMu returns the current open segment, creating a +// fresh one if none exists. Caller must hold appMu (so concurrent +// AppendBatches don't race on segment creation). 
+func (s *Store) ensureOpenLockedAppMu(ctx context.Context) (*Segment, error) { + s.catMu.RLock() + open := s.open + s.catMu.RUnlock() + if open != nil { + return open, nil + } + + seq, err := s.cfg.Meta.NextSegmentSeq(ctx) + if err != nil { + return nil, err + } + seg, err := createOpenSegment(ctx, s.cfg.Dir, seq, s.cfg.Meta, s.logger) + if err != nil { + return nil, err + } + + s.catMu.Lock() + if s.open == nil { + s.open = seg + s.openedAt = time.Now() + if seq >= s.nextSeq { + s.nextSeq = seq + 1 + } + s.catMu.Unlock() + return seg, nil + } + // Lost a race; another caller created an open segment first. + s.catMu.Unlock() + if err := seg.retire(); err != nil { + s.logger.Warn("logstore: retire raced new segment", zap.Error(err)) + } + if err := s.cfg.Meta.DeleteSegment(ctx, seq); err != nil { + s.logger.Warn("logstore: delete raced new segment row", zap.Error(err)) + } + s.catMu.RLock() + open = s.open + s.catMu.RUnlock() + return open, nil +} + +// sealOpenIfDue seals the current open segment if one exists. Sends +// to flushQ. Idempotent: returns nil if there's nothing to seal. +func (s *Store) sealOpenIfDue(ctx context.Context, force bool) error { + s.appMu.Lock() + defer s.appMu.Unlock() + + s.catMu.RLock() + open := s.open + openedAt := s.openedAt + s.catMu.RUnlock() + if open == nil { + return nil + } + if !force { + if open.Size() < s.cfg.SealBytes && time.Since(openedAt) < s.cfg.SealAge { + return nil + } + } + + if err := open.seal(ctx, s.cfg.Meta); err != nil { + return err + } + + s.catMu.Lock() + if s.open == open { + s.open = nil + s.sealed = append([]*Segment{open}, s.sealed...) + } + s.catMu.Unlock() + + select { + case s.flushQ <- open: + case <-s.closing: + return nil + } + return nil +} + +// flushLoop drains flushQ, calling cfg.Flush for each sealed segment. +// On success, transitions the segment to StateFlushed and runs the +// retention sweep. 
On failure, requeues with backoff so transient +// errors (network blips) don't permanently stall the pipeline. +// +// Exits when either the closing signal fires or flushQ is closed +// (whichever comes first). +func (s *Store) flushLoop() { + defer s.wg.Done() + for { + select { + case <-s.closing: + return + case seg, ok := <-s.flushQ: + if !ok { + return + } + s.flushOne(seg) + } + } +} + +func (s *Store) flushOne(seg *Segment) { + ctx := context.Background() + const maxAttempts = 5 + backoff := time.Second + + for attempt := 1; attempt <= maxAttempts; attempt++ { + err := s.cfg.Flush(ctx, seg) + if err == nil { + seg.stateMu.Lock() + seg.state = StateFlushed + seg.stateMu.Unlock() + s.runRetention(ctx) + return + } + s.logger.Warn("logstore: flush attempt failed", + zap.Uint64("seq", seg.Seq()), + zap.Int("attempt", attempt), + zap.Error(err)) + select { + case <-s.closing: + return + case <-time.After(backoff): + } + if backoff < 30*time.Second { + backoff *= 2 + } + } + s.logger.Error("logstore: flush exhausted retries; segment remains sealed", + zap.Uint64("seq", seg.Seq())) + // Leaving the segment in sealed state; recovery will pick it up + // at next process restart, or operators can intervene. +} + +// runRetention removes flushed segments older than cfg.Retain from +// disk and the catalog. +func (s *Store) runRetention(ctx context.Context) { + s.catMu.Lock() + // Walk newest-first, count flushed segments. Once we exceed + // Retain flushed segments, the rest are retire candidates. 
+ var ( + flushedSeen int + keep []*Segment + retire []*Segment + ) + for _, seg := range s.sealed { + if seg.State() != StateFlushed { + keep = append(keep, seg) + continue + } + flushedSeen++ + if flushedSeen <= s.cfg.Retain { + keep = append(keep, seg) + continue + } + retire = append(retire, seg) + } + s.sealed = keep + s.catMu.Unlock() + + for _, seg := range retire { + if err := seg.retire(); err != nil { + s.logger.Warn("logstore: retire", zap.Uint64("seq", seg.Seq()), zap.Error(err)) + } + if err := s.cfg.Meta.DeleteSegment(ctx, seg.Seq()); err != nil { + s.logger.Warn("logstore: delete segment row", + zap.Uint64("seq", seg.Seq()), zap.Error(err)) + } + } +} + +// sealTickerLoop wakes periodically (every SealAge / 4) and seals +// the open segment if it has been open longer than SealAge or its +// size is over SealBytes (the latter is also signaled directly via +// requestSeal but we double-check defensively). +func (s *Store) sealTickerLoop() { + defer s.wg.Done() + interval := s.cfg.SealAge / 4 + if interval < 100*time.Millisecond { + interval = 100 * time.Millisecond + } + t := time.NewTicker(interval) + defer t.Stop() + for { + select { + case <-s.closing: + return + case <-t.C: + if err := s.sealOpenIfDue(context.Background(), false); err != nil { + s.logger.Warn("logstore: tick seal", zap.Error(err)) + } + case <-s.sealReq: + if err := s.sealOpenIfDue(context.Background(), false); err != nil { + s.logger.Warn("logstore: req seal", zap.Error(err)) + } + } + } +} diff --git a/pkg/ms3t/logstore/store_test.go b/pkg/ms3t/logstore/store_test.go new file mode 100644 index 0000000..4769208 --- /dev/null +++ b/pkg/ms3t/logstore/store_test.go @@ -0,0 +1,501 @@ +package logstore + +import ( + "context" + "errors" + "fmt" + "path/filepath" + "sync" + "sync/atomic" + "testing" + "time" + + block "github.com/ipfs/go-block-format" + "github.com/ipfs/go-cid" + "github.com/multiformats/go-multihash" + "go.uber.org/zap/zaptest" + + 
"github.com/storacha/sprue/pkg/ms3t/blockstore" +) + +// fakeMeta is an in-memory Meta implementation for tests. It keeps +// just enough state to exercise the segment lifecycle without +// touching Postgres. +type fakeMeta struct { + mu sync.Mutex + nextSeq uint64 + segments map[uint64]*SegmentMeta + flushed []uint64 // order of MarkSegmentFlushed calls +} + +func newFakeMeta() *fakeMeta { + return &fakeMeta{segments: map[uint64]*SegmentMeta{}} +} + +func (f *fakeMeta) NextSegmentSeq(_ context.Context) (uint64, error) { + f.mu.Lock() + defer f.mu.Unlock() + f.nextSeq++ + return f.nextSeq, nil +} + +func (f *fakeMeta) InsertSegmentOpen(_ context.Context, seq uint64) error { + f.mu.Lock() + defer f.mu.Unlock() + if _, ok := f.segments[seq]; ok { + return nil + } + f.segments[seq] = &SegmentMeta{Seq: seq, State: StateOpen} + return nil +} + +func (f *fakeMeta) MarkSegmentSealed(_ context.Context, seq uint64, sealedAt int64, sizeBytes int64, sha256 []byte, opRoots []blockstore.OpRoot) error { + f.mu.Lock() + defer f.mu.Unlock() + m, ok := f.segments[seq] + if !ok { + return fmt.Errorf("fake: seal missing seq %d", seq) + } + if m.State != StateOpen { + // idempotent + return nil + } + m.State = StateSealed + m.SealedAt = sealedAt + m.SizeBytes = sizeBytes + m.SHA256 = append([]byte(nil), sha256...) + m.OpRoots = append([]blockstore.OpRoot(nil), opRoots...) + return nil +} + +func (f *fakeMeta) MarkSegmentFlushed(_ context.Context, seq uint64, flushedAt int64, opRoots []blockstore.OpRoot) error { + f.mu.Lock() + defer f.mu.Unlock() + m, ok := f.segments[seq] + if !ok { + return fmt.Errorf("fake: flush missing seq %d", seq) + } + if m.State == StateFlushed { + return nil + } + m.State = StateFlushed + m.FlushedAt = flushedAt + if len(opRoots) > 0 { + m.OpRoots = append([]blockstore.OpRoot(nil), opRoots...) 
+ } + f.flushed = append(f.flushed, seq) + return nil +} + +func (f *fakeMeta) DeleteSegment(_ context.Context, seq uint64) error { + f.mu.Lock() + defer f.mu.Unlock() + delete(f.segments, seq) + return nil +} + +func (f *fakeMeta) ListUnflushedSegments(_ context.Context) ([]SegmentMeta, error) { + f.mu.Lock() + defer f.mu.Unlock() + var out []SegmentMeta + for _, m := range f.segments { + if m.State == StateOpen || m.State == StateSealed { + out = append(out, *m) + } + } + return out, nil +} + +func (f *fakeMeta) RehydrateSegment(_ context.Context, m SegmentMeta) error { + f.mu.Lock() + defer f.mu.Unlock() + cp := m + f.segments[m.Seq] = &cp + return nil +} + +func (f *fakeMeta) snapshot(seq uint64) (SegmentMeta, bool) { + f.mu.Lock() + defer f.mu.Unlock() + m, ok := f.segments[seq] + if !ok { + return SegmentMeta{}, false + } + return *m, true +} + +// makeBlock returns a raw-codec block whose CID is the sha256 of +// payload. We construct the CID explicitly rather than relying on +// block.NewBlock because the latter uses a v0 CID we don't want. +func makeBlock(t *testing.T, payload []byte) block.Block { + t.Helper() + mh, err := multihash.Sum(payload, multihash.SHA2_256, -1) + if err != nil { + t.Fatalf("multihash: %v", err) + } + c := cid.NewCidV1(cid.Raw, mh) + blk, err := block.NewBlockWithCid(payload, c) + if err != nil { + t.Fatalf("block: %v", err) + } + return blk +} + +// makeRoot returns a deterministic CID derived from name; used as +// the OpRoot.Root in tests. 
+func makeRoot(t *testing.T, name string) cid.Cid { + t.Helper() + mh, err := multihash.Sum([]byte("root:"+name), multihash.SHA2_256, -1) + if err != nil { + t.Fatalf("mh: %v", err) + } + return cid.NewCidV1(cid.DagCBOR, mh) +} + +func newTestStore(t *testing.T, sealBytes int64, sealAge time.Duration, retain int) (*Store, *fakeMeta, *atomicCounter) { + t.Helper() + dir := t.TempDir() + meta := newFakeMeta() + flushCalls := &atomicCounter{} + logger := zaptest.NewLogger(t) + cfg := Config{ + Dir: dir, + Meta: meta, + SealBytes: sealBytes, + SealAge: sealAge, + Retain: retain, + Flush: func(ctx context.Context, seg *Segment) error { + flushCalls.add(1) + return meta.MarkSegmentFlushed(ctx, seg.Seq(), time.Now().Unix(), seg.OpRoots()) + }, + Logger: logger, + } + s, err := Open(context.Background(), cfg) + if err != nil { + t.Fatalf("Open: %v", err) + } + t.Cleanup(func() { _ = s.Close(context.Background()) }) + return s, meta, flushCalls +} + +type atomicCounter struct{ n int64 } + +func (a *atomicCounter) add(n int64) { atomic.AddInt64(&a.n, n) } +func (a *atomicCounter) load() int64 { return atomic.LoadInt64(&a.n) } + +func TestAppendThenGetSameProcess(t *testing.T) { + s, _, _ := newTestStore(t, 64<<20, 5*time.Second, 6) + + blk := makeBlock(t, []byte("hello world")) + root := makeRoot(t, "alpha") + if err := s.AppendBatch(context.Background(), []block.Block{blk}, blockstore.OpRoot{Bucket: "bk", Root: root}); err != nil { + t.Fatalf("AppendBatch: %v", err) + } + + got, err := s.Get(context.Background(), blk.Cid()) + if err != nil { + t.Fatalf("Get: %v", err) + } + if string(got.RawData()) != "hello world" { + t.Fatalf("got %q want %q", got.RawData(), "hello world") + } +} + +func TestSealBySize(t *testing.T) { + s, meta, flushes := newTestStore(t, 256, 50*time.Millisecond, 6) + + // Each block carries 100 bytes of payload; after a few writes the + // segment crosses the 256-byte threshold and seals. 
+ payload := make([]byte, 100) + for i := range payload { + payload[i] = byte(i) + } + for i := 0; i < 6; i++ { + blk := makeBlock(t, append([]byte(fmt.Sprintf("rec-%02d-", i)), payload...)) + if err := s.AppendBatch(context.Background(), []block.Block{blk}, blockstore.OpRoot{ + Bucket: "bk", + Root: makeRoot(t, fmt.Sprintf("size-%d", i)), + }); err != nil { + t.Fatalf("append %d: %v", i, err) + } + } + + // Wait for at least one flush. + deadline := time.Now().Add(2 * time.Second) + for time.Now().Before(deadline) { + if flushes.load() > 0 { + break + } + time.Sleep(10 * time.Millisecond) + } + if flushes.load() == 0 { + t.Fatalf("expected at least one flush after size threshold; got 0") + } + + // At least one segment row should now be flushed. + meta.mu.Lock() + var flushed int + for _, m := range meta.segments { + if m.State == StateFlushed { + flushed++ + } + } + meta.mu.Unlock() + if flushed == 0 { + t.Fatalf("expected at least one segment in flushed state") + } +} + +func TestSealByAge(t *testing.T) { + s, _, flushes := newTestStore(t, 1<<30, 80*time.Millisecond, 6) + + blk := makeBlock(t, []byte("age-trigger")) + if err := s.AppendBatch(context.Background(), []block.Block{blk}, blockstore.OpRoot{ + Bucket: "bk", + Root: makeRoot(t, "age"), + }); err != nil { + t.Fatalf("append: %v", err) + } + + deadline := time.Now().Add(2 * time.Second) + for time.Now().Before(deadline) { + if flushes.load() > 0 { + break + } + time.Sleep(20 * time.Millisecond) + } + if flushes.load() == 0 { + t.Fatalf("expected age-triggered seal to produce a flush") + } +} + +func TestRetentionDropsOldFlushed(t *testing.T) { + s, _, _ := newTestStore(t, 64, 50*time.Millisecond, 2) + dir := s.cfg.Dir + + // Issue 5 PUTs; each one large enough to exceed SealBytes=64 in + // a single batch, so each becomes its own segment. 
+ for i := 0; i < 5; i++ { + payload := make([]byte, 80) + for j := range payload { + payload[j] = byte(i) + } + blk := makeBlock(t, append([]byte(fmt.Sprintf("retain-%02d-", i)), payload...)) + if err := s.AppendBatch(context.Background(), []block.Block{blk}, blockstore.OpRoot{ + Bucket: "bk", + Root: makeRoot(t, fmt.Sprintf("ret-%d", i)), + }); err != nil { + t.Fatalf("append %d: %v", i, err) + } + } + + // Wait for retention to converge. + deadline := time.Now().Add(3 * time.Second) + for time.Now().Before(deadline) { + entries, err := readSegmentSeqs(dir) + if err != nil { + t.Fatalf("readDir: %v", err) + } + // 1 active open + 2 retained + if len(entries) <= 3 { + break + } + time.Sleep(50 * time.Millisecond) + } + + entries, err := readSegmentSeqs(dir) + if err != nil { + t.Fatalf("readDir: %v", err) + } + if len(entries) > 3 { + t.Fatalf("retain=2 should leave at most 3 .car files (open + retained); got %d (%v)", + len(entries), entries) + } +} + +func TestForceSealRecoveredOpenOnRestart(t *testing.T) { + dir := t.TempDir() + meta := newFakeMeta() + logger := zaptest.NewLogger(t) + openStore := func() *Store { + cfg := Config{ + Dir: dir, + Meta: meta, + SealBytes: 1 << 30, // never seals on size during this test + SealAge: 1 * time.Hour, + Retain: 6, + Flush: func(ctx context.Context, seg *Segment) error { + return meta.MarkSegmentFlushed(ctx, seg.Seq(), time.Now().Unix(), seg.OpRoots()) + }, + Logger: logger, + } + s, err := Open(context.Background(), cfg) + if err != nil { + t.Fatalf("Open: %v", err) + } + return s + } + + s := openStore() + blk := makeBlock(t, []byte("survives-restart")) + if err := s.AppendBatch(context.Background(), []block.Block{blk}, blockstore.OpRoot{ + Bucket: "bk", + Root: makeRoot(t, "survive"), + }); err != nil { + t.Fatalf("append: %v", err) + } + + // Simulate process exit without orderly Close (don't seal). Close + // the file descriptors via a panic-safe path: we just stop the + // goroutines and forget the in-memory state. 
+ close(s.closing) + s.wg.Wait() + // Drop the in-memory ref; on disk the segment is still open. + + // Re-Open from the same dir. + s2 := openStore() + t.Cleanup(func() { _ = s2.Close(context.Background()) }) + + // The previously-open segment should have been force-sealed on + // startup; the write must still be readable. + got, err := s2.Get(context.Background(), blk.Cid()) + if err != nil { + t.Fatalf("Get after restart: %v", err) + } + if string(got.RawData()) != "survives-restart" { + t.Fatalf("got %q", got.RawData()) + } +} + +func TestAppendBatchEmptyBlocksAccepted(t *testing.T) { + s, _, _ := newTestStore(t, 64<<20, 5*time.Second, 6) + root := makeRoot(t, "x") + if err := s.AppendBatch(context.Background(), nil, blockstore.OpRoot{Bucket: "bk", Root: root}); err != nil { + t.Fatalf("empty blocks with defined root should succeed, got %v", err) + } + if err := s.AppendBatch(context.Background(), []block.Block{makeBlock(t, []byte("x"))}, blockstore.OpRoot{Bucket: "bk"}); err == nil { + t.Fatalf("expected error on undefined root") + } +} + +func TestGetMissReturnsErrNotFound(t *testing.T) { + s, _, _ := newTestStore(t, 64<<20, 5*time.Second, 6) + + want, err := makeRoot(t, "absent"), error(nil) + _, err = s.Get(context.Background(), want) + if !errors.Is(err, blockstore.ErrNotFound) { + t.Fatalf("expected ErrNotFound, got %v", err) + } +} + +// TestAppendBatchDedupesAcrossOps confirms that a CID written in +// one AppendBatch is filtered out of a later AppendBatch landing in +// the same open segment: the file grows by one frame's worth of +// bytes, not two. +func TestAppendBatchDedupesAcrossOps(t *testing.T) { + s, _, _ := newTestStore(t, 64<<20, 1*time.Hour, 6) + + shared := makeBlock(t, []byte("shared block bytes")) + uniqA := makeBlock(t, []byte("unique-A")) + uniqB := makeBlock(t, []byte("unique-B")) + + // First batch: shared + uniqA. 
+ if err := s.AppendBatch(context.Background(), + []block.Block{shared, uniqA}, + blockstore.OpRoot{Bucket: "bk", Root: makeRoot(t, "op-a")}, + ); err != nil { + t.Fatalf("append A: %v", err) + } + + // Snapshot the open segment's size after the first append. + s.catMu.RLock() + sizeAfterA := s.open.Size() + s.catMu.RUnlock() + + // Second batch: shared (duplicate of first batch) + uniqB. + if err := s.AppendBatch(context.Background(), + []block.Block{shared, uniqB}, + blockstore.OpRoot{Bucket: "bk", Root: makeRoot(t, "op-b")}, + ); err != nil { + t.Fatalf("append B: %v", err) + } + + s.catMu.RLock() + sizeAfterB := s.open.Size() + s.catMu.RUnlock() + + // Frame for `shared` is one varint(len) + cid + payload. Whatever + // that totals, the second batch should NOT have re-written it. + // `uniqA` has ~the same payload size as `uniqB`, so growth-from-A + // and growth-from-B (had we written `shared` twice) would be + // nearly identical. Instead we expect growth-from-B ≈ uniqB-frame + // only. Simplest assertion: only one frame's worth of growth. + growthB := sizeAfterB - sizeAfterA + growthFirstBatch := sizeAfterA // includes header + 2 frames; can't isolate + + if growthB >= growthFirstBatch { + t.Fatalf("second batch grew %d bytes, expected ~half of first-batch growth (%d) since shared was deduped", + growthB, growthFirstBatch) + } + + // All three blocks must be readable. + for _, blk := range []block.Block{shared, uniqA, uniqB} { + got, err := s.Get(context.Background(), blk.Cid()) + if err != nil { + t.Fatalf("Get %s: %v", blk.Cid(), err) + } + if string(got.RawData()) != string(blk.RawData()) { + t.Fatalf("Get %s payload mismatch", blk.Cid()) + } + } +} + +// TestAppendBatchAllDuplicatesStillRecordsOpRoot covers the edge +// case where every block in a batch is a duplicate of bytes already +// in the segment. The CAR file shouldn't grow — but the op-root +// still has to persist so the bucket's forge_root_cid catches up +// when the segment ships. 
+func TestAppendBatchAllDuplicatesStillRecordsOpRoot(t *testing.T) { + s, _, _ := newTestStore(t, 64<<20, 1*time.Hour, 6) + + blk := makeBlock(t, []byte("only-block")) + + if err := s.AppendBatch(context.Background(), []block.Block{blk}, blockstore.OpRoot{ + Bucket: "bk", Root: makeRoot(t, "first"), + }); err != nil { + t.Fatalf("first append: %v", err) + } + s.catMu.RLock() + sizeBefore := s.open.Size() + opRootsBefore := len(s.open.OpRoots()) + s.catMu.RUnlock() + + if err := s.AppendBatch(context.Background(), []block.Block{blk}, blockstore.OpRoot{ + Bucket: "bk", Root: makeRoot(t, "second"), + }); err != nil { + t.Fatalf("dup append: %v", err) + } + s.catMu.RLock() + sizeAfter := s.open.Size() + opRootsAfter := len(s.open.OpRoots()) + s.catMu.RUnlock() + + if sizeAfter != sizeBefore { + t.Fatalf("all-duplicate batch grew CAR by %d bytes; expected 0", sizeAfter-sizeBefore) + } + if opRootsAfter != opRootsBefore+1 { + t.Fatalf("op-root count went %d→%d; expected +1", opRootsBefore, opRootsAfter) + } +} + +func readSegmentSeqs(dir string) ([]string, error) { + matches, err := filepath.Glob(filepath.Join(dir, "seg-*.car")) + if err != nil { + return nil, err + } + out := make([]string, 0, len(matches)) + for _, m := range matches { + out = append(out, filepath.Base(m)) + } + return out, nil +} diff --git a/pkg/ms3t/logstore/types.go b/pkg/ms3t/logstore/types.go new file mode 100644 index 0000000..fe67d0b --- /dev/null +++ b/pkg/ms3t/logstore/types.go @@ -0,0 +1,108 @@ +package logstore + +import ( + "context" + + "github.com/storacha/sprue/pkg/ms3t/blockstore" +) + +// State describes the lifecycle stage of a segment as observed at the +// catalog/Postgres level. The on-disk MANIFEST may briefly lag the +// in-memory state but recovery reconciles the two. +type State int + +const ( + // StateOpen means the segment is the current append target. Exactly + // one segment is in this state at a time. 
+ StateOpen State = iota + // StateSealed means the segment is closed for writes and waiting to + // be (or being) shipped to Forge. + StateSealed + // StateFlushed means the segment has been successfully shipped to + // Forge and the per-bucket forge_root advances were applied. The + // segment may still be on disk, kept around as a read tier. + StateFlushed +) + +// String renders State for logs. +func (s State) String() string { + switch s { + case StateOpen: + return "open" + case StateSealed: + return "sealed" + case StateFlushed: + return "flushed" + default: + return "unknown" + } +} + +// ParseState is the inverse of State.String. Unknown strings yield +// StateOpen and ok=false, matching what we want at the SQL boundary. +func ParseState(s string) (State, bool) { + switch s { + case "open": + return StateOpen, true + case "sealed": + return StateSealed, true + case "flushed": + return StateFlushed, true + default: + return StateOpen, false + } +} + +// SegmentMeta is the persistence-layer view of a segment. Used by +// recovery to enumerate segments that need attention. +type SegmentMeta struct { + Seq uint64 + State State + SealedAt int64 + FlushedAt int64 + SizeBytes int64 + SHA256 []byte + OpRoots []blockstore.OpRoot +} + +// Meta is the persistence backing for the segment lifecycle. The +// production implementation is *registry.Postgres; tests use an +// in-memory fake. Logstore never touches SQL directly. +type Meta interface { + // NextSegmentSeq returns a fresh monotonic segment id. + NextSegmentSeq(ctx context.Context) (uint64, error) + + // InsertSegmentOpen records that segment seq has just been opened. + // Idempotent: if the row already exists in any state it is left + // alone. + InsertSegmentOpen(ctx context.Context, seq uint64) error + + // MarkSegmentSealed transitions a segment from open to sealed in + // one transaction: updates ms3t.segments and inserts the + // per-segment op-root rows. 
opRoots are applied in slice order + // (each gets seq_within = i). + MarkSegmentSealed(ctx context.Context, seq uint64, sealedAt int64, sizeBytes int64, sha256 []byte, opRoots []blockstore.OpRoot) error + + // MarkSegmentFlushed transitions a segment from sealed to flushed + // AND advances forge_root_cid in ms3t.buckets for every op-root + // recorded against this segment, all in one transaction. opRoots + // is the in-order list from MarkSegmentSealed; the registry uses + // it directly so callers can treat the sidecar as the source of + // truth. + MarkSegmentFlushed(ctx context.Context, seq uint64, flushedAt int64, opRoots []blockstore.OpRoot) error + + // DeleteSegment removes a segment row (cascades to op-root rows). + // Used by retention after the on-disk file is unlinked. + DeleteSegment(ctx context.Context, seq uint64) error + + // ListUnflushedSegments returns every segment whose state is open + // or sealed, ordered by seq ascending. Recovery uses this to + // re-enqueue work for the flusher and to verify on-disk vs DB + // state. + ListUnflushedSegments(ctx context.Context) ([]SegmentMeta, error) + + // RehydrateSegment writes a segment row + its op-root rows from a + // sidecar `.idx` when the DB row is missing or torn. Idempotent + // on (seq) — replaces any existing rows for that segment. + RehydrateSegment(ctx context.Context, m SegmentMeta) error +} diff --git a/pkg/ms3t/migrations/migrations.go b/pkg/ms3t/migrations/migrations.go new file mode 100644 index 0000000..91a6b9a --- /dev/null +++ b/pkg/ms3t/migrations/migrations.go @@ -0,0 +1,64 @@ +// Package migrations embeds the ms3t Postgres migrations and exposes +// a runner that applies them via goose against a caller-provided +// *pgxpool.Pool. +// +// All ms3t tables live in the `ms3t` schema and goose tracks them in +// ms3t.goose_db_version, so this package can run against the same +// database as sprue's internal/migrations without colliding. 
+package migrations + +import ( + "context" + "fmt" + + "github.com/jackc/pgx/v5/pgxpool" + "github.com/jackc/pgx/v5/stdlib" + "github.com/pressly/goose/v3" + "go.uber.org/zap" + + "embed" +) + +//go:embed sql/*.sql +var FS embed.FS + +const ( + schemaName = "ms3t" + gooseVersionName = schemaName + ".goose_db_version" +) + +// Up applies all pending migrations embedded in FS to the database +// behind pool. The ms3t schema is created if it does not already +// exist, then goose is configured to track its version in +// ms3t.goose_db_version. +func Up(ctx context.Context, pool *pgxpool.Pool, logger *zap.Logger) error { + if _, err := pool.Exec(ctx, "CREATE SCHEMA IF NOT EXISTS "+schemaName); err != nil { + return fmt.Errorf("ms3t migrations: ensure schema: %w", err) + } + + db := stdlib.OpenDBFromPool(pool) + defer db.Close() + + goose.SetBaseFS(FS) + goose.SetLogger(&zapGooseLogger{logger: logger}) + goose.SetTableName(gooseVersionName) + if err := goose.SetDialect("postgres"); err != nil { + return fmt.Errorf("ms3t migrations: set dialect: %w", err) + } + if err := goose.UpContext(ctx, db, "sql"); err != nil { + return fmt.Errorf("ms3t migrations: up: %w", err) + } + return nil +} + +type zapGooseLogger struct { + logger *zap.Logger +} + +func (l *zapGooseLogger) Fatalf(format string, v ...interface{}) { + l.logger.Sugar().Fatalf(format, v...) +} + +func (l *zapGooseLogger) Printf(format string, v ...interface{}) { + l.logger.Sugar().Infof(format, v...) +} diff --git a/pkg/ms3t/migrations/sql/00001_init.sql b/pkg/ms3t/migrations/sql/00001_init.sql new file mode 100644 index 0000000..f8c185d --- /dev/null +++ b/pkg/ms3t/migrations/sql/00001_init.sql @@ -0,0 +1,20 @@ +-- +goose Up +-- ms3t bucket registry. Mirrors the columns of the previous SQLite +-- schema (pkg/ms3t/registry/sqlite.go's `buckets` table) but in the +-- `ms3t` schema so the same Postgres database can host both sprue's +-- and ms3t's tables without collision. 
+-- +-- name — S3 bucket name (PK) +-- root_cid — current MST root CID, bytes form; NULL for empty bucket +-- forge_root_cid — last MST root whose DAG has been shipped to Forge +-- created_at — unix seconds at create time + +CREATE TABLE ms3t.buckets ( + name TEXT PRIMARY KEY, + root_cid BYTEA, + forge_root_cid BYTEA, + created_at BIGINT NOT NULL +); + +-- +goose Down +DROP TABLE ms3t.buckets; diff --git a/pkg/ms3t/migrations/sql/00002_segments.sql b/pkg/ms3t/migrations/sql/00002_segments.sql new file mode 100644 index 0000000..ccd82dc --- /dev/null +++ b/pkg/ms3t/migrations/sql/00002_segments.sql @@ -0,0 +1,49 @@ +-- +goose Up +-- ms3t log segments (LSM-style write log) and the per-segment +-- record of bucket-root advances that landed in each segment. +-- +-- segments +-- seq — monotonic segment id (matches the on-disk filename +-- stem `seg-<seq>.car`) +-- state — one of 'open', 'sealed', 'flushed' +-- sealed_at — unix seconds when seal was completed; NULL while open +-- flushed_at — unix seconds when the Forge ship completed; NULL otherwise +-- size_bytes — final size of the CAR file at seal +-- car_sha256 — sha256 of the CAR file at seal (used to detect torn +-- sidecars during recovery) +-- +-- segment_op_roots +-- seq, seq_within — composite ordering of S3 ops within a segment +-- bucket — the bucket whose root advanced for this op +-- root_cid — the new MST root the op produced +-- +-- The on-disk `seg-<seq>.idx` sidecar is the source of truth at +-- recovery time; these tables are rehydrated from sidecars when rows +-- are missing. The flusher uses `segment_op_roots` (joined with +-- `segments.state = 'flushed'`) to advance per-bucket forge_root_cid +-- in `ms3t.buckets` atomically with the state transition. 
+ +CREATE SEQUENCE ms3t.segment_seq; + +CREATE TABLE ms3t.segments ( + seq BIGINT PRIMARY KEY, + state TEXT NOT NULL CHECK (state IN ('open', 'sealed', 'flushed')), + sealed_at BIGINT, + flushed_at BIGINT, + size_bytes BIGINT NOT NULL DEFAULT 0, + car_sha256 BYTEA +); + +CREATE TABLE ms3t.segment_op_roots ( + seq BIGINT NOT NULL REFERENCES ms3t.segments(seq) ON DELETE CASCADE, + seq_within INT NOT NULL, + bucket TEXT NOT NULL, + root_cid BYTEA NOT NULL, + PRIMARY KEY (seq, seq_within) +); +CREATE INDEX segment_op_roots_bucket_seq_idx ON ms3t.segment_op_roots (bucket, seq); + +-- +goose Down +DROP TABLE ms3t.segment_op_roots; +DROP TABLE ms3t.segments; +DROP SEQUENCE ms3t.segment_seq; diff --git a/pkg/ms3t/module.go b/pkg/ms3t/module.go new file mode 100644 index 0000000..ce5e0fa --- /dev/null +++ b/pkg/ms3t/module.go @@ -0,0 +1,190 @@ +// Package ms3t exposes the embedded S3 listener as both a low-level +// Server type (see server.go) and an fx module (see Module). +// +// The S3 protocol layer is provided by github.com/versity/versitygw; +// the storage backend is the LSM-style log in pkg/ms3t/logstore in +// front of a Forge-backed read tier, with versitygw → logstore +// translation in pkg/ms3t/s3frontend. +// +// pkg/ms3t depends on a single external storage type for production +// wiring: *pgxpool.Pool. Callers are responsible for constructing +// the pool (typically via sprue's internal/fx/store/postgres). The +// module runs its own goose migrations (pkg/ms3t/migrations) against +// the pool at startup so the outer wiring does not need to know +// about ms3t's schema. +// +// When config.MS3T.Enabled is false the module is a no-op, so it is +// safe to always include it in the app graph. 
+package ms3t + +import ( + "context" + "fmt" + "os" + "path/filepath" + "time" + + "github.com/jackc/pgx/v5/pgxpool" + "github.com/storacha/go-ucanto/did" + "go.uber.org/fx" + "go.uber.org/zap" + + "github.com/storacha/sprue/internal/config" + "github.com/storacha/sprue/pkg/identity" + "github.com/storacha/sprue/pkg/indexerclient" + "github.com/storacha/sprue/pkg/ms3t/blockstore" + "github.com/storacha/sprue/pkg/ms3t/migrations" + "github.com/storacha/sprue/pkg/ms3t/registry" + "github.com/storacha/sprue/pkg/ms3t/uploader" + "github.com/storacha/sprue/pkg/piriclient" + "github.com/storacha/sprue/pkg/routing" +) + +// Module registers the embedded ms3t S3 listener. When +// config.MS3T.Enabled is false the module is a no-op, so it's safe +// to always include in the app graph. +var Module = fx.Module("ms3t", + fx.Invoke(registerLifecycle), +) + +// FxDeps bundles the sprue-internal services ms3t pulls in from the +// fx graph in production. Pool is the only storage dependency: ms3t +// owns its own schema (under the ms3t Postgres schema) and runs its +// own migrations. +// +// Pool is marked optional in the fx graph because storage backends +// other than postgres (memory, aws) do not provide one. ms3t is +// opt-in; when ms3t.enabled is true, registerLifecycle returns a +// fail-fast error if Pool is nil. +type FxDeps struct { + fx.In + + Pool *pgxpool.Pool `optional:"true"` + Identity *identity.Identity + Router *routing.Service + PiriProvider piriclient.Provider + IndexerClient *indexerclient.Client `optional:"true"` +} + +// registerLifecycle is the fx-only thin shim. It builds the +// production-only collaborators (Forge, Internal uploader, Postgres +// registry, migrations, space signer) and hands them to ms3t.New. +// Anything beyond that wiring lives in server.go and is reachable by +// tests without fx. 
+func registerLifecycle( + lc fx.Lifecycle, + cfg *config.Config, + zlog *zap.Logger, + deps FxDeps, +) error { + mc := cfg.MS3T + if !mc.Enabled { + return nil + } + + if deps.Pool == nil { + return fmt.Errorf("ms3t: a *pgxpool.Pool must be provided in the fx graph when ms3t.enabled is true") + } + if deps.IndexerClient == nil { + return fmt.Errorf("ms3t: indexer client is required (configure indexer.endpoint)") + } + + if err := os.MkdirAll(mc.DataDir, 0o755); err != nil { + return fmt.Errorf("ms3t: mkdir data dir: %w", err) + } + + // Apply ms3t's own migrations against the caller-supplied pool. + // Goose runs in the ms3t schema and tracks its version table at + // ms3t.goose_db_version, so this never collides with any other + // migrations on the same database. + if err := migrations.Up(context.Background(), deps.Pool, zlog); err != nil { + return fmt.Errorf("ms3t: migrations: %w", err) + } + + // ms3t IS the space owner (root UCAN authority) so that + // self-issued space/content/retrieve delegations validate down + // the chain to piri's retrieval auth check. Key is generated on + // first run and persisted under data_dir/space.key. 
+ keyPath := filepath.Join(mc.DataDir, "space.key") + spaceSigner, err := LoadOrCreateSigner(keyPath) + if err != nil { + return fmt.Errorf("ms3t: space signer: %w", err) + } + zlog.Info("ms3t space loaded", + zap.String("space_did", spaceSigner.DID().String()), + zap.String("key_file", keyPath), + ) + + forgeReader, err := blockstore.NewForge(blockstore.ForgeConfig{ + IndexerEndpoint: cfg.Indexer.Endpoint, + IndexerDID: cfg.Indexer.DID, + Spaces: []did.DID{spaceSigner.DID()}, + Signer: deps.Identity.Signer, + SpaceSigner: spaceSigner, + Logger: zlog, + }) + if err != nil { + return fmt.Errorf("ms3t: Reader blockstore: %w", err) + } + + reg := registry.NewPostgres(deps.Pool) + + zlog.Info("ms3t internal uploader configured", + zap.String("space_did", spaceSigner.DID().String()), + zap.String("signer_did", deps.Identity.DID()), + ) + up, err := uploader.NewForge(uploader.ForgeConfig{ + Router: deps.Router, + PiriProvider: deps.PiriProvider, + IndexerClient: deps.IndexerClient, + Signer: deps.Identity.Signer, + SpaceSigner: spaceSigner, + Logger: zlog, + }) + if err != nil { + return fmt.Errorf("ms3t: uploader: %w", err) + } + + sealAge, err := time.ParseDuration(emptyDefault(mc.SealAge, "5s")) + if err != nil { + return fmt.Errorf("ms3t: parse seal_age %q: %w", mc.SealAge, err) + } + + server, err := New(context.Background(), + ServerConfig{ + Addr: mc.Addr, + DataDir: mc.DataDir, + Region: mc.Region, + RootAccess: mc.RootAccess, + RootSecret: mc.RootSecret, + ChunkSize: mc.ChunkSize, + SealBytes: mc.SealBytes, + SealAge: sealAge, + Retain: mc.Retain, + }, + ServerDeps{ + Logger: zlog, + BaseBlockReader: forgeReader, + Uploader: up, + Registry: reg, + Meta: reg, + }, + ) + if err != nil { + return err + } + + lc.Append(fx.Hook{ + OnStart: server.Start, + OnStop: server.Stop, + }) + return nil +} + +// emptyDefault returns def when s is the empty string. 
+func emptyDefault(s, def string) string { + if s == "" { + return def + } + return s +} diff --git a/pkg/ms3t/mst/diff.go b/pkg/ms3t/mst/diff.go index 0d3d844..f663ff8 100644 --- a/pkg/ms3t/mst/diff.go +++ b/pkg/ms3t/mst/diff.go @@ -5,7 +5,8 @@ import ( "fmt" cid "github.com/ipfs/go-cid" - cbor "github.com/ipfs/go-ipld-cbor" + + "github.com/storacha/sprue/pkg/ms3t/blockstore" ) // DiffOp describes a single change between two MST roots. @@ -19,8 +20,8 @@ type DiffOp struct { // DiffTrees enumerates the additions, deletions, and mutations needed to go // from the MST rooted at `from` to the MST rooted at `to`. -func DiffTrees(ctx context.Context, bs cbor.IpldBlockstore, from, to cid.Cid) ([]*DiffOp, error) { - cst := CborStore(bs) +func DiffTrees(ctx context.Context, bs blockstore.BaseStore, from, to cid.Cid) ([]*DiffOp, error) { + cst := blockstore.CborStore(bs) if from == cid.Undef { return identityDiff(ctx, bs, to) @@ -173,8 +174,8 @@ func nodeEntriesEqual(a, b *nodeEntry) bool { return false } -func identityDiff(ctx context.Context, bs cbor.IpldBlockstore, root cid.Cid) ([]*DiffOp, error) { - cst := CborStore(bs) +func identityDiff(ctx context.Context, bs blockstore.BaseStore, root cid.Cid) ([]*DiffOp, error) { + cst := blockstore.CborStore(bs) tt := LoadMST(cst, root) var ops []*DiffOp diff --git a/pkg/ms3t/mst/mst.go b/pkg/ms3t/mst/mst.go index c5f7c1a..dc42b36 100644 --- a/pkg/ms3t/mst/mst.go +++ b/pkg/ms3t/mst/mst.go @@ -18,9 +18,16 @@ import ( "reflect" "github.com/ipfs/go-cid" - cbor "github.com/ipfs/go-ipld-cbor" + + "github.com/storacha/sprue/pkg/ms3t/blockstore" ) +// The MST package consumes blockstore.Reader on the load + traversal +// path and blockstore.Store on the materialization (GetPointer) +// path. Mutating operations (Add, Update, Delete) build new +// in-memory tree values without any I/O writes; the only write site +// in the package is GetPointer, which takes a writer argument. + // nodeKind is the type of node in the MST. 
type nodeKind uint8 @@ -84,8 +91,14 @@ type TreeEntry struct { // MerkleSearchTree is an MST tree node. Values are immutable: methods return // copies with changes applied. Hydration is lazy; a tree loaded by CID has no // entries until getEntries is called. +// +// The cst field is a blockstore.Reader: traversal and mutation +// (Add/Update/Delete) both stay read-only at the storage level, +// returning new in-memory MerkleSearchTree values rather than +// persisting anything. The only write site is GetPointer, which +// takes its writer as an explicit argument. type MerkleSearchTree struct { - cst cbor.IpldStore + cst blockstore.Reader entries []nodeEntry // non-nil when "hydrated" layer int pointer cid.Cid @@ -93,11 +106,11 @@ type MerkleSearchTree struct { } // NewEmptyMST returns a new empty MST using cst as its storage. -func NewEmptyMST(cst cbor.IpldStore) *MerkleSearchTree { +func NewEmptyMST(cst blockstore.Reader) *MerkleSearchTree { return createMST(cst, cid.Undef, []nodeEntry{}, 0) } -func createMST(cst cbor.IpldStore, ptr cid.Cid, entries []nodeEntry, layer int) *MerkleSearchTree { +func createMST(cst blockstore.Reader, ptr cid.Cid, entries []nodeEntry, layer int) *MerkleSearchTree { mst := &MerkleSearchTree{ cst: cst, pointer: ptr, @@ -110,7 +123,7 @@ func createMST(cst cbor.IpldStore, ptr cid.Cid, entries []nodeEntry, layer int) // LoadMST returns a lazy reference to an MST rooted at the given CID. Entries // are not loaded until needed. 
-func LoadMST(cst cbor.IpldStore, root cid.Cid) *MerkleSearchTree { +func LoadMST(cst blockstore.Reader, root cid.Cid) *MerkleSearchTree { return createMST(cst, root, nil, -1) } @@ -149,7 +162,7 @@ func (mst *MerkleSearchTree) getEntries(ctx context.Context) ([]nodeEntry, error return nil, fmt.Errorf("no entries or self-pointer (CID) on MerkleSearchTree") } -func entriesFromNodeData(ctx context.Context, nd *NodeData, cst cbor.IpldStore) ([]nodeEntry, error) { +func entriesFromNodeData(ctx context.Context, nd *NodeData, cst blockstore.Reader) ([]nodeEntry, error) { layer := -1 if len(nd.Entries) > 0 { // the first entry's KeySuffix is a complete key (PrefixLen=0) @@ -166,8 +179,11 @@ func entriesFromNodeData(ctx context.Context, nd *NodeData, cst cbor.IpldStore) } // GetPointer returns the CID of this MST root, recomputing it if any subtree -// has been mutated since the last call. -func (mst *MerkleSearchTree) GetPointer(ctx context.Context) (cid.Cid, error) { +// has been mutated since the last call. writer is the IpldStore that any +// freshly-serialized subtree nodes are Put through; only this method (and the +// cidForEntries / serializeNodeData helpers it drives) ever issues writes +// against it. 
+func (mst *MerkleSearchTree) GetPointer(ctx context.Context, writer blockstore.Store) (cid.Cid, error) { if mst.validPtr { return mst.pointer, nil } @@ -179,7 +195,7 @@ func (mst *MerkleSearchTree) GetPointer(ctx context.Context) (cid.Cid, error) { for i, e := range mst.entries { if e.isTree() { if !e.Tree.validPtr { - if _, err := e.Tree.GetPointer(ctx); err != nil { + if _, err := e.Tree.GetPointer(ctx, writer); err != nil { return cid.Undef, err } mst.entries[i] = e @@ -187,7 +203,7 @@ func (mst *MerkleSearchTree) GetPointer(ctx context.Context) (cid.Cid, error) { } } - nptr, err := cidForEntries(ctx, mst.entries, mst.cst) + nptr, err := cidForEntries(ctx, mst.entries, writer) if err != nil { return cid.Undef, err } diff --git a/pkg/ms3t/mst/mst_util.go b/pkg/ms3t/mst/mst_util.go index 730b5cc..6aa386f 100644 --- a/pkg/ms3t/mst/mst_util.go +++ b/pkg/ms3t/mst/mst_util.go @@ -9,8 +9,8 @@ import ( "unsafe" "github.com/ipfs/go-cid" - cbor "github.com/ipfs/go-ipld-cbor" - mh "github.com/multiformats/go-multihash" + + "github.com/storacha/sprue/pkg/ms3t/blockstore" ) // MaxKeyBytes is the maximum length, in bytes, of a key stored in the MST. 
@@ -64,7 +64,7 @@ func layerForEntries(entries []nodeEntry) int { return leadingZerosOnHash(firstLeaf.Key) } -func deserializeNodeData(ctx context.Context, cst cbor.IpldStore, nd *NodeData, layer int) ([]nodeEntry, error) { +func deserializeNodeData(ctx context.Context, cst blockstore.Reader, nd *NodeData, layer int) ([]nodeEntry, error) { entries := []nodeEntry{} if nd.Left != nil { entries = append(entries, nodeEntry{ @@ -106,14 +106,14 @@ func deserializeNodeData(ctx context.Context, cst cbor.IpldStore, nd *NodeData, return entries, nil } -func serializeNodeData(entries []nodeEntry) (*NodeData, error) { +func serializeNodeData(ctx context.Context, entries []nodeEntry, writer blockstore.Store) (*NodeData, error) { var data NodeData i := 0 if len(entries) > 0 && entries[0].isTree() { i++ - ptr, err := entries[0].Tree.GetPointer(context.TODO()) + ptr, err := entries[0].Tree.GetPointer(ctx, writer) if err != nil { return nil, err } @@ -135,7 +135,7 @@ func serializeNodeData(entries []nodeEntry) (*NodeData, error) { next := entries[i] if next.isTree() { - ptr, err := next.Tree.GetPointer(context.TODO()) + ptr, err := next.Tree.GetPointer(ctx, writer) if err != nil { return nil, fmt.Errorf("getting subtree pointer: %w", err) } @@ -173,12 +173,12 @@ func countPrefixLen(a, b string) int { return i } -func cidForEntries(ctx context.Context, entries []nodeEntry, cst cbor.IpldStore) (cid.Cid, error) { - nd, err := serializeNodeData(entries) +func cidForEntries(ctx context.Context, entries []nodeEntry, writer blockstore.Store) (cid.Cid, error) { + nd, err := serializeNodeData(ctx, entries, writer) if err != nil { return cid.Undef, fmt.Errorf("serializing new entries: %w", err) } - return cst.Put(ctx, nd) + return writer.Put(ctx, nd) } // IsValidKey reports whether s is a valid MST key under this fork's relaxed @@ -203,10 +203,3 @@ func ensureValidKey(s string) error { return nil } -// CborStore wraps a blockstore in a CBOR-aware IpldStore using SHA2-256 -// multihashing. 
Equivalent to indigo's util.CborStore. -func CborStore(bs cbor.IpldBlockstore) *cbor.BasicIpldStore { - cst := cbor.NewCborStore(bs) - cst.DefaultMultihash = mh.SHA2_256 - return cst -} diff --git a/pkg/ms3t/registry/postgres.go b/pkg/ms3t/registry/postgres.go new file mode 100644 index 0000000..475b971 --- /dev/null +++ b/pkg/ms3t/registry/postgres.go @@ -0,0 +1,180 @@ +package registry + +import ( + "context" + "errors" + "fmt" + + "github.com/ipfs/go-cid" + "github.com/jackc/pgx/v5" + "github.com/jackc/pgx/v5/pgconn" + "github.com/jackc/pgx/v5/pgxpool" +) + +// uniqueViolation is the Postgres SQLSTATE for a unique constraint +// violation (matches the literal used elsewhere in sprue's stores). +const uniqueViolation = "23505" + +// Postgres is a *pgxpool.Pool-backed Registry. Schema is owned by +// pkg/ms3t/migrations and lives in the `ms3t` Postgres schema. The +// pool is borrowed, never closed by this type. +type Postgres struct { + pool *pgxpool.Pool +} + +// NewPostgres wraps an existing pool. Callers are responsible for +// running pkg/ms3t/migrations.Up against the same pool before any +// registry method is called. +func NewPostgres(pool *pgxpool.Pool) *Postgres { + return &Postgres{pool: pool} +} + +// Compile-time assertion. +var _ Registry = (*Postgres)(nil) + +func (r *Postgres) Create(ctx context.Context, name string, createdAt int64) error { + _, err := r.pool.Exec(ctx, + `INSERT INTO ms3t.buckets (name, root_cid, created_at) VALUES ($1, NULL, $2)`, + name, createdAt) + if err != nil { + var pgErr *pgconn.PgError + if errors.As(err, &pgErr) && pgErr.Code == uniqueViolation { + return ErrExists + } + return fmt.Errorf("registry: create %q: %w", name, err) + } + return nil +} + +func (r *Postgres) Get(ctx context.Context, name string) (*State, error) { + var rootBytes, forgeBytes []byte + var createdAt int64 + err := r.pool.QueryRow(ctx, + `SELECT root_cid, forge_root_cid, created_at FROM ms3t.buckets WHERE name = $1`, name). 
+ Scan(&rootBytes, &forgeBytes, &createdAt) + if errors.Is(err, pgx.ErrNoRows) { + return nil, ErrNotFound + } + if err != nil { + return nil, fmt.Errorf("registry: get %q: %w", name, err) + } + + st := &State{Name: name, CreatedAt: createdAt} + if err := setCidPg(&st.Root, rootBytes, name, "root_cid"); err != nil { + return nil, err + } + if err := setCidPg(&st.ForgeRoot, forgeBytes, name, "forge_root_cid"); err != nil { + return nil, err + } + return st, nil +} + +func (r *Postgres) List(ctx context.Context) ([]*State, error) { + rows, err := r.pool.Query(ctx, + `SELECT name, root_cid, forge_root_cid, created_at FROM ms3t.buckets ORDER BY name ASC`) + if err != nil { + return nil, fmt.Errorf("registry: list: %w", err) + } + defer rows.Close() + + var out []*State + for rows.Next() { + var name string + var rootBytes, forgeBytes []byte + var createdAt int64 + if err := rows.Scan(&name, &rootBytes, &forgeBytes, &createdAt); err != nil { + return nil, fmt.Errorf("registry: list scan: %w", err) + } + st := &State{Name: name, CreatedAt: createdAt} + if err := setCidPg(&st.Root, rootBytes, name, "root_cid"); err != nil { + return nil, err + } + if err := setCidPg(&st.ForgeRoot, forgeBytes, name, "forge_root_cid"); err != nil { + return nil, err + } + out = append(out, st) + } + if err := rows.Err(); err != nil { + return nil, fmt.Errorf("registry: list rows: %w", err) + } + return out, nil +} + +func (r *Postgres) Delete(ctx context.Context, name string) error { + tag, err := r.pool.Exec(ctx, `DELETE FROM ms3t.buckets WHERE name = $1`, name) + if err != nil { + return fmt.Errorf("registry: delete %q: %w", name, err) + } + if tag.RowsAffected() == 0 { + return ErrNotFound + } + return nil +} + +func (r *Postgres) CASRoot(ctx context.Context, name string, expect, next cid.Cid) error { + var ( + expectBytes []byte + nextBytes []byte + ) + if expect.Defined() { + expectBytes = expect.Bytes() + } + if next.Defined() { + nextBytes = next.Bytes() + } + + var ( + tag 
pgconn.CommandTag + err error + ) + if expectBytes == nil { + tag, err = r.pool.Exec(ctx, + `UPDATE ms3t.buckets SET root_cid = $1 WHERE name = $2 AND root_cid IS NULL`, + nextBytes, name) + } else { + tag, err = r.pool.Exec(ctx, + `UPDATE ms3t.buckets SET root_cid = $1 WHERE name = $2 AND root_cid = $3`, + nextBytes, name, expectBytes) + } + if err != nil { + return fmt.Errorf("registry: cas %q: %w", name, err) + } + if tag.RowsAffected() == 0 { + // Either the bucket doesn't exist or the expected root didn't match. + if _, gerr := r.Get(ctx, name); errors.Is(gerr, ErrNotFound) { + return ErrNotFound + } + return ErrConflict + } + return nil +} + +func (r *Postgres) SetForgeRoot(ctx context.Context, name string, root cid.Cid) error { + var rootBytes []byte + if root.Defined() { + rootBytes = root.Bytes() + } + tag, err := r.pool.Exec(ctx, + `UPDATE ms3t.buckets SET forge_root_cid = $1 WHERE name = $2`, + rootBytes, name) + if err != nil { + return fmt.Errorf("registry: set forge root %q: %w", name, err) + } + if tag.RowsAffected() == 0 { + return ErrNotFound + } + return nil +} + +func setCidPg(dst *cid.Cid, raw []byte, name, field string) error { + if len(raw) == 0 { + *dst = cid.Undef + return nil + } + c, err := cid.Cast(raw) + if err != nil { + return fmt.Errorf("registry: bad %s for %q: %w", field, name, err) + } + *dst = c + return nil +} diff --git a/pkg/ms3t/registry/segments.go b/pkg/ms3t/registry/segments.go new file mode 100644 index 0000000..4e369ce --- /dev/null +++ b/pkg/ms3t/registry/segments.go @@ -0,0 +1,277 @@ +package registry + +import ( + "context" + "errors" + "fmt" + + "github.com/ipfs/go-cid" + "github.com/jackc/pgx/v5" + + "github.com/storacha/sprue/pkg/ms3t/blockstore" + "github.com/storacha/sprue/pkg/ms3t/logstore" +) + +// Segment-level methods for *Postgres. These satisfy logstore.Meta; +// the compile-time assertion at the bottom of the file pins the +// interface. 
+ +func (r *Postgres) NextSegmentSeq(ctx context.Context) (uint64, error) { + var seq uint64 + if err := r.pool.QueryRow(ctx, `SELECT nextval('ms3t.segment_seq')`).Scan(&seq); err != nil { + return 0, fmt.Errorf("registry: next segment seq: %w", err) + } + return seq, nil +} + +func (r *Postgres) InsertSegmentOpen(ctx context.Context, seq uint64) error { + _, err := r.pool.Exec(ctx, + `INSERT INTO ms3t.segments (seq, state, size_bytes) VALUES ($1, 'open', 0) + ON CONFLICT (seq) DO NOTHING`, + int64(seq)) + if err != nil { + return fmt.Errorf("registry: insert segment %d: %w", seq, err) + } + return nil +} + +func (r *Postgres) MarkSegmentSealed(ctx context.Context, seq uint64, sealedAt int64, sizeBytes int64, sha256 []byte, opRoots []blockstore.OpRoot) error { + tx, err := r.pool.Begin(ctx) + if err != nil { + return fmt.Errorf("registry: begin seal %d: %w", seq, err) + } + defer tx.Rollback(ctx) + + tag, err := tx.Exec(ctx, + `UPDATE ms3t.segments + SET state = 'sealed', sealed_at = $2, size_bytes = $3, car_sha256 = $4 + WHERE seq = $1 AND state = 'open'`, + int64(seq), sealedAt, sizeBytes, sha256) + if err != nil { + return fmt.Errorf("registry: seal %d: %w", seq, err) + } + if tag.RowsAffected() == 0 { + // Either the segment is missing or it has already advanced past + // 'open'. Treat as a no-op so seal is idempotent against + // crashes between disk seal and DB update. 
+ return nil + } + + if err := insertOpRootsTx(ctx, tx, seq, opRoots); err != nil { + return err + } + if err := tx.Commit(ctx); err != nil { + return fmt.Errorf("registry: commit seal %d: %w", seq, err) + } + return nil +} + +func (r *Postgres) MarkSegmentFlushed(ctx context.Context, seq uint64, flushedAt int64, opRoots []blockstore.OpRoot) error { + tx, err := r.pool.Begin(ctx) + if err != nil { + return fmt.Errorf("registry: begin flush %d: %w", seq, err) + } + defer tx.Rollback(ctx) + + tag, err := tx.Exec(ctx, + `UPDATE ms3t.segments SET state = 'flushed', flushed_at = $2 WHERE seq = $1 AND state = 'sealed'`, + int64(seq), flushedAt) + if err != nil { + return fmt.Errorf("registry: flush %d: %w", seq, err) + } + if tag.RowsAffected() == 0 { + // Already flushed (or somehow rolled back to open). Idempotent. + return nil + } + + // Apply forge_root advances in slice order. Segments flush in seq + // order, and within a segment the slice order is the order of + // commits, so the last write for each bucket wins. + // + // TODO(frrist/ms3t): the UPDATE below is unconditional on + // root_cid, which is incorrect when a writer's logstore.Commit + // succeeds but its subsequent registry.CASRoot fails (transient + // Postgres error, context cancellation between the two calls). + // In that case, the op_root for newRoot is durable in the log + // even though the bucket's published root_cid never advanced. + // When this segment flushes, the loop below blindly sets + // forge_root_cid = newRoot — even though root_cid is still + // oldRoot — breaking the invariant "forge_root_cid is a Root the + // bucket has actually published, with its full DAG in Forge." + // + // Fix: gate the UPDATE on root_cid, e.g. 
+ // + // UPDATE ms3t.buckets + // SET forge_root_cid = $1 + // WHERE name = $2 AND root_cid = $1 + // + // With segments flushing in seq order, this naturally lets a + // later segment's flush advance forge_root_cid for the bucket + // once root_cid has caught up via a successful CASRoot, and + // silently skips orphan op_roots from failed commits. + // + // Out of scope for the bucketop refactor; track separately. + for _, opr := range opRoots { + if !opr.Root.Defined() { + continue + } + if _, err := tx.Exec(ctx, + `UPDATE ms3t.buckets SET forge_root_cid = $1 WHERE name = $2`, + opr.Root.Bytes(), opr.Bucket); err != nil { + return fmt.Errorf("registry: advance forge_root for %q: %w", opr.Bucket, err) + } + } + if err := tx.Commit(ctx); err != nil { + return fmt.Errorf("registry: commit flush %d: %w", seq, err) + } + return nil +} + +func (r *Postgres) DeleteSegment(ctx context.Context, seq uint64) error { + if _, err := r.pool.Exec(ctx, `DELETE FROM ms3t.segments WHERE seq = $1`, int64(seq)); err != nil { + return fmt.Errorf("registry: delete segment %d: %w", seq, err) + } + return nil +} + +func (r *Postgres) ListUnflushedSegments(ctx context.Context) ([]logstore.SegmentMeta, error) { + rows, err := r.pool.Query(ctx, + `SELECT seq, state, COALESCE(sealed_at, 0), COALESCE(flushed_at, 0), size_bytes, car_sha256 + FROM ms3t.segments + WHERE state IN ('open', 'sealed') + ORDER BY seq ASC`) + if err != nil { + return nil, fmt.Errorf("registry: list unflushed segments: %w", err) + } + defer rows.Close() + + var out []logstore.SegmentMeta + for rows.Next() { + var ( + seqInt int64 + stateS string + sealed int64 + flushed int64 + size int64 + sha []byte + ) + if err := rows.Scan(&seqInt, &stateS, &sealed, &flushed, &size, &sha); err != nil { + return nil, fmt.Errorf("registry: scan segment: %w", err) + } + state, ok := logstore.ParseState(stateS) + if !ok { + return nil, fmt.Errorf("registry: bad segment state %q for seq %d", stateS, seqInt) + } + out = append(out, 
logstore.SegmentMeta{ + Seq: uint64(seqInt), + State: state, + SealedAt: sealed, + FlushedAt: flushed, + SizeBytes: size, + SHA256: sha, + }) + } + if err := rows.Err(); err != nil { + return nil, fmt.Errorf("registry: list segments rows: %w", err) + } + + // Hydrate op_roots for sealed segments only (open segments have + // none). Done in a second pass to keep the query simple. + for i := range out { + if out[i].State != logstore.StateSealed { + continue + } + ops, err := r.fetchOpRoots(ctx, out[i].Seq) + if err != nil { + return nil, err + } + out[i].OpRoots = ops + } + return out, nil +} + +func (r *Postgres) RehydrateSegment(ctx context.Context, m logstore.SegmentMeta) error { + tx, err := r.pool.Begin(ctx) + if err != nil { + return fmt.Errorf("registry: begin rehydrate %d: %w", m.Seq, err) + } + defer tx.Rollback(ctx) + + // Replace any existing rows for this seq. + if _, err := tx.Exec(ctx, `DELETE FROM ms3t.segments WHERE seq = $1`, int64(m.Seq)); err != nil { + return fmt.Errorf("registry: rehydrate clear %d: %w", m.Seq, err) + } + + var sealedAt, flushedAt *int64 + if m.SealedAt != 0 { + v := m.SealedAt + sealedAt = &v + } + if m.FlushedAt != 0 { + v := m.FlushedAt + flushedAt = &v + } + if _, err := tx.Exec(ctx, + `INSERT INTO ms3t.segments (seq, state, sealed_at, flushed_at, size_bytes, car_sha256) + VALUES ($1, $2, $3, $4, $5, $6)`, + int64(m.Seq), m.State.String(), sealedAt, flushedAt, m.SizeBytes, m.SHA256); err != nil { + return fmt.Errorf("registry: rehydrate insert %d: %w", m.Seq, err) + } + + if err := insertOpRootsTx(ctx, tx, m.Seq, m.OpRoots); err != nil { + return err + } + if err := tx.Commit(ctx); err != nil { + return fmt.Errorf("registry: rehydrate commit %d: %w", m.Seq, err) + } + return nil +} + +func (r *Postgres) fetchOpRoots(ctx context.Context, seq uint64) ([]blockstore.OpRoot, error) { + rows, err := r.pool.Query(ctx, + `SELECT bucket, root_cid FROM ms3t.segment_op_roots WHERE seq = $1 ORDER BY seq_within ASC`, + int64(seq)) + if 
err != nil { + return nil, fmt.Errorf("registry: fetch op_roots %d: %w", seq, err) + } + defer rows.Close() + + var out []blockstore.OpRoot + for rows.Next() { + var bucket string + var rootBytes []byte + if err := rows.Scan(&bucket, &rootBytes); err != nil { + return nil, fmt.Errorf("registry: scan op_root: %w", err) + } + c, err := cid.Cast(rootBytes) + if err != nil { + return nil, fmt.Errorf("registry: bad root_cid for %q seq %d: %w", bucket, seq, err) + } + out = append(out, blockstore.OpRoot{Bucket: bucket, Root: c}) + } + if err := rows.Err(); err != nil { + return nil, fmt.Errorf("registry: fetch op_roots rows %d: %w", seq, err) + } + return out, nil +} + +func insertOpRootsTx(ctx context.Context, tx pgx.Tx, seq uint64, opRoots []blockstore.OpRoot) error { + if len(opRoots) == 0 { + return nil + } + for i, opr := range opRoots { + if !opr.Root.Defined() { + return errors.New("registry: op_root.Root must be defined") + } + if _, err := tx.Exec(ctx, + `INSERT INTO ms3t.segment_op_roots (seq, seq_within, bucket, root_cid) + VALUES ($1, $2, $3, $4)`, + int64(seq), i, opr.Bucket, opr.Root.Bytes()); err != nil { + return fmt.Errorf("registry: insert op_root %d/%d: %w", seq, i, err) + } + } + return nil +} + +// Compile-time assertion: Postgres satisfies logstore.Meta. +var _ logstore.Meta = (*Postgres)(nil) diff --git a/pkg/ms3t/registry/sqlite.go b/pkg/ms3t/registry/sqlite.go deleted file mode 100644 index c85cf0f..0000000 --- a/pkg/ms3t/registry/sqlite.go +++ /dev/null @@ -1,212 +0,0 @@ -package registry - -import ( - "context" - "database/sql" - "errors" - "fmt" - "strings" - - "github.com/ipfs/go-cid" -) - -// Schema is the DDL for the buckets table. Vanilla SQL. -const Schema = ` -CREATE TABLE IF NOT EXISTS buckets ( - name TEXT PRIMARY KEY, - root_cid BLOB, - forge_root_cid BLOB, - created_at INTEGER NOT NULL -); -` - -// addForgeRootColumn brings older schemas (without forge_root_cid) -// forward in place. 
Idempotent: if the column exists, the ALTER -// errors and we treat that as already-migrated. -const addForgeRootColumn = `ALTER TABLE buckets ADD COLUMN forge_root_cid BLOB` - -// SQL is a database/sql-backed Registry. Works with any SQL driver that -// supports the byte-blob and integer types used here. -type SQL struct { - db *sql.DB -} - -// NewSQL wraps an open *sql.DB and ensures the schema exists. -func NewSQL(db *sql.DB) (*SQL, error) { - if _, err := db.Exec(Schema); err != nil { - return nil, fmt.Errorf("registry: ensure schema: %w", err) - } - // Best-effort migration for older databases. The error case is the - // column already existing (driver-specific message), which is fine. - if _, err := db.Exec(addForgeRootColumn); err != nil { - if !strings.Contains(err.Error(), "duplicate column") { - return nil, fmt.Errorf("registry: add forge_root_cid: %w", err) - } - } - return &SQL{db: db}, nil -} - -func (r *SQL) Create(ctx context.Context, name string, createdAt int64) error { - _, err := r.db.ExecContext(ctx, - `INSERT INTO buckets (name, root_cid, created_at) VALUES (?, NULL, ?)`, - name, createdAt) - if err != nil { - // Cheap, portable detection: a second Create with the same name will - // trip the PK. Different drivers wrap this error differently, so - // fall back to Get to distinguish. - if existing, gerr := r.Get(ctx, name); gerr == nil && existing != nil { - return ErrExists - } - return fmt.Errorf("registry: create %q: %w", name, err) - } - return nil -} - -func (r *SQL) Get(ctx context.Context, name string) (*State, error) { - var rootBytes, forgeBytes []byte - var createdAt int64 - err := r.db.QueryRowContext(ctx, - `SELECT root_cid, forge_root_cid, created_at FROM buckets WHERE name = ?`, name). 
- Scan(&rootBytes, &forgeBytes, &createdAt) - if errors.Is(err, sql.ErrNoRows) { - return nil, ErrNotFound - } - if err != nil { - return nil, fmt.Errorf("registry: get %q: %w", name, err) - } - - st := &State{Name: name, CreatedAt: createdAt} - if err := setCid(&st.Root, rootBytes, name, "root_cid"); err != nil { - return nil, err - } - if err := setCid(&st.ForgeRoot, forgeBytes, name, "forge_root_cid"); err != nil { - return nil, err - } - return st, nil -} - -func (r *SQL) List(ctx context.Context) ([]*State, error) { - rows, err := r.db.QueryContext(ctx, - `SELECT name, root_cid, forge_root_cid, created_at FROM buckets ORDER BY name ASC`) - if err != nil { - return nil, fmt.Errorf("registry: list: %w", err) - } - defer rows.Close() - - var out []*State - for rows.Next() { - var name string - var rootBytes, forgeBytes []byte - var createdAt int64 - if err := rows.Scan(&name, &rootBytes, &forgeBytes, &createdAt); err != nil { - return nil, fmt.Errorf("registry: list scan: %w", err) - } - st := &State{Name: name, CreatedAt: createdAt} - if err := setCid(&st.Root, rootBytes, name, "root_cid"); err != nil { - return nil, err - } - if err := setCid(&st.ForgeRoot, forgeBytes, name, "forge_root_cid"); err != nil { - return nil, err - } - out = append(out, st) - } - if err := rows.Err(); err != nil { - return nil, fmt.Errorf("registry: list rows: %w", err) - } - return out, nil -} - -func setCid(dst *cid.Cid, raw []byte, name, field string) error { - if len(raw) == 0 { - *dst = cid.Undef - return nil - } - c, err := cid.Cast(raw) - if err != nil { - return fmt.Errorf("registry: bad %s for %q: %w", field, name, err) - } - *dst = c - return nil -} - -func (r *SQL) Delete(ctx context.Context, name string) error { - res, err := r.db.ExecContext(ctx, - `DELETE FROM buckets WHERE name = ?`, name) - if err != nil { - return fmt.Errorf("registry: delete %q: %w", name, err) - } - n, err := res.RowsAffected() - if err != nil { - return fmt.Errorf("registry: delete rows: %w", err) 
- } - if n == 0 { - return ErrNotFound - } - return nil -} - -func (r *SQL) CASRoot(ctx context.Context, name string, expect, next cid.Cid) error { - var ( - expectBytes []byte - nextBytes []byte - ) - if expect.Defined() { - expectBytes = expect.Bytes() - } - if next.Defined() { - nextBytes = next.Bytes() - } - - var ( - res sql.Result - err error - ) - if expectBytes == nil { - res, err = r.db.ExecContext(ctx, - `UPDATE buckets SET root_cid = ? WHERE name = ? AND root_cid IS NULL`, - nextBytes, name) - } else { - res, err = r.db.ExecContext(ctx, - `UPDATE buckets SET root_cid = ? WHERE name = ? AND root_cid = ?`, - nextBytes, name, expectBytes) - } - if err != nil { - return fmt.Errorf("registry: cas %q: %w", name, err) - } - n, err := res.RowsAffected() - if err != nil { - return fmt.Errorf("registry: cas rows: %w", err) - } - if n == 0 { - // Either the bucket doesn't exist or the expected root didn't match. - if _, gerr := r.Get(ctx, name); errors.Is(gerr, ErrNotFound) { - return ErrNotFound - } - return ErrConflict - } - return nil -} - -func (r *SQL) SetForgeRoot(ctx context.Context, name string, root cid.Cid) error { - var rootBytes []byte - if root.Defined() { - rootBytes = root.Bytes() - } - res, err := r.db.ExecContext(ctx, - `UPDATE buckets SET forge_root_cid = ? WHERE name = ?`, - rootBytes, name) - if err != nil { - return fmt.Errorf("registry: set forge root %q: %w", name, err) - } - n, err := res.RowsAffected() - if err != nil { - return fmt.Errorf("registry: set forge root rows: %w", err) - } - if n == 0 { - return ErrNotFound - } - return nil -} - -// Compile-time assertion. -var _ Registry = (*SQL)(nil) diff --git a/pkg/ms3t/s3frontend/backend.go b/pkg/ms3t/s3frontend/backend.go new file mode 100644 index 0000000..28013b2 --- /dev/null +++ b/pkg/ms3t/s3frontend/backend.go @@ -0,0 +1,89 @@ +// Package s3frontend implements versitygw's backend.Backend by +// orchestrating directly over the ms3t domain primitives. 
It is the +// only S3 frontend ms3t ships; it is wired into the process via +// pkg/ms3t.Server. +// +// The Backend type is a thin protocol adapter: +// - Read paths drive a single ReadStore that exposes both +// CBOR-decoded reads (manifest, MST nodes) and raw block reads +// (body chunks). The interface has no Put method, so write paths +// can't accidentally route through it. +// - Write paths drive a per-op bucketop.Tx, which owns the +// staging buffer, MST CBOR view, bucket-Root CAS, and per-bucket +// locking. +// +// Operations not implemented (multipart, lifecycle, locking, +// versioning, etc.) inherit ErrNotImplemented from the embedded +// backend.BackendUnsupported. The few unsupported-by-default +// methods that versitygw nevertheless calls on every request +// (GetBucketAcl, GetBucketPolicy, GetObjectLockConfiguration, +// GetBucketVersioning) are stubbed in bucket.go. +package s3frontend + +import ( + "context" + + "github.com/versity/versitygw/backend" + + "github.com/storacha/sprue/pkg/ms3t/blockstore" + msbucket "github.com/storacha/sprue/pkg/ms3t/bucket" + "github.com/storacha/sprue/pkg/ms3t/bucketop" + "github.com/storacha/sprue/pkg/ms3t/logstore" + "github.com/storacha/sprue/pkg/ms3t/registry" +) + +// Backend implements versitygw's backend.Backend directly over the +// ms3t domain primitives. The embedded BackendUnsupported supplies +// ErrNotImplemented defaults for every operation; we override only +// the ones we actually serve. +type Backend struct { + backend.BackendUnsupported + + read blockstore.ReadStore + reg registry.Registry + txns *bucketop.Coordinator + codec msbucket.BodyCodec +} + +// Compile-time assertion that Backend satisfies versitygw's interface. +var _ backend.Backend = (*Backend)(nil) + +// New constructs a Backend wired over ms3t's domain primitives. 
+// rs is the layered read blockstore (log → forge); log is the +// LSM-style write log; codec is the body-DAG codec used for both +// chunking on PUT and streaming on GET — typically a *FixedChunker. +func New(reg registry.Registry, rs blockstore.ReadStore, log *logstore.Store, codec msbucket.BodyCodec) *Backend { + return &Backend{ + read: rs, + reg: reg, + txns: bucketop.NewCoordinator(bucketop.Deps{Reg: reg, Log: log, Reads: rs}), + codec: codec, + } +} + +// String identifies this backend in versitygw logs. +func (*Backend) String() string { return "ms3t" } + +// Shutdown is a no-op; lifecycle for the underlying registry/log is +// owned by pkg/ms3t.Server's Stop hook, not by versitygw. +func (*Backend) Shutdown() {} + +// Recover is a no-op in the LSM design: logstore.Open already +// scanned the segment directory, reconciled with Postgres, and +// re-enqueued any pending segments for the background flusher. +// Recover is retained as the lifecycle seam in case future +// invariants need verifying before the listener accepts traffic. +func (b *Backend) Recover(_ context.Context) error { return nil } + +// Drain shuts the log down via the Coordinator: seals the open +// segment, drains the flush queue, and updates per-bucket +// forge_root_cid for every op_root that landed in a flushed +// segment. After Drain returns cleanly, no acked write is +// unrepresented in Postgres. 
+func (b *Backend) Drain(ctx context.Context) error { + if b.txns == nil { + return nil + } + return b.txns.Close(ctx) +} + diff --git a/pkg/ms3t/s3frontend/bucket.go b/pkg/ms3t/s3frontend/bucket.go new file mode 100644 index 0000000..07b0f3c --- /dev/null +++ b/pkg/ms3t/s3frontend/bucket.go @@ -0,0 +1,211 @@ +package s3frontend + +import ( + "context" + "errors" + "sort" + "strings" + "time" + + "github.com/aws/aws-sdk-go-v2/service/s3" + "github.com/ipfs/go-cid" + "github.com/versity/versitygw/s3err" + "github.com/versity/versitygw/s3response" + + "github.com/storacha/sprue/pkg/ms3t/mst" + "github.com/storacha/sprue/pkg/ms3t/registry" +) + +func (b *Backend) ListBuckets(ctx context.Context, input s3response.ListBucketsInput) (s3response.ListAllMyBucketsResult, error) { + states, err := b.reg.List(ctx) + if err != nil { + return s3response.ListAllMyBucketsResult{}, err + } + sort.Slice(states, func(i, j int) bool { return states[i].Name < states[j].Name }) + + var entries []s3response.ListAllMyBucketsEntry + var cToken string + for _, st := range states { + if input.Prefix != "" && !strings.HasPrefix(st.Name, input.Prefix) { + continue + } + if st.Name <= input.ContinuationToken { + continue + } + if input.MaxBuckets > 0 && int32(len(entries)) == input.MaxBuckets { + cToken = entries[len(entries)-1].Name + break + } + entries = append(entries, s3response.ListAllMyBucketsEntry{ + Name: st.Name, + CreationDate: time.Unix(st.CreatedAt, 0), + }) + } + + return s3response.ListAllMyBucketsResult{ + Buckets: s3response.ListAllMyBucketsList{Bucket: entries}, + Owner: s3response.CanonicalUser{ID: input.Owner}, + Prefix: input.Prefix, + ContinuationToken: cToken, + }, nil +} + +// GetBucketAcl is invoked on every object op via versitygw's ParseAcl +// middleware to capture the bucket owner before the controller runs +// (acl-parser.go:30). 
We don't model ACLs — but returning the +// BackendUnsupported default (ErrNotImplemented) propagates as +// "header you provided implies functionality that is not implemented" +// for *every* PUT/GET/DELETE. Returning empty bytes for a known +// bucket lets ParseACL produce ACL{}, after which the middleware +// substitutes the configured root access key as the owner. +func (b *Backend) GetBucketAcl(ctx context.Context, input *s3.GetBucketAclInput) ([]byte, error) { + if input.Bucket == nil { + return nil, s3err.GetAPIError(s3err.ErrInvalidBucketName) + } + if _, err := b.reg.Get(ctx, *input.Bucket); err != nil { + if errors.Is(err, registry.ErrNotFound) { + return nil, s3err.GetAPIError(s3err.ErrNoSuchBucket) + } + return nil, err + } + return nil, nil +} + +// GetObjectLockConfiguration is called from auth.CheckObjectAccess +// (object_lock.go:223) on every object PUT/DELETE. The caller only +// tolerates ErrObjectLockConfigurationNotFound; ErrNotImplemented +// propagates as "header you provided implies functionality not +// implemented" — ms3t doesn't model object lock today, so the +// honest answer is "no configuration." +func (b *Backend) GetObjectLockConfiguration(ctx context.Context, bucket string) ([]byte, error) { + if _, err := b.reg.Get(ctx, bucket); err != nil { + if errors.Is(err, registry.ErrNotFound) { + return nil, s3err.GetAPIError(s3err.ErrNoSuchBucket) + } + return nil, err + } + return nil, s3err.GetAPIError(s3err.ErrObjectLockConfigurationNotFound) +} + +// GetBucketPolicy is called from auth.VerifyAccess (access-control.go:103) +// for non-root requests and from auth.VerifyPublicAccess for anonymous +// ones. Authenticated root requests short-circuit before this is hit +// today, but stubbing it now keeps non-root authz paths from tripping +// the same NotImplemented trap. 
+func (b *Backend) GetBucketPolicy(ctx context.Context, bucket string) ([]byte, error) { + if _, err := b.reg.Get(ctx, bucket); err != nil { + if errors.Is(err, registry.ErrNotFound) { + return nil, s3err.GetAPIError(s3err.ErrNoSuchBucket) + } + return nil, err + } + return nil, s3err.GetAPIError(s3err.ErrNoSuchBucketPolicy) +} + +// GetBucketVersioning is called from auth.CheckObjectAccess +// (object_lock.go:220, 257). Both call sites tolerate any error by +// treating versioning as disabled, so we could leave the default +// ErrNotImplemented — but returning a clean "Suspended" status is +// less noisy in logs and makes the no-op intent explicit. +func (b *Backend) GetBucketVersioning(ctx context.Context, bucket string) (s3response.GetBucketVersioningOutput, error) { + if _, err := b.reg.Get(ctx, bucket); err != nil { + if errors.Is(err, registry.ErrNotFound) { + return s3response.GetBucketVersioningOutput{}, s3err.GetAPIError(s3err.ErrNoSuchBucket) + } + return s3response.GetBucketVersioningOutput{}, err + } + return s3response.GetBucketVersioningOutput{}, nil +} + +func (b *Backend) HeadBucket(ctx context.Context, input *s3.HeadBucketInput) (*s3.HeadBucketOutput, error) { + if input.Bucket == nil { + return nil, s3err.GetAPIError(s3err.ErrInvalidBucketName) + } + if _, err := b.reg.Get(ctx, *input.Bucket); err != nil { + if errors.Is(err, registry.ErrNotFound) { + return nil, s3err.GetAPIError(s3err.ErrNoSuchBucket) + } + return nil, err + } + return &s3.HeadBucketOutput{}, nil +} + +func (b *Backend) CreateBucket(ctx context.Context, input *s3.CreateBucketInput, _ []byte) error { + if input.Bucket == nil { + return s3err.GetAPIError(s3err.ErrInvalidBucketName) + } + // strings.Clone: versitygw passes us a fiber.Ctx.Params() string + // whose backing buffer is recycled when the request completes. + // Storing it directly in the registry produces map-key corruption + // once the buffer is reused for the next request. 
+ name := strings.Clone(*input.Bucket) + if !validBucketName(name) { + return s3err.GetAPIError(s3err.ErrInvalidBucketName) + } + if err := b.reg.Create(ctx, name, time.Now().Unix()); err != nil { + if errors.Is(err, registry.ErrExists) { + return s3err.GetAPIError(s3err.ErrBucketAlreadyExists) + } + return err + } + return nil +} + +func (b *Backend) DeleteBucket(ctx context.Context, name string) error { + return b.txns.WithLock(ctx, name, func(ctx context.Context) error { + st, err := b.reg.Get(ctx, name) + if err != nil { + if errors.Is(err, registry.ErrNotFound) { + return s3err.GetAPIError(s3err.ErrNoSuchBucket) + } + return err + } + + // S3 forbids deleting non-empty buckets. Walk the MST until + // we see any leaf, then bail. + if st.Root.Defined() { + t := mst.LoadMST(b.read, st.Root) + var seen bool + walkErr := t.WalkLeavesFromNocache(ctx, "", func(string, cid.Cid) error { + seen = true + return mst.ErrStopWalk + }) + if walkErr != nil { + return walkErr + } + if seen { + return s3err.GetAPIError(s3err.ErrBucketNotEmpty) + } + } + + if err := b.reg.Delete(ctx, name); err != nil { + if errors.Is(err, registry.ErrNotFound) { + return s3err.GetAPIError(s3err.ErrNoSuchBucket) + } + return err + } + return nil + }) +} + +// validBucketName mirrors the rules from the prior bucket.Service: +// 3-63 chars, lowercase letters, digits, dots, dashes; cannot begin +// with a dot or dash. This is the S3 DNS-compliant subset. 
+func validBucketName(s string) bool {
+	// len counts bytes; every accepted character below is single-byte
+	// ASCII, so for any name that passes, bytes and characters agree.
+	if len(s) < 3 || len(s) > 63 {
+		return false
+	}
+	for i, r := range s {
+		switch {
+		case r >= 'a' && r <= 'z':
+		case r >= '0' && r <= '9':
+		case r == '-' || r == '.':
+			// Dash and dot are accepted anywhere except position 0.
+			if i == 0 {
+				return false
+			}
+		default:
+			// Uppercase, underscore, non-ASCII, etc.
+			return false
+		}
+	}
+	return true
+}
diff --git a/pkg/ms3t/s3frontend/object.go b/pkg/ms3t/s3frontend/object.go
new file mode 100644
index 0000000..a5485a0
--- /dev/null
+++ b/pkg/ms3t/s3frontend/object.go
@@ -0,0 +1,441 @@
+package s3frontend
+
+import (
+	"context"
+	"encoding/hex"
+	"errors"
+	"fmt"
+	"strings"
+	"time"
+
+	"github.com/aws/aws-sdk-go-v2/service/s3"
+	"github.com/aws/aws-sdk-go-v2/service/s3/types"
+	"github.com/ipfs/go-cid"
+	"github.com/versity/versitygw/backend"
+	"github.com/versity/versitygw/s3err"
+	"github.com/versity/versitygw/s3response"
+
+	msbucket "github.com/storacha/sprue/pkg/ms3t/bucket"
+	"github.com/storacha/sprue/pkg/ms3t/bucketop"
+	"github.com/storacha/sprue/pkg/ms3t/mst"
+	"github.com/storacha/sprue/pkg/ms3t/registry"
+)
+
+// defaultMaxKeys is the page size the list operations fall back to
+// when the request's MaxKeys is unset or not positive.
+const defaultMaxKeys = 1000
+
+// PutObject writes an object. Tagging, user metadata, ACLs,
+// checksums, retention, and preconditions are dropped on the floor
+// for now — the manifest schema has no place for them yet (see
+// bucket-metadata.rfc §"Canonical state vs service state"). ETag is
+// the hex sha256 of the body, quoted per S3 wire format.
+func (b *Backend) PutObject(ctx context.Context, input s3response.PutObjectInput) (s3response.PutObjectOutput, error) { + if input.Bucket == nil { + return s3response.PutObjectOutput{}, s3err.GetAPIError(s3err.ErrInvalidBucketName) + } + if input.Key == nil { + return s3response.PutObjectOutput{}, s3err.GetAPIError(s3err.ErrNoSuchKey) + } + bucketName := *input.Bucket + key := *input.Key + if !mst.IsValidKey(key) { + return s3response.PutObjectOutput{}, s3err.GetAPIError(s3err.ErrInvalidArgument) + } + + contentType := backend.GetStringFromPtr(input.ContentType) + if contentType == "" { + contentType = "application/octet-stream" + } + + // mf is captured by the closure and read after WithTx commits, + // so we can build the response (ETag = sha256, size) from the + // same manifest that was committed. + var mf *msbucket.ObjectManifest + + err := b.txns.WithTx(ctx, bucketName, func(ctx context.Context, tx *bucketop.Tx) (cid.Cid, error) { + bodyRec, err := b.codec.Chunk(ctx, tx, input.Body) + if err != nil { + return cid.Undef, fmt.Errorf("chunk body: %w", err) + } + mf = &msbucket.ObjectManifest{ + Key: key, + ContentType: contentType, + Created: time.Now().Unix(), + Body: bodyRec, + } + mfCid, err := tx.Put(ctx, mf) + if err != nil { + return cid.Undef, fmt.Errorf("manifest put: %w", err) + } + + t := tx.LoadTree() + t2, err := t.Add(ctx, key, mfCid, -1) + if errors.Is(err, mst.ErrAlreadyExists) { + t2, err = t.Update(ctx, key, mfCid) + } + if err != nil { + return cid.Undef, fmt.Errorf("mst write: %w", err) + } + + return t2.GetPointer(ctx, tx) + }) + if err != nil { + if errors.Is(err, bucketop.ErrBucketNotFound) { + return s3response.PutObjectOutput{}, s3err.GetAPIError(s3err.ErrNoSuchBucket) + } + return s3response.PutObjectOutput{}, fmt.Errorf("s3frontend: put: %w", err) + } + + size := mf.Body.Size + return s3response.PutObjectOutput{ + ETag: etagOf(mf), + Size: &size, + }, nil +} + +// HeadObject returns the manifest's metadata. 
Range, partNumber, +// preconditions, versioning, and checksums are not implemented. +func (b *Backend) HeadObject(ctx context.Context, input *s3.HeadObjectInput) (*s3.HeadObjectOutput, error) { + if input.Bucket == nil { + return nil, s3err.GetAPIError(s3err.ErrInvalidBucketName) + } + if input.Key == nil { + return nil, s3err.GetAPIError(s3err.ErrNoSuchKey) + } + mf, err := b.lookupManifest(ctx, *input.Bucket, *input.Key) + if err != nil { + return nil, err + } + etag := etagOf(mf) + size := mf.Body.Size + lastModified := time.Unix(mf.Created, 0) + contentType := mf.ContentType + return &s3.HeadObjectOutput{ + AcceptRanges: backend.GetPtrFromString("bytes"), + ContentLength: &size, + ContentType: &contentType, + ETag: &etag, + LastModified: &lastModified, + StorageClass: types.StorageClassStandard, + }, nil +} + +// GetObject returns an object body, optionally restricted to a byte +// range supplied via the Range header. The body io.ReadCloser is +// owned by the caller (versitygw closes it after streaming). 
+func (b *Backend) GetObject(ctx context.Context, input *s3.GetObjectInput) (*s3.GetObjectOutput, error) { + if input.Bucket == nil { + return nil, s3err.GetAPIError(s3err.ErrInvalidBucketName) + } + if input.Key == nil { + return nil, s3err.GetAPIError(s3err.ErrNoSuchKey) + } + mf, err := b.lookupManifest(ctx, *input.Bucket, *input.Key) + if err != nil { + return nil, err + } + + objSize := mf.Body.Size + startOffset, length, isRange, err := backend.ParseObjectRange(objSize, backend.GetStringFromPtr(input.Range)) + if err != nil { + return nil, err + } + + var contentRange *string + var body = b.codec.Open(ctx, b.read, mf.Body) + if isRange { + body = b.codec.OpenRange(ctx, b.read, mf.Body, startOffset, startOffset+length-1) + cr := fmt.Sprintf("bytes %d-%d/%d", startOffset, startOffset+length-1, objSize) + contentRange = &cr + } + + etag := etagOf(mf) + lastModified := time.Unix(mf.Created, 0) + contentType := mf.ContentType + return &s3.GetObjectOutput{ + AcceptRanges: backend.GetPtrFromString("bytes"), + Body: body, + ContentLength: &length, + ContentType: &contentType, + ContentRange: contentRange, + ETag: &etag, + LastModified: &lastModified, + StorageClass: types.StorageClassStandard, + }, nil +} + +// DeleteObject removes an object. Missing keys are no-ops (matching +// S3's idempotent DELETE semantics). +func (b *Backend) DeleteObject(ctx context.Context, input *s3.DeleteObjectInput) (*s3.DeleteObjectOutput, error) { + if input.Bucket == nil { + return nil, s3err.GetAPIError(s3err.ErrInvalidBucketName) + } + if input.Key == nil { + return nil, s3err.GetAPIError(s3err.ErrNoSuchKey) + } + bucketName := *input.Bucket + key := *input.Key + + err := b.txns.WithTx(ctx, bucketName, func(ctx context.Context, tx *bucketop.Tx) (cid.Cid, error) { + // Empty bucket: nothing to delete. Returning cid.Undef from + // the closure tells WithTx to discard cleanly with no + // commit — the equivalent of "no-op success." 
+ if !tx.State().Root.Defined() { + return cid.Undef, nil + } + t := tx.LoadTree() + t2, err := t.Delete(ctx, key) + if errors.Is(err, mst.ErrNotFound) { + // Idempotent DELETE: missing key isn't an error. + return cid.Undef, nil + } + if err != nil { + return cid.Undef, fmt.Errorf("mst delete: %w", err) + } + return t2.GetPointer(ctx, tx) + }) + if err != nil { + if errors.Is(err, bucketop.ErrBucketNotFound) { + return nil, s3err.GetAPIError(s3err.ErrNoSuchBucket) + } + return nil, fmt.Errorf("s3frontend: delete: %w", err) + } + return &s3.DeleteObjectOutput{}, nil +} + +// ListObjects (V1) walks the MST in lexicographic order, applying +// S3-style prefix / delimiter filtering with V1's Marker-based +// pagination. +func (b *Backend) ListObjects(ctx context.Context, input *s3.ListObjectsInput) (s3response.ListObjectsResult, error) { + if input.Bucket == nil { + return s3response.ListObjectsResult{}, s3err.GetAPIError(s3err.ErrInvalidBucketName) + } + bucketName := *input.Bucket + prefix := backend.GetStringFromPtr(input.Prefix) + delimiter := backend.GetStringFromPtr(input.Delimiter) + marker := backend.GetStringFromPtr(input.Marker) + + maxKeys := int32(0) + if input.MaxKeys != nil { + maxKeys = *input.MaxKeys + } + limit := int(maxKeys) + if limit <= 0 { + limit = defaultMaxKeys + } + + from := prefix + if marker != "" && marker > from { + // V1 Marker: list strictly after this key. + from = marker + "\x01" + } + + res, err := b.listWalk(ctx, bucketName, prefix, delimiter, from, limit) + if err != nil { + return s3response.ListObjectsResult{}, err + } + + out := s3response.ListObjectsResult{ + Name: &bucketName, + Prefix: &prefix, + Delimiter: &delimiter, + MaxKeys: &maxKeys, + IsTruncated: &res.truncated, + Contents: res.contents, + CommonPrefixes: res.commonPrefixes, + } + if input.Marker != nil { + out.Marker = input.Marker + } + // NextMarker is only set when delimiter is specified and the + // page was truncated, per AWS docs. 
Without delimiter, callers + // use the last Key in Contents as the marker for the next page. + if res.truncated && delimiter != "" && res.nextKey != "" { + next := res.nextKey + out.NextMarker = &next + } + return out, nil +} + +// ListObjectsV2 walks the MST in lexicographic order, applying +// S3-style prefix and delimiter filtering with V2's +// ContinuationToken-based pagination. +func (b *Backend) ListObjectsV2(ctx context.Context, input *s3.ListObjectsV2Input) (s3response.ListObjectsV2Result, error) { + if input.Bucket == nil { + return s3response.ListObjectsV2Result{}, s3err.GetAPIError(s3err.ErrInvalidBucketName) + } + bucketName := *input.Bucket + prefix := backend.GetStringFromPtr(input.Prefix) + delimiter := backend.GetStringFromPtr(input.Delimiter) + + // ContinuationToken (resumption) takes precedence over StartAfter + // (first-page hint) per S3 semantics. + startAfter := backend.GetStringFromPtr(input.StartAfter) + if input.ContinuationToken != nil && *input.ContinuationToken != "" { + startAfter = *input.ContinuationToken + } + + maxKeys := int32(0) + if input.MaxKeys != nil { + maxKeys = *input.MaxKeys + } + limit := int(maxKeys) + if limit <= 0 { + limit = defaultMaxKeys + } + + from := prefix + if startAfter != "" && startAfter > from { + // Walk strictly past startAfter by appending a low byte. 
+ from = startAfter + "\x01" + } + + res, err := b.listWalk(ctx, bucketName, prefix, delimiter, from, limit) + if err != nil { + return s3response.ListObjectsV2Result{}, err + } + + keyCount := int32(len(res.contents) + len(res.commonPrefixes)) + out := s3response.ListObjectsV2Result{ + Name: &bucketName, + Prefix: &prefix, + Delimiter: &delimiter, + MaxKeys: &maxKeys, + KeyCount: &keyCount, + IsTruncated: &res.truncated, + Contents: res.contents, + CommonPrefixes: res.commonPrefixes, + } + if input.ContinuationToken != nil { + out.ContinuationToken = input.ContinuationToken + } + if input.StartAfter != nil { + out.StartAfter = input.StartAfter + } + if res.truncated && res.nextKey != "" { + next := res.nextKey + out.NextContinuationToken = &next + } + return out, nil +} + +// listWalkResult is the shared output of one MST walk for V1 and V2 +// list. nextKey is the last key (or common prefix) that ended the +// page when truncated; empty when the walk completed. +type listWalkResult struct { + contents []s3response.Object + commonPrefixes []types.CommonPrefix + truncated bool + nextKey string +} + +// listWalk drives a single MST walk shared by ListObjects and +// ListObjectsV2. The version-specific pieces (Marker vs. +// ContinuationToken / StartAfter, NextMarker vs. +// NextContinuationToken) live in the callers; this helper only +// understands prefix, delimiter, and the [from, ...) starting key. 
+func (b *Backend) listWalk(ctx context.Context, bucketName, prefix, delimiter, from string, limit int) (listWalkResult, error) { + out := listWalkResult{ + contents: []s3response.Object{}, + commonPrefixes: []types.CommonPrefix{}, + } + + st, err := b.reg.Get(ctx, bucketName) + if err != nil { + if errors.Is(err, registry.ErrNotFound) { + return out, s3err.GetAPIError(s3err.ErrNoSuchBucket) + } + return out, err + } + if !st.Root.Defined() { + return out, nil + } + + t := mst.LoadMST(b.read, st.Root) + seenPrefix := map[string]struct{}{} + walkErr := t.WalkLeavesFromNocache(ctx, from, func(k string, mfCid cid.Cid) error { + if prefix != "" && !strings.HasPrefix(k, prefix) { + return mst.ErrStopWalk + } + + if delimiter != "" { + tail := k[len(prefix):] + if i := strings.Index(tail, delimiter); i >= 0 { + cp := prefix + tail[:i+len(delimiter)] + if _, dup := seenPrefix[cp]; !dup { + seenPrefix[cp] = struct{}{} + cpCopy := cp + out.commonPrefixes = append(out.commonPrefixes, types.CommonPrefix{Prefix: &cpCopy}) + if len(out.contents)+len(out.commonPrefixes) >= limit { + out.truncated = true + out.nextKey = cp + return mst.ErrStopWalk + } + } + return nil + } + } + + var mf msbucket.ObjectManifest + if err := b.read.Get(ctx, mfCid, &mf); err != nil { + return fmt.Errorf("manifest get %s: %w", mfCid, err) + } + key := k + etag := etagOf(&mf) + size := mf.Body.Size + lastModified := time.Unix(mf.Created, 0) + out.contents = append(out.contents, s3response.Object{ + Key: &key, + ETag: &etag, + Size: &size, + LastModified: &lastModified, + StorageClass: types.ObjectStorageClassStandard, + }) + if len(out.contents)+len(out.commonPrefixes) >= limit { + out.truncated = true + out.nextKey = k + return mst.ErrStopWalk + } + return nil + }) + if walkErr != nil { + return out, fmt.Errorf("s3frontend: walk: %w", walkErr) + } + return out, nil +} + +// lookupManifest is the shared HEAD/GET path: registry → MST → CBOR +// decode of the manifest pointed at by the leaf. 
Maps "missing +// bucket" / "missing key" to S3 errors. +func (b *Backend) lookupManifest(ctx context.Context, bucketName, key string) (*msbucket.ObjectManifest, error) { + st, err := b.reg.Get(ctx, bucketName) + if err != nil { + if errors.Is(err, registry.ErrNotFound) { + return nil, s3err.GetAPIError(s3err.ErrNoSuchBucket) + } + return nil, err + } + if !st.Root.Defined() { + return nil, s3err.GetAPIError(s3err.ErrNoSuchKey) + } + t := mst.LoadMST(b.read, st.Root) + mfCid, err := t.Get(ctx, key) + if errors.Is(err, mst.ErrNotFound) { + return nil, s3err.GetAPIError(s3err.ErrNoSuchKey) + } + if err != nil { + return nil, fmt.Errorf("s3frontend: mst get: %w", err) + } + var mf msbucket.ObjectManifest + if err := b.read.Get(ctx, mfCid, &mf); err != nil { + return nil, fmt.Errorf("s3frontend: manifest get: %w", err) + } + return &mf, nil +} + +// etagOf returns the manifest's body sha256 as a hex string with +// surrounding double quotes — the format clients expect on the wire. +// Multipart-style ETags ("-") are out of scope until multipart +// is implemented. 
+func etagOf(mf *msbucket.ObjectManifest) string {
+	// Raw-string quotes keep the literal double-quote characters
+	// readable without escaping.
+	return `"` + hex.EncodeToString(mf.Body.SHA256) + `"`
+}
diff --git a/pkg/ms3t/server.go b/pkg/ms3t/server.go
new file mode 100644
index 0000000..cf003cb
--- /dev/null
+++ b/pkg/ms3t/server.go
@@ -0,0 +1,319 @@
+package ms3t
+
+import (
+	"context"
+	"errors"
+	"fmt"
+	"path/filepath"
+	"time"
+
+	"github.com/ipfs/go-cid"
+	"github.com/multiformats/go-multihash"
+	"github.com/versity/versitygw/auth"
+	"github.com/versity/versitygw/metrics"
+	"github.com/versity/versitygw/s3api"
+	"github.com/versity/versitygw/s3api/middlewares"
+	"github.com/versity/versitygw/s3event"
+	"github.com/versity/versitygw/s3log"
+	"go.uber.org/zap"
+
+	"github.com/storacha/sprue/pkg/ms3t/blockstore"
+	msbucket "github.com/storacha/sprue/pkg/ms3t/bucket"
+	"github.com/storacha/sprue/pkg/ms3t/logstore"
+	"github.com/storacha/sprue/pkg/ms3t/registry"
+	"github.com/storacha/sprue/pkg/ms3t/s3frontend"
+	"github.com/storacha/sprue/pkg/ms3t/uploader"
+)
+
+// ServerConfig captures the user-facing knobs of an ms3t S3 listener.
+// New() applies defaults for any zero-valued knobs. SealAge is in
+// time.Duration form because callers parse the string config field
+// once before constructing the server.
+type ServerConfig struct {
+	// Addr is the host:port to bind the S3 listener to. Required.
+	Addr string
+
+	// DataDir is where the log writes its segments dir; the caller
+	// is responsible for creating this directory before calling New.
+	// Required. New places segments under DataDir/segments.
+	DataDir string
+
+	// Region is the AWS region advertised over sigv4. Defaults to
+	// "us-east-1".
+	Region string
+
+	// RootAccess / RootSecret configure the single-account IAM root
+	// user for the embedded S3 listener. Both required.
+	RootAccess string
+	RootSecret string
+
+	// ChunkSize is the body chunk size for new objects, in bytes.
+	// 0 → bucket.DefaultChunkSize.
+	ChunkSize int64
+
+	// SealBytes / SealAge / Retain are passed through to logstore.Open.
+	// Zero values pick logstore defaults (64 MiB / 5 s / 6 segments).
+	SealBytes int64
+	SealAge   time.Duration
+	Retain    int
+
+	// MaxConnections / MaxRequests configure versitygw's hard
+	// concurrency limit. Zero is unsafe (yields 503 SlowDown on every
+	// request), so New substitutes a sensible default.
+	MaxConnections int
+	MaxRequests    int
+}
+
+// ServerDeps bundles the runtime collaborators of an ms3t Server
+// behind interfaces. Production wiring uses Forge / Internal /
+// Postgres; tests can substitute in-memory equivalents without
+// standing up Postgres, piri, or the indexing-service.
+type ServerDeps struct {
+	// Logger is optional; defaults to zap.NewNop().
+	Logger *zap.Logger
+
+	// BaseBlockReader is the bottom tier of the layered read path —
+	// what the log falls through to on misses. In production this is
+	// *blockstore.Forge (network-backed via indexer + piri); in tests
+	// it can be any IpldBlockstore.
+	// NOTE(review): the field's declared type is blockstore.BlockReader;
+	// confirm "IpldBlockstore" above names a type that satisfies it.
+	BaseBlockReader blockstore.BlockReader
+
+	// Uploader is the destination for sealed segments.
+	Uploader uploader.Uploader
+
+	// Registry tracks per-bucket roots. *registry.Postgres satisfies
+	// both Registry and Meta in production; tests can supply two
+	// separate implementations or one that does both.
+	Registry registry.Registry
+
+	// Meta is the persistence backing for log-segment metadata.
+	// Typically the same instance as Registry.
+	Meta logstore.Meta
+}
+
+// Server is a fully-wired ms3t S3 listener. Use Start/Stop for
+// lifecycle. fx callers wrap these in OnStart/OnStop hooks; tests
+// call them directly.
+type Server struct {
+	// cfg is the configuration with defaults already applied.
+	cfg ServerConfig
+	// logger is never nil; New substitutes zap.NewNop() when the
+	// caller supplies none.
+	logger *zap.Logger
+	// log is the segment log that New opens via logstore.Open.
+	log blockstore.Log
+	// backend is the S3 backend that the listener serves from.
+	backend *s3frontend.Backend
+	// api is the versitygw S3 API server bound to cfg.Addr on Start.
+	api *s3api.S3ApiServer
+}
+
+// New wires a ServerDeps + ServerConfig into a runnable Server. The
+// caller is responsible for ensuring cfg.DataDir exists before
+// calling.
+func New(ctx context.Context, cfg ServerConfig, deps ServerDeps) (*Server, error) {
+	if err := validateServerInputs(cfg, deps); err != nil {
+		return nil, err
+	}
+	cfg = applyServerDefaults(cfg)
+
+	// A missing logger degrades to a no-op one rather than failing.
+	lg := deps.Logger
+	if lg == nil {
+		lg = zap.NewNop()
+	}
+
+	// Open the segment log; sealed segments are handed to the flush
+	// closure built from the uploader and the metadata store.
+	segLog, err := logstore.Open(ctx, logstore.Config{
+		Dir:       filepath.Join(cfg.DataDir, "segments"),
+		Meta:      deps.Meta,
+		SealBytes: cfg.SealBytes,
+		SealAge:   cfg.SealAge,
+		Retain:    cfg.Retain,
+		Flush:     newFlushFunc(deps.Uploader, deps.Meta),
+		Logger:    lg,
+	})
+	if err != nil {
+		return nil, fmt.Errorf("ms3t: logstore: %w", err)
+	}
+
+	// The backend reads through the log first and falls back to the
+	// base reader; it writes into the same log.
+	be := s3frontend.New(
+		deps.Registry,
+		blockstore.NewLayered(segLog, deps.BaseBlockReader),
+		segLog,
+		&msbucket.FixedChunker{ChunkSize: cfg.ChunkSize},
+	)
+
+	api, err := buildS3API(ctx, be, cfg)
+	if err != nil {
+		// No Server is handed back on this path, so nobody could call
+		// Stop: close the already-open log here, best effort.
+		_ = segLog.Close(ctx)
+		return nil, err
+	}
+
+	srv := &Server{
+		cfg:     cfg,
+		logger:  lg,
+		log:     segLog,
+		backend: be,
+		api:     api,
+	}
+	return srv, nil
+}
+
+// Start runs Backend.Recover and then launches the S3 listener in its
+// own goroutine. It returns as soon as that goroutine has been
+// started; it does NOT wait for the listener to be accepting
+// connections on Addr. Listener errors are logged, not returned.
+func (s *Server) Start(ctx context.Context) error {
+	if err := s.backend.Recover(ctx); err != nil {
+		return fmt.Errorf("ms3t: recover: %w", err)
+	}
+
+	s.logger.Info("starting ms3t S3 listener",
+		zap.String("addr", s.cfg.Addr),
+		zap.String("region", s.cfg.Region),
+		zap.String("data_dir", s.cfg.DataDir),
+		zap.Int64("chunk_size", s.cfg.ChunkSize),
+	)
+
+	serve := func() {
+		err := s.api.ServeMultiPort([]string{s.cfg.Addr})
+		if err == nil {
+			return
+		}
+		s.logger.Error("ms3t listener error", zap.Error(err))
+	}
+	go serve()
+	return nil
+}
+
+// Stop shuts the listener down and drains the log.
Always returns +// the combined error of the two operations so callers see all +// failure modes; either alone is non-fatal to the other. +func (s *Server) Stop(ctx context.Context) error { + s.logger.Info("shutting down ms3t S3 listener") + + var errs []error + if err := s.api.ShutDown(); err != nil { + errs = append(errs, fmt.Errorf("s3api shutdown: %w", err)) + } + if err := s.backend.Drain(ctx); err != nil { + errs = append(errs, fmt.Errorf("backend drain: %w", err)) + } + if len(errs) > 0 { + return fmt.Errorf("ms3t shutdown: %v", errs) + } + return nil +} + +// newFlushFunc captures uploader + meta into the closure passed to +// logstore.Open. Each sealed segment becomes one Forge round trip +// (CAR + index + indexer claim) plus one Postgres tx that flips the +// segment row to flushed and advances each affected bucket's +// forge_root_cid. +// +// The sealed CAR file is the wire payload — uploader.SubmitCAR +// streams it directly into the HTTP PUT, and the segment's +// already-computed digest and append-time position table feed +// allocate/accept and the index view without rescanning the file. +func newFlushFunc(up uploader.Uploader, meta logstore.Meta) logstore.FlushFunc { + return func(ctx context.Context, seg *logstore.Segment) error { + opRoots := seg.OpRoots() + positions := seg.BlockPositions() + if len(positions) == 0 || len(opRoots) == 0 { + // Empty or no-op segment (e.g., force-sealed during a + // quiet startup). Mark flushed so retention can sweep + // it; no Forge ship and no forge_root advance are + // needed. + return meta.MarkSegmentFlushed(ctx, seg.Seq(), time.Now().Unix(), nil) + } + // Segment stores the raw 32-byte SHA-256 of the CAR file; + // the uploader and ShardedDagIndexView want the multihash + // form (varint code + length + digest). 
+ sha, err := multihash.Encode(seg.SHA256(), multihash.SHA2_256) + if err != nil { + return fmt.Errorf("encode segment %d sha: %w", seg.Seq(), err) + } + rootCids := make([]cid.Cid, len(opRoots)) + for i, opr := range opRoots { + rootCids[i] = opr.Root + } + src := uploader.CARSource{ + Path: seg.CARPath(), + Size: seg.Size(), + SHA256: sha, + Positions: positions, + } + if err := up.SubmitCAR(ctx, rootCids, src); err != nil { + return fmt.Errorf("submit segment %d: %w", seg.Seq(), err) + } + return meta.MarkSegmentFlushed(ctx, seg.Seq(), time.Now().Unix(), opRoots) + } +} + +// buildS3API constructs the versitygw S3ApiServer with the wiring +// ms3t needs: single-account IAM, no audit / event sinks, generous +// concurrency limits. +func buildS3API(ctx context.Context, backend *s3frontend.Backend, cfg ServerConfig) (*s3api.S3ApiServer, error) { + rootAcc := auth.Account{ + Access: cfg.RootAccess, + Secret: cfg.RootSecret, + Role: auth.RoleAdmin, + } + iam := auth.NewIAMServiceSingle(rootAcc) + + loggers, err := s3log.InitLogger(&s3log.LogConfig{}) + if err != nil { + return nil, fmt.Errorf("ms3t: loggers: %w", err) + } + evSender, err := s3event.InitEventSender(&s3event.EventConfig{}) + if err != nil { + return nil, fmt.Errorf("ms3t: event sender: %w", err) + } + mm, err := metrics.NewManager(ctx, metrics.Config{}) + if err != nil { + return nil, fmt.Errorf("ms3t: metrics: %w", err) + } + + api, err := s3api.New(backend, + middlewares.RootUserConfig{Access: rootAcc.Access, Secret: rootAcc.Secret}, + cfg.Region, iam, loggers.S3Logger, loggers.AdminLogger, evSender, mm, + s3api.WithQuiet(), + s3api.WithHealth("/health"), + s3api.WithConcurrencyLimiter(cfg.MaxConnections, cfg.MaxRequests), + ) + if err != nil { + return nil, fmt.Errorf("ms3t: s3api: %w", err) + } + return api, nil +} + +func validateServerInputs(cfg ServerConfig, deps ServerDeps) error { + if cfg.Addr == "" { + return errors.New("ms3t: ServerConfig.Addr is required") + } + if cfg.DataDir == "" { + 
return errors.New("ms3t: ServerConfig.DataDir is required") + } + if cfg.RootAccess == "" || cfg.RootSecret == "" { + return errors.New("ms3t: ServerConfig.RootAccess and ServerConfig.RootSecret are required") + } + if deps.BaseBlockReader == nil { + return errors.New("ms3t: ServerDeps.BaseBlockReader is required") + } + if deps.Uploader == nil { + return errors.New("ms3t: ServerDeps.Uploader is required") + } + if deps.Registry == nil { + return errors.New("ms3t: ServerDeps.Registry is required") + } + if deps.Meta == nil { + return errors.New("ms3t: ServerDeps.Meta is required") + } + return nil +} + +func applyServerDefaults(cfg ServerConfig) ServerConfig { + if cfg.Region == "" { + cfg.Region = "us-east-1" + } + if cfg.ChunkSize <= 0 { + cfg.ChunkSize = msbucket.DefaultChunkSize + } + // SealBytes / SealAge / Retain pass through to logstore.Open + // untouched; logstore.Config.defaults handles its own fallbacks. + + if cfg.MaxConnections <= 0 { + cfg.MaxConnections = 4096 + } + if cfg.MaxRequests <= 0 { + cfg.MaxRequests = 4096 + } + return cfg +} diff --git a/pkg/ms3t/server/handlers.go b/pkg/ms3t/server/handlers.go deleted file mode 100644 index 28de59e..0000000 --- a/pkg/ms3t/server/handlers.go +++ /dev/null @@ -1,281 +0,0 @@ -package server - -import ( - "encoding/hex" - "errors" - "fmt" - "io" - "net/http" - "strconv" - "strings" - "time" - - "github.com/storacha/sprue/pkg/ms3t/bucket" -) - -const httpTimeFormat = http.TimeFormat // RFC1123 GMT - -// --- Buckets --- - -func (h *Handler) listBuckets(w http.ResponseWriter, r *http.Request) { - states, err := h.svc.ListBuckets(reqCtx(r)) - if err != nil { - writeServiceError(w, r, err) - return - } - resp := ListAllMyBucketsResult{ - Xmlns: s3Namespace, - Owner: bucketsOwner{ID: "ms3t", DisplayName: "ms3t"}, - } - for _, st := range states { - resp.Buckets.Bucket = append(resp.Buckets.Bucket, bucketEntry{ - Name: st.Name, - CreationDate: time.Unix(st.CreatedAt, 0).UTC().Format(time.RFC3339), - }) - } - 
writeXML(w, http.StatusOK, resp) -} - -func (h *Handler) createBucket(w http.ResponseWriter, r *http.Request, name string) { - err := h.svc.CreateBucket(reqCtx(r), name) - if err != nil && !errors.Is(err, bucket.ErrBucketExists) { - writeServiceError(w, r, err) - return - } - w.Header().Set("Location", "/"+name) - w.WriteHeader(http.StatusOK) -} - -func (h *Handler) deleteBucket(w http.ResponseWriter, r *http.Request, name string) { - if err := h.svc.DeleteBucket(reqCtx(r), name); err != nil { - writeServiceError(w, r, err) - return - } - w.WriteHeader(http.StatusNoContent) -} - -func (h *Handler) headBucket(w http.ResponseWriter, r *http.Request, name string) { - if _, err := h.svc.List(reqCtx(r), name, bucket.ListOptions{MaxKeys: 1}); err != nil { - writeServiceError(w, r, err) - return - } - w.WriteHeader(http.StatusOK) -} - -// --- Objects --- - -func (h *Handler) putObject(w http.ResponseWriter, r *http.Request, name, key string) { - defer r.Body.Close() - - // AWS SDKs default to chunked aws-chunked encoding, which we do NOT - // decode here. Clients must disable streaming/chunked uploads or upload - // small bodies in a single PUT. 
- if v := r.Header.Get("x-amz-content-sha256"); v == "STREAMING-AWS4-HMAC-SHA256-PAYLOAD" || v == "STREAMING-UNSIGNED-PAYLOAD-TRAILER" { - writeError(w, http.StatusNotImplemented, "NotImplemented", - "chunked aws-chunked uploads are not yet supported; configure the client to send unsigned/non-chunked payloads", - r.URL.Path) - return - } - - mf, err := h.svc.PutObject(reqCtx(r), name, key, r.Body, r.Header.Get("Content-Type")) - if err != nil { - writeServiceError(w, r, err) - return - } - w.Header().Set("ETag", etag(mf)) - w.WriteHeader(http.StatusOK) -} - -func (h *Handler) getObject(w http.ResponseWriter, r *http.Request, name, key string) { - rng, rangeErr := parseRange(r.Header.Get("Range")) - if rangeErr != nil { - writeError(w, http.StatusRequestedRangeNotSatisfiable, "InvalidRange", - "invalid Range header", r.URL.Path) - return - } - - body, mf, err := h.svc.GetObject(reqCtx(r), name, key, rng) - if err != nil { - if errors.Is(err, bucket.ErrInvalidRange) { - // We have the manifest; advertise the actual size for clients. 
- if mf != nil { - w.Header().Set("Content-Range", fmt.Sprintf("bytes */%d", mf.Body.Size)) - } - writeError(w, http.StatusRequestedRangeNotSatisfiable, "InvalidRange", - "requested range not satisfiable", r.URL.Path) - return - } - writeServiceError(w, r, err) - return - } - defer body.Close() - - if rng != nil { - writeRangeHeaders(w, mf, rng) - w.WriteHeader(http.StatusPartialContent) - } else { - writeObjectHeaders(w, mf) - w.WriteHeader(http.StatusOK) - } - if _, err := io.Copy(w, body); err != nil { - h.log.Warn("getobject body copy", "err", err, "key", key) - } -} - -func (h *Handler) headObject(w http.ResponseWriter, r *http.Request, name, key string) { - mf, err := h.svc.HeadObject(reqCtx(r), name, key) - if err != nil { - writeServiceError(w, r, err) - return - } - writeObjectHeaders(w, mf) - w.WriteHeader(http.StatusOK) -} - -func (h *Handler) deleteObject(w http.ResponseWriter, r *http.Request, name, key string) { - if err := h.svc.DeleteObject(reqCtx(r), name, key); err != nil { - writeServiceError(w, r, err) - return - } - w.WriteHeader(http.StatusNoContent) -} - -func etag(mf *bucket.ObjectManifest) string { - return `"` + hex.EncodeToString(mf.Body.SHA256) + `"` -} - -func writeObjectHeaders(w http.ResponseWriter, mf *bucket.ObjectManifest) { - w.Header().Set("Content-Type", mf.ContentType) - w.Header().Set("Content-Length", strconv.FormatInt(mf.Body.Size, 10)) - w.Header().Set("ETag", etag(mf)) - w.Header().Set("Last-Modified", time.Unix(mf.Created, 0).UTC().Format(httpTimeFormat)) - w.Header().Set("Accept-Ranges", "bytes") -} - -func writeRangeHeaders(w http.ResponseWriter, mf *bucket.ObjectManifest, rng *bucket.Range) { - length := rng.End - rng.Start + 1 - w.Header().Set("Content-Type", mf.ContentType) - w.Header().Set("Content-Length", strconv.FormatInt(length, 10)) - w.Header().Set("ETag", etag(mf)) - w.Header().Set("Last-Modified", time.Unix(mf.Created, 0).UTC().Format(httpTimeFormat)) - w.Header().Set("Accept-Ranges", "bytes") - 
w.Header().Set("Content-Range", - fmt.Sprintf("bytes %d-%d/%d", rng.Start, rng.End, mf.Body.Size)) -} - -// parseRange handles the single-range subset of RFC 7233 that S3 supports: -// "bytes=START-END", "bytes=START-", or "bytes=-SUFFIX". Multi-range -// requests are rejected. Empty header → no range. -// -// Returns (rng, nil) for a valid range, (nil, nil) when no Range header -// is present, (nil, err) on a malformed header. The "suffix" form -// (bytes=-N) cannot be resolved without the body size and is returned -// with Start=-1; the bucket service applies it after loading the manifest. -func parseRange(h string) (*bucket.Range, error) { - if h == "" { - return nil, nil - } - if !strings.HasPrefix(h, "bytes=") { - return nil, errBadRange - } - spec := strings.TrimPrefix(h, "bytes=") - if strings.Contains(spec, ",") { - return nil, errBadRange // multi-range not supported - } - dash := strings.IndexByte(spec, '-') - if dash < 0 { - return nil, errBadRange - } - startStr := spec[:dash] - endStr := spec[dash+1:] - - var start, end int64 = -1, -1 - var err error - if startStr != "" { - start, err = strconv.ParseInt(startStr, 10, 64) - if err != nil || start < 0 { - return nil, errBadRange - } - } - if endStr != "" { - end, err = strconv.ParseInt(endStr, 10, 64) - if err != nil || end < 0 { - return nil, errBadRange - } - } - - switch { - case startStr != "" && endStr != "": - // "bytes=START-END" - if end < start { - return nil, errBadRange - } - return &bucket.Range{Start: start, End: end}, nil - case startStr != "" && endStr == "": - // "bytes=START-" — End resolved later by the service against Size. - return &bucket.Range{Start: start, End: -1}, nil - case startStr == "" && endStr != "": - // "bytes=-SUFFIX" — last N bytes; encoded as Start=-1, End=N. 
- return &bucket.Range{Start: -1, End: end}, nil - default: - return nil, errBadRange - } -} - -var errBadRange = errors.New("bad range header") - -// --- Listing --- - -func (h *Handler) listObjects(w http.ResponseWriter, r *http.Request, name string) { - q := r.URL.Query() - - prefix := q.Get("prefix") - delimiter := q.Get("delimiter") - startAfter := q.Get("start-after") - token := q.Get("continuation-token") - maxKeys := parseInt(q.Get("max-keys"), 1000) - - from := startAfter - if token != "" { - from = token - } - - res, err := h.svc.List(reqCtx(r), name, bucket.ListOptions{ - Prefix: prefix, - Delimiter: delimiter, - StartAfter: from, - MaxKeys: maxKeys, - }) - if err != nil { - writeServiceError(w, r, err) - return - } - - resp := ListBucketResult{ - Xmlns: s3Namespace, - Name: name, - Prefix: prefix, - Delimiter: delimiter, - MaxKeys: maxKeys, - IsTruncated: res.Truncated, - KeyCount: len(res.Objects) + len(res.CommonPrefixes), - StartAfter: startAfter, - ContinuationToken: token, - } - if res.Truncated { - resp.NextContinuationToken = res.NextToken - } - for _, mf := range res.Objects { - resp.Contents = append(resp.Contents, objectEntry{ - Key: mf.Key, - LastModified: time.Unix(mf.Created, 0).UTC().Format(time.RFC3339), - ETag: etag(mf), - Size: mf.Body.Size, - StorageClass: "STANDARD", - }) - } - for _, cp := range res.CommonPrefixes { - resp.CommonPrefixes = append(resp.CommonPrefixes, commonPrefix{Prefix: cp}) - } - writeXML(w, http.StatusOK, resp) -} diff --git a/pkg/ms3t/server/server.go b/pkg/ms3t/server/server.go deleted file mode 100644 index d267613..0000000 --- a/pkg/ms3t/server/server.go +++ /dev/null @@ -1,132 +0,0 @@ -// Package server exposes the bucket service over an S3-compatible HTTP API. -// Path-style addressing only (clients must set forcePathStyle=true). -// -// Auth is intentionally not validated: the Authorization header is read and -// logged so the request can be traced, but its contents are ignored. 
Real -// auth is a future middleware; this matches the localstack/MinIO-test style -// of giving the SDK a credential to sign with. -package server - -import ( - "context" - "encoding/xml" - "errors" - "fmt" - "log/slog" - "net/http" - "strconv" - "strings" - - "github.com/storacha/sprue/pkg/ms3t/bucket" - "github.com/storacha/sprue/pkg/ms3t/registry" -) - -// Handler implements http.Handler over a *bucket.Service. -type Handler struct { - svc *bucket.Service - log *slog.Logger -} - -// New returns an http.Handler for the bucket service. -func New(svc *bucket.Service, log *slog.Logger) *Handler { - if log == nil { - log = slog.Default() - } - return &Handler{svc: svc, log: log} -} - -func (h *Handler) ServeHTTP(w http.ResponseWriter, r *http.Request) { - // Trim leading slash, split into at most 2 components. - path := strings.TrimPrefix(r.URL.Path, "/") - var bucketName, key string - if path != "" { - if i := strings.Index(path, "/"); i >= 0 { - bucketName, key = path[:i], path[i+1:] - } else { - bucketName = path - } - } - - h.log.Debug("s3 request", - "method", r.Method, - "bucket", bucketName, - "key", key, - "query", r.URL.RawQuery, - "auth", r.Header.Get("Authorization") != "") - - switch { - case bucketName == "" && r.Method == http.MethodGet: - h.listBuckets(w, r) - case key == "" && r.Method == http.MethodPut: - h.createBucket(w, r, bucketName) - case key == "" && r.Method == http.MethodDelete: - h.deleteBucket(w, r, bucketName) - case key == "" && r.Method == http.MethodGet: - h.listObjects(w, r, bucketName) - case key == "" && r.Method == http.MethodHead: - h.headBucket(w, r, bucketName) - case key != "" && r.Method == http.MethodPut: - h.putObject(w, r, bucketName, key) - case key != "" && r.Method == http.MethodGet: - h.getObject(w, r, bucketName, key) - case key != "" && r.Method == http.MethodHead: - h.headObject(w, r, bucketName, key) - case key != "" && r.Method == http.MethodDelete: - h.deleteObject(w, r, bucketName, key) - default: - 
writeError(w, http.StatusMethodNotAllowed, "MethodNotAllowed", - fmt.Sprintf("method %s not allowed for this resource", r.Method), r.URL.Path) - } -} - -// === Helpers === - -func writeXML(w http.ResponseWriter, status int, body any) { - w.Header().Set("Content-Type", "application/xml") - w.WriteHeader(status) - _, _ = w.Write([]byte(xml.Header)) - _ = xml.NewEncoder(w).Encode(body) -} - -func writeError(w http.ResponseWriter, status int, code, msg, resource string) { - writeXML(w, status, ErrorResponse{ - Code: code, Message: msg, Resource: resource, - }) -} - -func mapServiceError(err error) (status int, code, msg string) { - switch { - case errors.Is(err, bucket.ErrBucketNotFound): - return http.StatusNotFound, "NoSuchBucket", "The specified bucket does not exist" - case errors.Is(err, bucket.ErrObjectNotFound): - return http.StatusNotFound, "NoSuchKey", "The specified key does not exist" - case errors.Is(err, bucket.ErrBucketExists), errors.Is(err, registry.ErrExists): - return http.StatusConflict, "BucketAlreadyOwnedByYou", "Your previous request to create the named bucket succeeded" - case errors.Is(err, bucket.ErrInvalidBucket): - return http.StatusBadRequest, "InvalidBucketName", "The specified bucket is not valid" - case errors.Is(err, bucket.ErrInvalidKey): - return http.StatusBadRequest, "InvalidArgument", "Object key is invalid" - case errors.Is(err, bucket.ErrBucketNotEmpty): - return http.StatusConflict, "BucketNotEmpty", "The bucket you tried to delete is not empty" - default: - return http.StatusInternalServerError, "InternalError", err.Error() - } -} - -func writeServiceError(w http.ResponseWriter, r *http.Request, err error) { - status, code, msg := mapServiceError(err) - writeError(w, status, code, msg, r.URL.Path) -} - -func parseInt(s string, dflt int) int { - if s == "" { - return dflt - } - n, err := strconv.Atoi(s) - if err != nil { - return dflt - } - return n -} - -func reqCtx(r *http.Request) context.Context { return r.Context() } diff 
--git a/pkg/ms3t/server/xml.go b/pkg/ms3t/server/xml.go deleted file mode 100644 index 23d038a..0000000 --- a/pkg/ms3t/server/xml.go +++ /dev/null @@ -1,68 +0,0 @@ -package server - -import "encoding/xml" - -// S3 XML response shapes. Field names and namespaces match the AWS S3 REST -// API documentation closely enough for the AWS SDK to parse them. - -const s3Namespace = "http://s3.amazonaws.com/doc/2006-03-01/" - -// ListAllMyBucketsResult is the body of GET / -type ListAllMyBucketsResult struct { - XMLName xml.Name `xml:"ListAllMyBucketsResult"` - Xmlns string `xml:"xmlns,attr"` - Owner bucketsOwner `xml:"Owner"` - Buckets bucketsBlock `xml:"Buckets"` -} - -type bucketsOwner struct { - ID string `xml:"ID"` - DisplayName string `xml:"DisplayName"` -} - -type bucketsBlock struct { - Bucket []bucketEntry `xml:"Bucket"` -} - -type bucketEntry struct { - Name string `xml:"Name"` - CreationDate string `xml:"CreationDate"` -} - -// ListBucketResult is the body of GET /?list-type=2 (V2). -type ListBucketResult struct { - XMLName xml.Name `xml:"ListBucketResult"` - Xmlns string `xml:"xmlns,attr"` - Name string `xml:"Name"` - Prefix string `xml:"Prefix"` - Delimiter string `xml:"Delimiter,omitempty"` - MaxKeys int `xml:"MaxKeys"` - IsTruncated bool `xml:"IsTruncated"` - KeyCount int `xml:"KeyCount"` - StartAfter string `xml:"StartAfter,omitempty"` - ContinuationToken string `xml:"ContinuationToken,omitempty"` - NextContinuationToken string `xml:"NextContinuationToken,omitempty"` - Contents []objectEntry `xml:"Contents"` - CommonPrefixes []commonPrefix `xml:"CommonPrefixes"` -} - -type objectEntry struct { - Key string `xml:"Key"` - LastModified string `xml:"LastModified"` - ETag string `xml:"ETag"` - Size int64 `xml:"Size"` - StorageClass string `xml:"StorageClass"` -} - -type commonPrefix struct { - Prefix string `xml:"Prefix"` -} - -// ErrorResponse is the body of any S3 error. 
-type ErrorResponse struct { - XMLName xml.Name `xml:"Error"` - Code string `xml:"Code"` - Message string `xml:"Message"` - Resource string `xml:"Resource,omitempty"` - RequestID string `xml:"RequestId,omitempty"` -} diff --git a/pkg/ms3t/testing/harness.go b/pkg/ms3t/testing/harness.go new file mode 100644 index 0000000..0ba0100 --- /dev/null +++ b/pkg/ms3t/testing/harness.go @@ -0,0 +1,443 @@ +package testing + +import ( + "context" + "fmt" + "net" + "os" + "sort" + "sync" + "time" + + block "github.com/ipfs/go-block-format" + "github.com/ipfs/go-cid" + "go.uber.org/zap" + + "github.com/storacha/sprue/pkg/ms3t" + "github.com/storacha/sprue/pkg/ms3t/blockstore" + "github.com/storacha/sprue/pkg/ms3t/logstore" + "github.com/storacha/sprue/pkg/ms3t/registry" + "github.com/storacha/sprue/pkg/ms3t/uploader" +) + +// DefaultAccessKey / DefaultSecretKey are the sigv4 credentials a +// freshly started Harness uses unless overridden via WithCredentials. +// They are not secrets — the harness binds to 127.0.0.1 only. +const ( + DefaultAccessKey = "ms3t-test-access" + DefaultSecretKey = "ms3t-test-secret" +) + +// Harness is an in-process ms3t.Server backed by in-memory deps. +// No Postgres, no piri, no indexer: a sealed segment's flush is a +// no-op that just advances bookkeeping. Sufficient for driving the +// upstream versitygw integration suite against the listener via +// Run + Suite. +type Harness struct { + Endpoint string + AccessKey string + SecretKey string + Region string + + server *ms3t.Server + dataDir string +} + +// HarnessOption customizes StartHarness. Each option mutates a +// HarnessOptions value in place. +type HarnessOption func(*harnessOptions) + +type harnessOptions struct { + logger *zap.Logger + region string + accessKey string + secretKey string + chunkSize int64 + sealBytes int64 + sealAge time.Duration + retain int + readyAfter time.Duration +} + +// WithLogger sets the zap logger handed to ms3t.Server. Default nop. 
+func WithLogger(l *zap.Logger) HarnessOption { + return func(o *harnessOptions) { o.logger = l } +} + +// WithRegion overrides the default "us-east-1" sigv4 region. +func WithRegion(r string) HarnessOption { + return func(o *harnessOptions) { o.region = r } +} + +// WithCredentials overrides DefaultAccessKey / DefaultSecretKey. +func WithCredentials(access, secret string) HarnessOption { + return func(o *harnessOptions) { + o.accessKey = access + o.secretKey = secret + } +} + +// WithChunkSize overrides the per-object body chunk size. +// 0 means use bucket.DefaultChunkSize. +func WithChunkSize(n int64) HarnessOption { + return func(o *harnessOptions) { o.chunkSize = n } +} + +// WithSealConfig forwards SealBytes / SealAge / Retain to logstore. +// Tests that exercise seal-on-size or seal-on-age behavior use this; +// the default leaves all three zero so logstore picks its own +// defaults. +func WithSealConfig(sealBytes int64, sealAge time.Duration, retain int) HarnessOption { + return func(o *harnessOptions) { + o.sealBytes = sealBytes + o.sealAge = sealAge + o.retain = retain + } +} + +// WithReadyTimeout caps how long StartHarness will dial the listener +// before giving up. Default 5 s. +func WithReadyTimeout(d time.Duration) HarnessOption { + return func(o *harnessOptions) { o.readyAfter = d } +} + +// StartHarness stands up an in-process ms3t.Server bound to a random +// 127.0.0.1 port and waits for it to accept TCP connections. The +// caller must call Stop to drain the log and remove scratch state. 
+func StartHarness(ctx context.Context, opts ...HarnessOption) (*Harness, error) { + options := harnessOptions{ + logger: zap.NewNop(), + region: "us-east-1", + accessKey: DefaultAccessKey, + secretKey: DefaultSecretKey, + readyAfter: 5 * time.Second, + } + for _, o := range opts { + o(&options) + } + + addr, err := pickFreeAddr() + if err != nil { + return nil, fmt.Errorf("ms3t harness: pick port: %w", err) + } + + dataDir, err := os.MkdirTemp("", "ms3t-harness-") + if err != nil { + return nil, fmt.Errorf("ms3t harness: tempdir: %w", err) + } + + mem := newMemStore() + + srv, err := ms3t.New(ctx, ms3t.ServerConfig{ + Addr: addr, + DataDir: dataDir, + Region: options.region, + RootAccess: options.accessKey, + RootSecret: options.secretKey, + ChunkSize: options.chunkSize, + SealBytes: options.sealBytes, + SealAge: options.sealAge, + Retain: options.retain, + }, ms3t.ServerDeps{ + Logger: options.logger, + BaseBlockReader: nopBaseReader{}, + Uploader: nopUploader{}, + Registry: mem, + Meta: mem, + }) + if err != nil { + _ = os.RemoveAll(dataDir) + return nil, fmt.Errorf("ms3t harness: New: %w", err) + } + + if err := srv.Start(ctx); err != nil { + _ = os.RemoveAll(dataDir) + return nil, fmt.Errorf("ms3t harness: Start: %w", err) + } + + if err := waitListening(ctx, addr, options.readyAfter); err != nil { + _ = srv.Stop(ctx) + _ = os.RemoveAll(dataDir) + return nil, fmt.Errorf("ms3t harness: %w", err) + } + + return &Harness{ + Endpoint: "http://" + addr, + AccessKey: options.accessKey, + SecretKey: options.secretKey, + Region: options.region, + server: srv, + dataDir: dataDir, + }, nil +} + +// Stop shuts the listener down, drains the log, and removes the +// scratch data directory. Safe to call once; subsequent calls +// no-op. Errors from each step are joined. 
+func (h *Harness) Stop(ctx context.Context) error { + var errs []error + if h.server != nil { + if err := h.server.Stop(ctx); err != nil { + errs = append(errs, err) + } + h.server = nil + } + if h.dataDir != "" { + if err := os.RemoveAll(h.dataDir); err != nil { + errs = append(errs, fmt.Errorf("remove dataDir: %w", err)) + } + h.dataDir = "" + } + if len(errs) > 0 { + return fmt.Errorf("ms3t harness stop: %v", errs) + } + return nil +} + +// Config returns a Config wired against the harness's listener, +// suitable for passing to Run. +func (h *Harness) Config() Config { + return Config{ + Endpoint: h.Endpoint, + AccessKey: h.AccessKey, + SecretKey: h.SecretKey, + Region: h.Region, + } +} + +// Server exposes the underlying *ms3t.Server for tests that want to +// reach past the S3 protocol layer (e.g., direct backend calls, +// log inspection). +func (h *Harness) Server() *ms3t.Server { return h.server } + +// pickFreeAddr asks the kernel for a free 127.0.0.1 port by binding +// and immediately closing. There is a small race window between +// close and ms3t's rebind, but for serial unit tests it is +// effectively zero. +func pickFreeAddr() (string, error) { + l, err := net.Listen("tcp", "127.0.0.1:0") + if err != nil { + return "", err + } + addr := l.Addr().String() + if err := l.Close(); err != nil { + return "", err + } + return addr, nil +} + +// waitListening polls TCP connect to addr until it succeeds, ctx +// is canceled, or the timeout fires. 
+func waitListening(ctx context.Context, addr string, timeout time.Duration) error { + deadline := time.Now().Add(timeout) + var d net.Dialer + for { + if !time.Now().Before(deadline) { + return fmt.Errorf("listener not ready at %s after %s", addr, timeout) + } + dialCtx, cancel := context.WithTimeout(ctx, 100*time.Millisecond) + conn, err := d.DialContext(dialCtx, "tcp", addr) + cancel() + if err == nil { + _ = conn.Close() + return nil + } + select { + case <-ctx.Done(): + return ctx.Err() + case <-time.After(20 * time.Millisecond): + } + } +} + +// memStore is an in-memory implementation of registry.Registry + +// logstore.Meta. The two interfaces overlap on bucket state because +// MarkSegmentFlushed advances forge_root_cid; production wires a +// single *registry.Postgres for both seams, and this fake follows +// suit so flush behavior matches. +type memStore struct { + mu sync.Mutex + buckets map[string]*registry.State + segments map[uint64]*logstore.SegmentMeta + nextSeq uint64 +} + +func newMemStore() *memStore { + return &memStore{ + buckets: map[string]*registry.State{}, + segments: map[uint64]*logstore.SegmentMeta{}, + } +} + +// Registry methods =========================================================== + +func (m *memStore) Create(_ context.Context, name string, createdAt int64) error { + m.mu.Lock() + defer m.mu.Unlock() + if _, ok := m.buckets[name]; ok { + return registry.ErrExists + } + m.buckets[name] = ®istry.State{Name: name, CreatedAt: createdAt} + return nil +} + +func (m *memStore) Get(_ context.Context, name string) (*registry.State, error) { + m.mu.Lock() + defer m.mu.Unlock() + s, ok := m.buckets[name] + if !ok { + return nil, registry.ErrNotFound + } + cp := *s + return &cp, nil +} + +func (m *memStore) List(_ context.Context) ([]*registry.State, error) { + m.mu.Lock() + defer m.mu.Unlock() + out := make([]*registry.State, 0, len(m.buckets)) + for _, s := range m.buckets { + cp := *s + out = append(out, &cp) + } + sort.Slice(out, func(i, 
j int) bool { return out[i].Name < out[j].Name }) + return out, nil +} + +func (m *memStore) Delete(_ context.Context, name string) error { + m.mu.Lock() + defer m.mu.Unlock() + if _, ok := m.buckets[name]; !ok { + return registry.ErrNotFound + } + delete(m.buckets, name) + return nil +} + +func (m *memStore) CASRoot(_ context.Context, name string, expect, next cid.Cid) error { + m.mu.Lock() + defer m.mu.Unlock() + s, ok := m.buckets[name] + if !ok { + return registry.ErrNotFound + } + if !s.Root.Equals(expect) { + return registry.ErrConflict + } + s.Root = next + return nil +} + +func (m *memStore) SetForgeRoot(_ context.Context, name string, root cid.Cid) error { + m.mu.Lock() + defer m.mu.Unlock() + s, ok := m.buckets[name] + if !ok { + return registry.ErrNotFound + } + s.ForgeRoot = root + return nil +} + +// Meta methods =============================================================== + +func (m *memStore) NextSegmentSeq(_ context.Context) (uint64, error) { + m.mu.Lock() + defer m.mu.Unlock() + m.nextSeq++ + return m.nextSeq, nil +} + +func (m *memStore) InsertSegmentOpen(_ context.Context, seq uint64) error { + m.mu.Lock() + defer m.mu.Unlock() + if _, ok := m.segments[seq]; ok { + return nil + } + m.segments[seq] = &logstore.SegmentMeta{Seq: seq, State: logstore.StateOpen} + return nil +} + +func (m *memStore) MarkSegmentSealed(_ context.Context, seq uint64, sealedAt int64, sizeBytes int64, sha256 []byte, opRoots []blockstore.OpRoot) error { + m.mu.Lock() + defer m.mu.Unlock() + r, ok := m.segments[seq] + if !ok || r.State != logstore.StateOpen { + return nil + } + r.State = logstore.StateSealed + r.SealedAt = sealedAt + r.SizeBytes = sizeBytes + r.SHA256 = append([]byte(nil), sha256...) + r.OpRoots = append([]blockstore.OpRoot(nil), opRoots...) 
+ return nil +} + +func (m *memStore) MarkSegmentFlushed(_ context.Context, seq uint64, flushedAt int64, opRoots []blockstore.OpRoot) error { + m.mu.Lock() + defer m.mu.Unlock() + if r, ok := m.segments[seq]; ok { + r.State = logstore.StateFlushed + r.FlushedAt = flushedAt + } + for _, opr := range opRoots { + if b, ok := m.buckets[opr.Bucket]; ok { + b.ForgeRoot = opr.Root + } + } + return nil +} + +func (m *memStore) DeleteSegment(_ context.Context, seq uint64) error { + m.mu.Lock() + defer m.mu.Unlock() + delete(m.segments, seq) + return nil +} + +func (m *memStore) ListUnflushedSegments(_ context.Context) ([]logstore.SegmentMeta, error) { + m.mu.Lock() + defer m.mu.Unlock() + var out []logstore.SegmentMeta + for _, r := range m.segments { + if r.State == logstore.StateOpen || r.State == logstore.StateSealed { + out = append(out, *r) + } + } + sort.Slice(out, func(i, j int) bool { return out[i].Seq < out[j].Seq }) + return out, nil +} + +func (m *memStore) RehydrateSegment(_ context.Context, sm logstore.SegmentMeta) error { + m.mu.Lock() + defer m.mu.Unlock() + cp := sm + m.segments[sm.Seq] = &cp + return nil +} + +// nopBaseReader is the base tier of the layered read path for the +// harness: every miss past the log returns ErrNotFound. Production +// wires *blockstore.Forge here; tests don't have piri to talk to. +type nopBaseReader struct{} + +func (nopBaseReader) GetBlock(_ context.Context, _ cid.Cid) (block.Block, error) { + return nil, blockstore.ErrNotFound +} + +// nopUploader is the flush sink for the harness: SubmitCAR returns +// nil so the segment is marked flushed without touching the network. +type nopUploader struct{} + +func (nopUploader) SubmitCAR(_ context.Context, _ []cid.Cid, _ uploader.CARSource) error { + return nil +} + +// Compile-time guarantees the fakes still match the contracts after +// upstream interface drift. 
+var ( + _ registry.Registry = (*memStore)(nil) + _ logstore.Meta = (*memStore)(nil) + _ blockstore.BlockReader = nopBaseReader{} + _ uploader.Uploader = nopUploader{} +) diff --git a/pkg/ms3t/testing/harness_test.go b/pkg/ms3t/testing/harness_test.go new file mode 100644 index 0000000..7811a42 --- /dev/null +++ b/pkg/ms3t/testing/harness_test.go @@ -0,0 +1,52 @@ +package testing_test + +import ( + "context" + "net/http" + "strings" + "testing" + "time" + + "go.uber.org/zap/zaptest" + + mstesting "github.com/storacha/sprue/pkg/ms3t/testing" +) + +func TestHarnessLifecycle(t *testing.T) { + ctx, cancel := context.WithTimeout(t.Context(), 10*time.Second) + defer cancel() + + h, err := mstesting.StartHarness(ctx, mstesting.WithLogger(zaptest.NewLogger(t))) + if err != nil { + t.Fatalf("StartHarness: %v", err) + } + t.Cleanup(func() { + if err := h.Stop(t.Context()); err != nil { + t.Errorf("Stop: %v", err) + } + }) + + if !strings.HasPrefix(h.Endpoint, "http://127.0.0.1:") { + t.Fatalf("unexpected endpoint %q", h.Endpoint) + } + + // /health is wired in buildS3API; hit it to confirm the listener + // is actually serving HTTP, not just accepting TCP. 
+ req, err := http.NewRequestWithContext(ctx, http.MethodGet, h.Endpoint+"/health", nil) + if err != nil { + t.Fatalf("NewRequest: %v", err) + } + resp, err := http.DefaultClient.Do(req) + if err != nil { + t.Fatalf("GET /health: %v", err) + } + defer resp.Body.Close() + if resp.StatusCode != http.StatusOK { + t.Fatalf("/health status = %d, want 200", resp.StatusCode) + } + + cfg := h.Config() + if cfg.Endpoint != h.Endpoint || cfg.AccessKey != h.AccessKey || cfg.SecretKey != h.SecretKey { + t.Fatalf("Config() mismatch: %+v vs %+v", cfg, h) + } +} diff --git a/pkg/ms3t/testing/integration.go b/pkg/ms3t/testing/integration.go new file mode 100644 index 0000000..1cd4866 --- /dev/null +++ b/pkg/ms3t/testing/integration.go @@ -0,0 +1,228 @@ +// Package testing wires the upstream versitygw integration suite +// (github.com/versity/versitygw/tests/integration) against a running +// ms3t S3 listener. Callers own server lifecycle and pass connection +// details in via Config; this package only selects which upstream +// group functions to run. Each group prints its own per-test results +// to stdout; Run additionally returns a Result summarizing how many +// cases passed and failed so Go tests can fail a *testing.T when the +// suite reports any failures (see RunT). +package testing + +import ( + "context" + "fmt" + "sync" + "testing" + + "github.com/versity/versitygw/tests/integration" +) + +// Config addresses the ms3t S3 listener under test. +type Config struct { + Endpoint string + AccessKey string + SecretKey string + + // Region must match the listener's configured region. Empty + // defaults to "us-east-1". + Region string + + // Parallel queues each Run-mode test on TestState's worker + // pool instead of running serially. Sync-mode tests still + // run after the parallel batch drains. + Parallel bool + + // HostStyle uses host-style bucket addressing (bucket.host) + // instead of path-style (host/bucket). 
+ HostStyle bool + + // VersioningEnabled tells the suite the bucket-versioning + // feature is on; group functions branch on this flag. + VersioningEnabled bool + + // SkipTLSVerify accepts self-signed certs. + SkipTLSVerify bool +} + +// Suite is an ordered list of upstream group functions. Each takes a +// *integration.TestState and dispatches its individual tests via +// ts.Run / ts.Sync. Compose ad-hoc suites by listing +// integration.TestXxx values directly: +// +// testing.Run(ctx, cfg, testing.Suite{ +// integration.TestCreateBucket, +// integration.TestPutObject, +// }) +type Suite []func(*integration.TestState) + +// Result summarizes a single suite Run. Counts are deltas — the +// versitygw counters are package-level atomics shared across every +// caller in the process, so Run snapshots them on entry and reports +// the difference. +type Result struct { + // Ran is the number of individual case functions that started. + Ran uint32 + // Passed is the number that ended in passF. + Passed uint32 + // Failed is the number that ended in failF. + Failed uint32 +} + +// Err returns a non-nil error if any case failed. Use this when the +// caller is not a *testing.T (e.g., a CLI runner). For Go tests, +// prefer RunT which propagates failures into t.Errorf directly. +func (r Result) Err() error { + if r.Failed > 0 { + return fmt.Errorf("integration suite: %d of %d cases failed", r.Failed, r.Ran) + } + return nil +} + +// runMu serializes concurrent Run calls so the global versitygw +// counters can be sampled before/after one Run without interleaving +// with another. Two parallel Run calls in the same process would +// otherwise contaminate each other's deltas. +var runMu sync.Mutex + +// Run drives suite against a fresh TestState bound to ctx and c. +// Blocks until queued (Run-mode) and deferred (Sync-mode) tests +// complete, then returns this run's case counts. 
+func Run(ctx context.Context, c Config, suite Suite) Result { + runMu.Lock() + defer runMu.Unlock() + + ranBefore := integration.RunCount.Load() + passedBefore := integration.PassCount.Load() + failedBefore := integration.FailCount.Load() + + ts := integration.NewTestState(ctx, newS3Conf(c), c.Parallel) + for _, group := range suite { + group(ts) + } + ts.Wait() + + return Result{ + Ran: integration.RunCount.Load() - ranBefore, + Passed: integration.PassCount.Load() - passedBefore, + Failed: integration.FailCount.Load() - failedBefore, + } +} + +// RunT is the Go-test-friendly form of Run. On any failure the +// returned Result is also reported via t.Errorf so `go test` exits +// non-zero. The per-case FAIL lines printed by versitygw are +// captured in t's log output, so the test author sees exactly which +// cases failed without RunT having to summarize them. +func RunT(t *testing.T, c Config, suite Suite) Result { + t.Helper() + r := Run(t.Context(), c, suite) + if r.Failed > 0 { + t.Errorf("integration suite: %d of %d cases failed (see test output for per-case details)", r.Failed, r.Ran) + } + return r +} + +func newS3Conf(c Config) *integration.S3Conf { + region := c.Region + if region == "" { + region = "us-east-1" + } + opts := []integration.Option{ + integration.WithEndpoint(c.Endpoint), + integration.WithAccess(c.AccessKey), + integration.WithSecret(c.SecretKey), + integration.WithRegion(region), + integration.WithTLSStatus(c.SkipTLSVerify), + } + if c.HostStyle { + opts = append(opts, integration.WithHostStyle()) + } + if c.VersioningEnabled { + opts = append(opts, integration.WithVersioningEnabled()) + } + return integration.NewS3Conf(opts...) +} + +// Smoke is the minimum subset that should pass on a working listener: +// bucket lifecycle plus single-object CRUD. 
var Smoke = Suite{
	// Bucket-level groups.
	integration.TestCreateBucket,
	integration.TestHeadBucket,
	integration.TestListBuckets,
	// Single-object groups.
	integration.TestPutObject,
	integration.TestGetObject,
	integration.TestHeadObject,
	integration.TestDeleteObject,
	integration.TestDeleteBucket,
}

// CRUD covers Smoke plus listing, multi-delete, copy, and the
// GetObjectAttributes surface. Stays inside features that don't
// require multipart, versioning, ACL, policy, CORS, lock, or tagging.
// Every group listed in Smoke also appears here.
var CRUD = Suite{
	integration.TestCreateBucket,
	integration.TestHeadBucket,
	integration.TestListBuckets,
	integration.TestDeleteBucket,
	integration.TestPutObject,
	integration.TestHeadObject,
	integration.TestGetObject,
	integration.TestGetObjectAttributes,
	integration.TestListObjects,
	integration.TestListObjectsV2,
	integration.TestCopyObject,
	integration.TestDeleteObject,
	integration.TestDeleteObjects,
}

// Multipart covers the multipart-upload group set.
var Multipart = Suite{
	integration.TestCreateMultipartUpload,
	integration.TestUploadPart,
	integration.TestUploadPartCopy,
	integration.TestListParts,
	integration.TestListMultipartUploads,
	integration.TestAbortMultipartUpload,
	integration.TestCompleteMultipartUpload,
}

// Tagging covers object and bucket tagging APIs.
var Tagging = Suite{
	// Bucket tagging.
	integration.TestPutBucketTagging,
	integration.TestGetBucketTagging,
	integration.TestDeleteBucketTagging,
	// Object tagging.
	integration.TestPutObjectTagging,
	integration.TestGetObjectTagging,
	integration.TestDeleteObjectTagging,
}

// ObjectLock covers retention, legal hold, lock config, and
// WORM-protection groups.
var ObjectLock = Suite{
	integration.TestPutObjectLockConfiguration,
	integration.TestGetObjectLockConfiguration,
	integration.TestPutObjectRetention,
	integration.TestGetObjectRetention,
	integration.TestPutObjectLegalHold,
	integration.TestGetObjectLegalHold,
	integration.TestWORMProtection,
}

// Versioning runs the version-aware group.
Set +// Config.VersioningEnabled = true. +var Versioning = Suite{ + integration.TestVersioning, + integration.TestVersioningDisabled, + integration.TestListObjectVersions_VD, +} + +// Auth runs sigv4 + presigned-URL authentication groups. +var Auth = Suite{ + integration.TestAuthentication, + integration.TestPresignedAuthentication, +} + +// Full is the upstream TestFullFlow rolled-up suite — the +// "how-far-from-full-compatibility" gauge. Expect noisy failures +// until ms3t closes the gaps tracked by the focused suites above. +var Full = Suite{integration.TestFullFlow} diff --git a/pkg/ms3t/testing/smoke_test.go b/pkg/ms3t/testing/smoke_test.go new file mode 100644 index 0000000..1f35ab2 --- /dev/null +++ b/pkg/ms3t/testing/smoke_test.go @@ -0,0 +1,374 @@ +package testing + +import ( + "context" + "testing" + + "github.com/versity/versitygw/tests/integration" + "go.uber.org/zap/zaptest" +) + +// smokeCase pairs an upstream versitygw integration case with its +// subtest name. Each TestSmoke_* / TestSmokeXFail_* function below +// declares its cases inline as a []smokeCase, so GoLand (and any +// other IDE that parses table-driven Go tests) renders one +// play-icon per row in the gutter. +type smokeCase struct { + name string + fn integration.IntTest +} + +// Layout: one top-level test per S3 group, in two flavors: +// +// TestSmoke_ — known-passing cases (every case must pass) +// TestSmokeXFail_ — cases ms3t fails today; each one is +// expected to fail and reported as SKIP. +// An unexpected pass errors so the case +// can be promoted. +// +// Adding a case: when a fix lands, run the matching TestSmokeXFail_*. +// Cases that flip green will report "case unexpectedly passed" — move +// the line from the XFail function to the matching TestSmoke_* one. +// +// Each top-level test boots its own Harness (via smokeHarness), so +// failures in one group can't leak buckets / segments / op-roots +// into another. 
Cases within a group share one harness because the +// upstream cases create + tear down their own buckets internally. + +// smokeHarness boots a Harness scoped to t and registers cleanup. +func smokeHarness(t *testing.T) *Harness { + t.Helper() + h, err := StartHarness(t.Context(), WithLogger(zaptest.NewLogger(t))) + if err != nil { + t.Fatalf("StartHarness: %v", err) + } + t.Cleanup(func() { _ = h.Stop(context.Background()) }) + return h +} + +// ============================================================= +// Known-passing cases +// ============================================================= + +func TestSmoke_CreateBucket(t *testing.T) { + tests := []smokeCase{ + {"invalid_bucket_name", integration.CreateBucket_invalid_bucket_name}, + {"invalid_canned_acl", integration.CreateBucket_invalid_canned_acl}, + {"invalid_location_constraint", integration.CreateBucket_invalid_location_constraint}, + {"invalid_ownership", integration.CreateBucket_invalid_ownership}, + {"ownership_with_acl", integration.CreateBucket_ownership_with_acl}, + {"success", integration.CreateBucket_success}, + } + s3conf := newS3Conf(smokeHarness(t).Config()) + for _, tt := range tests { + t.Run(tt.name, func(t *testing.T) { + if err := tt.fn(s3conf); err != nil { + t.Fatalf("%v", err) + } + }) + } +} + +func TestSmoke_HeadBucket(t *testing.T) { + tests := []smokeCase{ + {"non_existing_bucket", integration.HeadBucket_non_existing_bucket}, + {"success", integration.HeadBucket_success}, + } + s3conf := newS3Conf(smokeHarness(t).Config()) + for _, tt := range tests { + t.Run(tt.name, func(t *testing.T) { + if err := tt.fn(s3conf); err != nil { + t.Fatalf("%v", err) + } + }) + } +} + +func TestSmoke_ListBuckets(t *testing.T) { + tests := []smokeCase{ + {"empty_success", integration.ListBuckets_empty_success}, + {"invalid_max_buckets", integration.ListBuckets_invalid_max_buckets}, + {"success", integration.ListBuckets_success}, + {"truncated", integration.ListBuckets_truncated}, + {"with_prefix", 
integration.ListBuckets_with_prefix}, + } + s3conf := newS3Conf(smokeHarness(t).Config()) + for _, tt := range tests { + t.Run(tt.name, func(t *testing.T) { + if err := tt.fn(s3conf); err != nil { + t.Fatalf("%v", err) + } + }) + } +} + +func TestSmoke_DeleteBucket(t *testing.T) { + tests := []smokeCase{ + {"incorrect_expected_bucket_owner", integration.DeleteBucket_incorrect_expected_bucket_owner}, + {"non_empty_bucket", integration.DeleteBucket_non_empty_bucket}, + {"non_existing_bucket", integration.DeleteBucket_non_existing_bucket}, + {"success_status_code", integration.DeleteBucket_success_status_code}, + } + s3conf := newS3Conf(smokeHarness(t).Config()) + for _, tt := range tests { + t.Run(tt.name, func(t *testing.T) { + if err := tt.fn(s3conf); err != nil { + t.Fatalf("%v", err) + } + }) + } +} + +func TestSmoke_PutObject(t *testing.T) { + tests := []smokeCase{ + {"checksum_algorithm_and_header_mismatch", integration.PutObject_checksum_algorithm_and_header_mismatch}, + {"default_content_type", integration.PutObject_default_content_type}, + {"false_negative_object_names", integration.PutObject_false_negative_object_names}, + {"invalid_checksum_header", integration.PutObject_invalid_checksum_header}, + {"invalid_legal_hold", integration.PutObject_invalid_legal_hold}, + {"invalid_object_lock_mode", integration.PutObject_invalid_object_lock_mode}, + {"invalid_object_names", integration.PutObject_invalid_object_names}, + {"invalid_retain_until_date", integration.PutObject_invalid_retain_until_date}, + {"long_metadata", integration.PutObject_long_metadata}, + {"missing_object_lock_retention_config", integration.PutObject_missing_object_lock_retention_config}, + {"multiple_checksum_headers", integration.PutObject_multiple_checksum_headers}, + {"non_existing_bucket", integration.PutObject_non_existing_bucket}, + {"past_retain_until_date", integration.PutObject_past_retain_until_date}, + {"racey_success", integration.PutObject_racey_success}, + {"special_chars", 
integration.PutObject_special_chars}, + } + s3conf := newS3Conf(smokeHarness(t).Config()) + for _, tt := range tests { + t.Run(tt.name, func(t *testing.T) { + if err := tt.fn(s3conf); err != nil { + t.Fatalf("%v", err) + } + }) + } +} + +func TestSmoke_GetObject(t *testing.T) { + tests := []smokeCase{ + {"by_range_resp_status", integration.GetObject_by_range_resp_status}, + {"dir_with_range", integration.GetObject_dir_with_range}, + {"directory_object_noslash", integration.GetObject_directory_object_noslash}, + {"empty_object_part_number_1", integration.GetObject_empty_object_part_number_1}, + {"invalid_parent", integration.GetObject_invalid_parent}, + {"invalid_part_number", integration.GetObject_invalid_part_number}, + {"non_existing_dir_object", integration.GetObject_non_existing_dir_object}, + {"non_existing_key", integration.GetObject_non_existing_key}, + {"not_enabled_checksum_mode", integration.GetObject_not_enabled_checksum_mode}, + {"overrides_presign_success", integration.GetObject_overrides_presign_success}, + {"overrides_success", integration.GetObject_overrides_success}, + {"range_and_part_number", integration.GetObject_range_and_part_number}, + {"with_range", integration.GetObject_with_range}, + {"zero_len_with_range", integration.GetObject_zero_len_with_range}, + } + s3conf := newS3Conf(smokeHarness(t).Config()) + for _, tt := range tests { + t.Run(tt.name, func(t *testing.T) { + if err := tt.fn(s3conf); err != nil { + t.Fatalf("%v", err) + } + }) + } +} + +func TestSmoke_HeadObject(t *testing.T) { + tests := []smokeCase{ + {"conditional_reads", integration.HeadObject_conditional_reads}, + {"directory_object_noslash", integration.HeadObject_directory_object_noslash}, + {"empty_object_part_number_1", integration.HeadObject_empty_object_part_number_1}, + {"invalid_parent_dir", integration.HeadObject_invalid_parent_dir}, + {"invalid_part_number", integration.HeadObject_invalid_part_number}, + {"non_existing_dir_object", 
integration.HeadObject_non_existing_dir_object}, + {"non_existing_object", integration.HeadObject_non_existing_object}, + {"not_enabled_checksum_mode", integration.HeadObject_not_enabled_checksum_mode}, + {"overrides_presign_success", integration.HeadObject_overrides_presign_success}, + {"overrides_success", integration.HeadObject_overrides_success}, + {"range_and_part_number", integration.HeadObject_range_and_part_number}, + } + s3conf := newS3Conf(smokeHarness(t).Config()) + for _, tt := range tests { + t.Run(tt.name, func(t *testing.T) { + if err := tt.fn(s3conf); err != nil { + t.Fatalf("%v", err) + } + }) + } +} + +func TestSmoke_DeleteObject(t *testing.T) { + tests := []smokeCase{ + {"directory_object", integration.DeleteObject_directory_object}, + {"directory_object_noslash", integration.DeleteObject_directory_object_noslash}, + {"expected_bucket_owner", integration.DeleteObject_expected_bucket_owner}, + {"incorrect_expected_bucket_owner", integration.DeleteObject_incorrect_expected_bucket_owner}, + {"non_empty_dir_obj", integration.DeleteObject_non_empty_dir_obj}, + {"non_existing_dir_object", integration.DeleteObject_non_existing_dir_object}, + {"non_existing_object", integration.DeleteObject_non_existing_object}, + {"success", integration.DeleteObject_success}, + {"success_status_code", integration.DeleteObject_success_status_code}, + } + s3conf := newS3Conf(smokeHarness(t).Config()) + for _, tt := range tests { + t.Run(tt.name, func(t *testing.T) { + if err := tt.fn(s3conf); err != nil { + t.Fatalf("%v", err) + } + }) + } +} + +// ============================================================= +// Known-failing cases (XFail) +// ============================================================= + +func TestSmokeXFail_CreateBucket(t *testing.T) { + tests := []smokeCase{ + {"as_user", integration.CreateBucket_as_user}, + {"default_acl", integration.CreateBucket_default_acl}, + {"default_object_lock", integration.CreateBucket_default_object_lock}, + 
{"duplicate_keys", integration.CreateBucket_duplicate_keys}, + {"existing_bucket", integration.CreateBucket_existing_bucket}, + {"invalid_tags", integration.CreateBucket_invalid_tags}, + {"long_tags", integration.CreateBucket_long_tags}, + {"non_default_acl", integration.CreateBucket_non_default_acl}, + {"owned_by_you", integration.CreateBucket_owned_by_you}, + {"private_canned_acl", integration.CreateBucket_private_canned_acl}, + {"private_canned_acl_bucket_owner_enforced_ownership", integration.CreateBucket_private_canned_acl_bucket_owner_enforced_ownership}, + {"tag_count_limit", integration.CreateBucket_tag_count_limit}, + } + s3conf := newS3Conf(smokeHarness(t).Config()) + for _, tt := range tests { + t.Run(tt.name, func(t *testing.T) { + err := tt.fn(s3conf) + if err == nil { + t.Errorf("case unexpectedly passed; promote it from TestSmokeXFail_CreateBucket to TestSmoke_CreateBucket") + return + } + t.Skipf("known-failing: %v", err) + }) + } +} + +func TestSmokeXFail_ListBuckets(t *testing.T) { + tests := []smokeCase{ + {"as_admin", integration.ListBuckets_as_admin}, + {"as_user", integration.ListBuckets_as_user}, + } + s3conf := newS3Conf(smokeHarness(t).Config()) + for _, tt := range tests { + t.Run(tt.name, func(t *testing.T) { + err := tt.fn(s3conf) + if err == nil { + t.Errorf("case unexpectedly passed; promote it from TestSmokeXFail_ListBuckets to TestSmoke_ListBuckets") + return + } + t.Skipf("known-failing: %v", err) + }) + } +} + +func TestSmokeXFail_PutObject(t *testing.T) { + tests := []smokeCase{ + {"checksums_success", integration.PutObject_checksums_success}, + {"conditional_writes", integration.PutObject_conditional_writes}, + {"default_checksum", integration.PutObject_default_checksum}, + {"dir_object_checksums_success", integration.PutObject_dir_object_checksums_success}, + {"dir_object_default_checksum", integration.PutObject_dir_object_default_checksum}, + {"incorrect_checksums", integration.PutObject_incorrect_checksums}, + 
{"invalid_credentials", integration.PutObject_invalid_credentials}, + {"missing_bucket_lock", integration.PutObject_missing_bucket_lock}, + {"object_acl_not_supported", integration.PutObject_object_acl_not_supported}, + {"should_combine_metadata", integration.PutObject_should_combine_metadata}, + {"success", integration.PutObject_success}, + {"tagging", integration.PutObject_tagging}, + {"with_metadata", integration.PutObject_with_metadata}, + {"with_object_lock", integration.PutObject_with_object_lock}, + } + s3conf := newS3Conf(smokeHarness(t).Config()) + for _, tt := range tests { + t.Run(tt.name, func(t *testing.T) { + err := tt.fn(s3conf) + if err == nil { + t.Errorf("case unexpectedly passed; promote it from TestSmokeXFail_PutObject to TestSmoke_PutObject") + return + } + t.Skipf("known-failing: %v", err) + }) + } +} + +func TestSmokeXFail_GetObject(t *testing.T) { + tests := []smokeCase{ + {"checksums", integration.GetObject_checksums}, + {"conditional_reads", integration.GetObject_conditional_reads}, + {"dir_object_checksum", integration.GetObject_dir_object_checksum}, + {"directory_success", integration.GetObject_directory_success}, + {"large_object", integration.GetObject_large_object}, + {"mp_part_number_exceeds_parts_count", integration.GetObject_mp_part_number_exceeds_parts_count}, + {"mp_part_number_resp_status", integration.GetObject_mp_part_number_resp_status}, + {"mp_part_number_success", integration.GetObject_mp_part_number_success}, + {"non_mp_part_number_1_success", integration.GetObject_non_mp_part_number_1_success}, + {"overrides_fail_public", integration.GetObject_overrides_fail_public}, + {"ranged_with_checksum_mode", integration.GetObject_ranged_with_checksum_mode}, + {"success", integration.GetObject_success}, + } + s3conf := newS3Conf(smokeHarness(t).Config()) + for _, tt := range tests { + t.Run(tt.name, func(t *testing.T) { + err := tt.fn(s3conf) + if err == nil { + t.Errorf("case unexpectedly passed; promote it from 
TestSmokeXFail_GetObject to TestSmoke_GetObject") + return + } + t.Skipf("known-failing: %v", err) + }) + } +} + +func TestSmokeXFail_HeadObject(t *testing.T) { + tests := []smokeCase{ + {"by_range_resp_status", integration.HeadObject_by_range_resp_status}, + {"checksums", integration.HeadObject_checksums}, + {"dir_with_range", integration.HeadObject_dir_with_range}, + {"mp_part_number_exceeds_parts_count", integration.HeadObject_mp_part_number_exceeds_parts_count}, + {"mp_part_number_resp_status", integration.HeadObject_mp_part_number_resp_status}, + {"mp_part_number_success", integration.HeadObject_mp_part_number_success}, + {"non_mp_part_number_1_success", integration.HeadObject_non_mp_part_number_1_success}, + {"overrides_fail_public", integration.HeadObject_overrides_fail_public}, + {"ranged_with_checksum_mode", integration.HeadObject_ranged_with_checksum_mode}, + {"success", integration.HeadObject_success}, + {"with_range", integration.HeadObject_with_range}, + {"zero_len_with_range", integration.HeadObject_zero_len_with_range}, + } + s3conf := newS3Conf(smokeHarness(t).Config()) + for _, tt := range tests { + t.Run(tt.name, func(t *testing.T) { + err := tt.fn(s3conf) + if err == nil { + t.Errorf("case unexpectedly passed; promote it from TestSmokeXFail_HeadObject to TestSmoke_HeadObject") + return + } + t.Skipf("known-failing: %v", err) + }) + } +} + +func TestSmokeXFail_DeleteObject(t *testing.T) { + tests := []smokeCase{ + {"conditional_writes", integration.DeleteObject_conditional_writes}, + } + s3conf := newS3Conf(smokeHarness(t).Config()) + for _, tt := range tests { + t.Run(tt.name, func(t *testing.T) { + err := tt.fn(s3conf) + if err == nil { + t.Errorf("case unexpectedly passed; promote it from TestSmokeXFail_DeleteObject to TestSmoke_DeleteObject") + return + } + t.Skipf("known-failing: %v", err) + }) + } +} diff --git a/pkg/ms3t/uploader/internal.go b/pkg/ms3t/uploader/forge.go similarity index 68% rename from pkg/ms3t/uploader/internal.go rename 
to pkg/ms3t/uploader/forge.go index 982feab..4a17f5d 100644 --- a/pkg/ms3t/uploader/internal.go +++ b/pkg/ms3t/uploader/forge.go @@ -8,8 +8,8 @@ import ( "fmt" "io" nethttp "net/http" + "os" - block "github.com/ipfs/go-block-format" "github.com/ipfs/go-cid" "github.com/ipld/go-ipld-prime/datamodel" "github.com/ipld/go-ipld-prime/fluent/qp" @@ -31,12 +31,55 @@ import ( "go.uber.org/zap" "github.com/storacha/sprue/pkg/indexerclient" - "github.com/storacha/sprue/pkg/ms3t/cars" + "github.com/storacha/sprue/pkg/ms3t/blockstore" "github.com/storacha/sprue/pkg/piriclient" "github.com/storacha/sprue/pkg/routing" ) -// Internal is an Uploader that ships CARs to Forge from inside sprue, +// placeholderCID is the smallest legal raw-codec CID: a two-byte +// identity multihash with an empty digest. It mirrors guppy's internal +// PlaceholderCID and is used as the "root" for the +// ShardedDagIndexView and the SpaceIndexAdd invocation: the index's +// Content() field and SpaceIndexAdd's rootCID parameter aren't +// load-bearing for inner-CID lookups (per guppy's own usage), so +// instead of inventing a synthetic root for each multi-root CAR we +// just pass this placeholder through. +var placeholderCID = cid.NewCidV1(cid.Raw, []byte{0x00, 0x00}) + +// Uploader is the seam between the log flusher and durable Forge +// storage. +type Uploader interface { + // SubmitCAR ships one sealed CAR file (one log segment) to + // Forge. The implementation streams the file body straight + // into the HTTP PUT, never materializing it as a []block.Block + // or re-encoding it as a CAR. + SubmitCAR(ctx context.Context, roots []cid.Cid, src CARSource) error +} + +// CARSource describes a sealed CAR file ready to ship. All fields +// refer to data that already exists on disk or was precomputed at +// seal time, so the uploader's per-flush memory footprint is +// dominated by HTTP send buffers rather than segment size. +type CARSource struct { + // Path is the absolute path to the sealed CAR file. 
SubmitCAR + // streams from this path into the HTTP PUT body. + Path string + // Size is the file's byte length. Set as the request's + // Content-Length so net/http does not fall back to chunked + // transfer encoding (piri requires Content-Length). + Size int64 + // SHA256 is the SHA-256 multihash of the CAR's bytes. Computed + // once at seal time and reused both as the blob digest in + // allocate / accept and as the CAR digest the + // ShardedDagIndexView is keyed by. + SHA256 multihash.Multihash + // Positions maps each block's CID to its offset/length inside + // the CAR file, in the same shape Log appends populate at + // write time. Used to build the index view without rescanning. + Positions map[cid.Cid]blockstore.BlockLoc +} + +// Forge is an Uploader that ships CARs to Forge from inside sprue, // using sprue's own piriclient and indexerclient. No UCAN-over-HTTP // loopback to sprue's own UCAN endpoint, no separate principal or // delegation file: sprue's identity is the signer, and storage @@ -56,7 +99,7 @@ import ( // invocations that the existing space_blob_add handler builds from // the inbound user UCAN. Here there's no inbound user UCAN — sprue's // signer self-issues them so the audit shape matches. -type Internal struct { +type Forge struct { router *routing.Service piriProvider piriclient.Provider indexerClient *indexerclient.Client @@ -66,7 +109,7 @@ type Internal struct { logger *zap.Logger } -// InternalConfig wires sprue's existing services into an Internal +// ForgeConfig wires sprue's existing services into a Forge // uploader. All fields are required. // // Signer is sprue's upload-service identity — used for piriclient @@ -78,7 +121,7 @@ type Internal struct { // every PUT, and it acts as the root authority for self-issued // space/content/retrieve delegations (so the indexer can fetch the // index blob from piri on assert/index validation). 
-type InternalConfig struct { +type ForgeConfig struct { Router *routing.Service PiriProvider piriclient.Provider IndexerClient *indexerclient.Client @@ -88,9 +131,9 @@ type InternalConfig struct { Logger *zap.Logger } -// NewInternal validates the config and returns an Uploader that +// NewForge validates the config and returns an Uploader that // writes through sprue's internal services. -func NewInternal(cfg InternalConfig) (*Internal, error) { +func NewForge(cfg ForgeConfig) (*Forge, error) { if cfg.Router == nil { return nil, errors.New("uploader: routing service is required") } @@ -114,7 +157,7 @@ func NewInternal(cfg InternalConfig) (*Internal, error) { if logger == nil { logger = zap.NewNop() } - return &Internal{ + return &Forge{ router: cfg.Router, piriProvider: cfg.PiriProvider, indexerClient: cfg.IndexerClient, @@ -126,39 +169,35 @@ func NewInternal(cfg InternalConfig) (*Internal, error) { } // SpaceDID returns the DID of the space ms3t owns. -func (u *Internal) SpaceDID() did.DID { return u.spaceSigner.DID() } +func (u *Forge) SpaceDID() did.DID { return u.spaceSigner.DID() } -func (u *Internal) Submit(ctx context.Context, roots []cid.Cid, blocks []block.Block) error { +func (u *Forge) SubmitCAR(ctx context.Context, roots []cid.Cid, src CARSource) error { if len(roots) == 0 { return errors.New("uploader: at least one root required") } - if len(blocks) == 0 { + if src.Size <= 0 || len(src.Positions) == 0 { return nil } - // 1. Encode CAR + record positions. - var carBuf bytes.Buffer - positions, err := cars.WriteWithPositions(&carBuf, roots, blocks) - if err != nil { - return fmt.Errorf("uploader: encode car: %w", err) - } - carBytes := carBuf.Bytes() - carDigest, err := multihash.Sum(carBytes, multihash.SHA2_256, -1) - if err != nil { - return fmt.Errorf("uploader: hash car: %w", err) + // 1. PUT the data CAR by streaming from disk. 
The sealed CAR + // file is byte-identical to what cars.WriteWithPositions + // would produce here (same placeholder header, same block + // order), and the seal step already hashed it — so we skip + // re-encoding and rehashing entirely. + putCAR := func(url string, headers nethttp.Header) error { + return httpPutFile(ctx, u.httpClient, url, headers, src.Path, src.Size) } - - // 2. Allocate + PUT + Accept the data CAR. - if err := u.uploadBlob(ctx, carBytes, carDigest); err != nil { + if err := u.uploadBlob(ctx, src.SHA256, uint64(src.Size), putCAR); err != nil { return fmt.Errorf("uploader: ship car: %w", err) } - // 3. Build a ShardedDagIndexView keyed off the CAR's multihash. + // 2. Build a ShardedDagIndexView keyed off the CAR's multihash, + // using the precomputed positions from the segment. view := blobindex.NewShardedDagIndexView(cidlink.Link{Cid: placeholderCID}, 1) - for _, p := range positions { - view.SetSlice(carDigest, p.CID.Hash(), blobindex.Position{ - Offset: p.Offset, - Length: p.Length, + for c, loc := range src.Positions { + view.SetSlice(src.SHA256, c.Hash(), blobindex.Position{ + Offset: loc.Offset, + Length: loc.Length, }) } archReader, err := view.Archive() @@ -174,12 +213,16 @@ func (u *Internal) Submit(ctx context.Context, roots []cid.Cid, blocks []block.B return fmt.Errorf("uploader: hash index: %w", err) } - // 4. Allocate + PUT + Accept the index blob. - if err := u.uploadBlob(ctx, indexBytes, indexDigest); err != nil { + // 3. PUT the index blob. Small (one entry per inner CID), so + // in-memory is fine. + putIndex := func(url string, headers nethttp.Header) error { + return httpPut(ctx, u.httpClient, url, headers, indexBytes) + } + if err := u.uploadBlob(ctx, indexDigest, uint64(len(indexBytes)), putIndex); err != nil { return fmt.Errorf("uploader: ship index: %w", err) } - // 5. Publish the index claim. The indexer needs to fetch our + // 4. Publish the index claim. 
The indexer needs to fetch our // index blob from piri to validate the assertion, and piri // requires UCAN auth on retrieval. We self-issue a // space/content/retrieve delegation scoped to this specific @@ -207,14 +250,20 @@ func (u *Internal) Submit(ctx context.Context, roots []cid.Cid, blocks []block.B return nil } -func (u *Internal) Flush(context.Context) error { return nil } -func (u *Internal) Close(context.Context) error { return nil } - // uploadBlob runs the allocate → PUT → accept dance for one blob. -// Retries the allocate on ErrCandidateUnavailable by excluding failed -// providers, mirroring sprue's space_blob_add handler. -func (u *Internal) uploadBlob(ctx context.Context, data []byte, digest multihash.Multihash) error { - blob := captypes.Blob{Digest: digest, Size: uint64(len(data))} +// putBody is invoked at most once per call, after a successful +// Allocate, with the URL and headers piri returned. The retry loop +// only re-runs Allocate (on ErrCandidateUnavailable), never the +// PUT itself, so a streaming putBody can safely consume its source +// in one shot. If Allocate reports the blob is already present +// (Address == nil), putBody is skipped entirely and accept proceeds. +func (u *Forge) uploadBlob( + ctx context.Context, + digest multihash.Multihash, + size uint64, + putBody func(url string, headers nethttp.Header) error, +) error { + blob := captypes.Blob{Digest: digest, Size: size} // Synthesize a self-issued space/blob/add invocation as the cause. // Its link feeds the audit chain piri's handlers expect; never sent @@ -263,7 +312,7 @@ func (u *Internal) uploadBlob(ctx context.Context, data []byte, digest multihash // PUT bytes if piri allocated a fresh slot. If Address is nil // piri already has the blob; skip the upload. 
if allocResp.Address != nil { - if err := httpPut(ctx, u.httpClient, allocResp.Address.URL.String(), allocResp.Address.Headers, data); err != nil { + if err := putBody(allocResp.Address.URL.String(), allocResp.Address.Headers); err != nil { return fmt.Errorf("http put: %w", err) } } @@ -341,6 +390,38 @@ func httpPut(ctx context.Context, client *nethttp.Client, urlStr string, headers return nil } +// httpPutFile streams a file body to the given URL. Setting +// req.ContentLength explicitly keeps net/http from defaulting to +// chunked transfer encoding, since it cannot infer the length of +// an *os.File body — piri's PUT endpoint requires Content-Length. +func httpPutFile(ctx context.Context, client *nethttp.Client, urlStr string, headers nethttp.Header, path string, size int64) error { + f, err := os.Open(path) + if err != nil { + return fmt.Errorf("open car %s: %w", path, err) + } + defer f.Close() + + req, err := nethttp.NewRequestWithContext(ctx, nethttp.MethodPut, urlStr, f) + if err != nil { + return err + } + req.ContentLength = size + for k, v := range headers { + if len(v) > 0 { + req.Header.Set(k, v[0]) + } + } + resp, err := client.Do(req) + if err != nil { + return err + } + defer resp.Body.Close() + if resp.StatusCode < 200 || resp.StatusCode >= 300 { + return fmt.Errorf("http put status %s", resp.Status) + } + return nil +} + // internalDelegationFetcher matches the shape of the unexported // delegationFetcher in space_blob_add.go: returns the storage // provider's pre-issued delegation when the audience matches. 
@@ -390,4 +471,4 @@ func (hpf httpPutFact) ToIPLD() (map[string]datamodel.Node, error) { }, nil } -var _ Uploader = (*Internal)(nil) +var _ Uploader = (*Forge)(nil) diff --git a/pkg/ms3t/uploader/forgeauth.go b/pkg/ms3t/uploader/forgeauth.go deleted file mode 100644 index f68a560..0000000 --- a/pkg/ms3t/uploader/forgeauth.go +++ /dev/null @@ -1,92 +0,0 @@ -package uploader - -import ( - "fmt" - "net/url" - "os" - - uclient "github.com/storacha/go-ucanto/client" - "github.com/storacha/go-ucanto/core/delegation" - "github.com/storacha/go-ucanto/did" - "github.com/storacha/go-ucanto/principal" - "github.com/storacha/go-ucanto/principal/ed25519/signer" - "github.com/storacha/go-ucanto/transport/car" - uhttp "github.com/storacha/go-ucanto/transport/http" - guppyclient "github.com/storacha/guppy/pkg/client" -) - -// LoadOrCreateSigner reads a persisted principal.Signer from path or -// generates and writes a fresh one if the file does not exist. The -// on-disk format is the canonical did:key string representation -// (signer.Format). -// -// The returned signer's DID is what the operator passes to a delegator -// when requesting a `space/blob/add` + `space/index/add` delegation. -func LoadOrCreateSigner(path string) (principal.Signer, error) { - data, err := os.ReadFile(path) - if os.IsNotExist(err) { - s, err := signer.Generate() - if err != nil { - return nil, fmt.Errorf("uploader: generate signer: %w", err) - } - formatted, err := signer.Format(s) - if err != nil { - return nil, fmt.Errorf("uploader: format signer: %w", err) - } - if err := os.WriteFile(path, []byte(formatted), 0o600); err != nil { - return nil, fmt.Errorf("uploader: persist signer: %w", err) - } - return s, nil - } - if err != nil { - return nil, fmt.Errorf("uploader: read signer: %w", err) - } - s, err := signer.Parse(string(data)) - if err != nil { - return nil, fmt.Errorf("uploader: parse signer: %w", err) - } - return s, nil -} - -// LoadDelegations reads a CAR-encoded delegation from path. 
The input -// is expected to be a single delegation per file; callers needing -// multiple delegations should pass multiple paths and concatenate the -// results. -func LoadDelegations(path string) ([]delegation.Delegation, error) { - data, err := os.ReadFile(path) - if err != nil { - return nil, fmt.Errorf("uploader: read delegation: %w", err) - } - d, err := delegation.Extract(data) - if err != nil { - return nil, fmt.Errorf("uploader: parse delegation %s: %w", path, err) - } - return []delegation.Delegation{d}, nil -} - -// NewForgeClient assembles a guppy client targeting the given upload -// service. servicePrincipal is the DID of the upload service (e.g. -// sprue's did:web for production, or the local sprue did:key under -// smelt). serviceURL is the HTTP endpoint for UCAN invocations. -func NewForgeClient( - serviceURL *url.URL, - servicePrincipal did.DID, - s principal.Signer, - proofs []delegation.Delegation, -) (*guppyclient.Client, error) { - channel := uhttp.NewChannel(serviceURL) - codec := car.NewOutboundCodec() - conn, err := uclient.NewConnection(servicePrincipal, channel, uclient.WithOutboundCodec(codec)) - if err != nil { - return nil, fmt.Errorf("uploader: build connection: %w", err) - } - c, err := guppyclient.NewClient( - guppyclient.WithConnection(conn), - guppyclient.WithPrincipal(s), - guppyclient.WithAdditionalProofs(proofs...), - ) - if err != nil { - return nil, fmt.Errorf("uploader: build guppy client: %w", err) - } - return c, nil -} diff --git a/pkg/ms3t/uploader/guppy.go b/pkg/ms3t/uploader/guppy.go deleted file mode 100644 index ddb4fd5..0000000 --- a/pkg/ms3t/uploader/guppy.go +++ /dev/null @@ -1,149 +0,0 @@ -package uploader - -import ( - "bytes" - "context" - "fmt" - "io" - - "github.com/storacha/sprue/pkg/ms3t/cars" - "github.com/ipfs/go-cid" - cidlink "github.com/ipld/go-ipld-prime/linking/cid" - block "github.com/ipfs/go-block-format" - "github.com/multiformats/go-multicodec" - "github.com/multiformats/go-multihash" - 
"github.com/storacha/go-libstoracha/blobindex" - "github.com/storacha/go-ucanto/did" - guppyclient "github.com/storacha/guppy/pkg/client" -) - -// placeholderCID is the smallest legal raw-codec CID with an -// identity-hashed two-byte payload. It mirrors guppy's internal -// PlaceholderCID and is used as the "root" for the -// ShardedDagIndexView and the SpaceIndexAdd invocation: the index's -// Content() field and SpaceIndexAdd's rootCID parameter aren't -// load-bearing for inner-CID lookups (per guppy's own usage), so -// instead of inventing a synthetic root for each multi-root CAR we -// just pass this placeholder through. -var placeholderCID = cid.NewCidV1(cid.Raw, []byte{0x00, 0x00}) - -// Guppy is an Uploader that ships each Submit's CAR to Forge via the -// guppy client, then uploads a per-CAR index and registers it with -// the indexing-service so individual inner CIDs become resolvable. -// -// One Submit produces three Forge round trips: -// -// 1. SpaceBlobAdd of the CAR (one piri blob, multihash-keyed) -// 2. SpaceBlobAdd of the index (a small CAR encoding the inner -// CID → byte-range mappings) -// 3. SpaceIndexAdd (registers the index → placeholder -// root association with the indexer) -// -// Multi-root CARs ride as one logical batch. The index covers every -// inner block from every root; SpaceIndexAdd is called once per -// CAR, not once per root, since the rootCID parameter is treated as -// a placeholder by the upstream pattern. -// -// Synchronous: Submit blocks until all three calls have returned. -// Wrap in uploader.Batched if you want size/time-driven batching of -// multiple S3 ops into one CAR before each Submit fires. -type Guppy struct { - client *guppyclient.Client - spaceDID did.DID -} - -// GuppyConfig wires a *guppyclient.Client (already configured with -// connection, principal, and delegation proofs) plus the destination -// space DID into a Guppy uploader. 
-type GuppyConfig struct { - Client *guppyclient.Client - SpaceDID did.DID -} - -// NewGuppy constructs a Guppy uploader from a configured client. -func NewGuppy(cfg GuppyConfig) (*Guppy, error) { - if cfg.Client == nil { - return nil, fmt.Errorf("uploader: guppy client is required") - } - if cfg.SpaceDID == (did.DID{}) { - return nil, fmt.Errorf("uploader: space DID is required") - } - return &Guppy{client: cfg.Client, spaceDID: cfg.SpaceDID}, nil -} - -func (g *Guppy) Submit(ctx context.Context, roots []cid.Cid, blocks []block.Block) error { - if len(roots) == 0 { - return fmt.Errorf("uploader: at least one root required") - } - if len(blocks) == 0 { - return nil - } - - // 1. Encode CAR + record each inner block's byte position. - var carBuf bytes.Buffer - positions, err := cars.WriteWithPositions(&carBuf, roots, blocks) - if err != nil { - return fmt.Errorf("uploader: encode car: %w", err) - } - carBytes := carBuf.Bytes() - - carDigest, err := multihash.Sum(carBytes, multihash.SHA2_256, -1) - if err != nil { - return fmt.Errorf("uploader: hash car: %w", err) - } - - // 2. Upload the CAR as one piri blob. - if _, err := g.client.SpaceBlobAdd(ctx, - bytes.NewReader(carBytes), g.spaceDID, - guppyclient.WithPrecomputedDigest(carDigest, uint64(len(carBytes))), - ); err != nil { - return fmt.Errorf("uploader: SpaceBlobAdd(car): %w", err) - } - - // 3. Build a ShardedDagIndexView that points every inner CID at - // its slice of the CAR. Single shard (the CAR we just uploaded), - // placeholder content (see comment above placeholderCID). 
- view := blobindex.NewShardedDagIndexView(cidlink.Link{Cid: placeholderCID}, 1) - for _, p := range positions { - view.SetSlice(carDigest, p.CID.Hash(), blobindex.Position{ - Offset: p.Offset, - Length: p.Length, - }) - } - archReader, err := view.Archive() - if err != nil { - return fmt.Errorf("uploader: archive index: %w", err) - } - indexBytes, err := io.ReadAll(archReader) - if err != nil { - return fmt.Errorf("uploader: read archived index: %w", err) - } - indexDigest, err := multihash.Sum(indexBytes, multihash.SHA2_256, -1) - if err != nil { - return fmt.Errorf("uploader: hash index: %w", err) - } - - // 4. Upload the index as its own piri blob. - if _, err := g.client.SpaceBlobAdd(ctx, - bytes.NewReader(indexBytes), g.spaceDID, - guppyclient.WithPrecomputedDigest(indexDigest, uint64(len(indexBytes))), - ); err != nil { - return fmt.Errorf("uploader: SpaceBlobAdd(index): %w", err) - } - - // 5. Register the index with the indexing-service. The index CID - // uses the CAR multicodec, since the index is itself a CAR - // (matching how guppy frames its blobs). - indexCID := cid.NewCidV1(uint64(multicodec.Car), indexDigest) - if err := g.client.SpaceIndexAdd(ctx, - indexCID, uint64(len(indexBytes)), placeholderCID, g.spaceDID, - ); err != nil { - return fmt.Errorf("uploader: SpaceIndexAdd: %w", err) - } - return nil -} - -func (g *Guppy) Flush(context.Context) error { return nil } -func (g *Guppy) Close(context.Context) error { return nil } - -var _ Uploader = (*Guppy)(nil) diff --git a/pkg/ms3t/uploader/uploader.go b/pkg/ms3t/uploader/uploader.go deleted file mode 100644 index e0d1fa0..0000000 --- a/pkg/ms3t/uploader/uploader.go +++ /dev/null @@ -1,285 +0,0 @@ -// Package uploader hands batches of IPLD blocks off to durable storage as -// CAR files. 
The interface separates submission (queueing a logical PUT's -// blocks) from flushing (forcing buffered work out), so a buffered -// implementation can amortize many small S3 ops into one larger upload -// without changing the caller's flow. -package uploader - -import ( - "bytes" - "context" - "errors" - "fmt" - "os" - "path/filepath" - "sync" - "time" - - "github.com/storacha/sprue/pkg/ms3t/cars" - block "github.com/ipfs/go-block-format" - "github.com/ipfs/go-cid" -) - -// Uploader is the seam between the bucket service and durable storage. -// -// Submit hands one logical batch of blocks (typically the result of a -// single S3 op) to the uploader along with the root CID(s) that -// summarize what was written. The implementation may flush immediately -// or buffer and flush later. -// -// Flush forces any buffered work out to durable storage. Callers use -// this for explicit boundaries (multipart Complete, shutdown) or -// recovery loops. -// -// Close flushes any remaining buffered work and releases resources -// (background goroutines, file handles, network clients). -type Uploader interface { - Submit(ctx context.Context, roots []cid.Cid, blocks []block.Block) error - Flush(ctx context.Context) error - Close(ctx context.Context) error -} - -// === Disk === - -// Disk is a synchronous Uploader that writes one CAR file per Submit -// call into a directory. Useful for development, debugging, and as the -// inner sink of a Batched uploader. -type Disk struct { - dir string - - mu sync.Mutex - count uint64 // unique suffix for files when collisions could occur -} - -// NewDisk creates the target directory if needed and returns a Disk -// uploader. 
-func NewDisk(dir string) (*Disk, error) { - if err := os.MkdirAll(dir, 0o755); err != nil { - return nil, fmt.Errorf("uploader: mkdir %s: %w", dir, err) - } - return &Disk{dir: dir}, nil -} - -func (d *Disk) Submit(_ context.Context, roots []cid.Cid, blocks []block.Block) error { - if len(roots) == 0 { - return errors.New("uploader: at least one root required") - } - if len(blocks) == 0 { - return nil - } - - var carBuf bytes.Buffer - if err := cars.Write(&carBuf, roots, blocks); err != nil { - return fmt.Errorf("uploader: encode car: %w", err) - } - - final := filepath.Join(d.dir, d.fileName(roots)) - tmp, err := os.CreateTemp(d.dir, ".tmp-*.car") - if err != nil { - return fmt.Errorf("uploader: tmpfile: %w", err) - } - tmpPath := tmp.Name() - committed := false - defer func() { - if !committed { - _ = tmp.Close() - _ = os.Remove(tmpPath) - } - }() - - if _, err := tmp.Write(carBuf.Bytes()); err != nil { - return fmt.Errorf("uploader: write: %w", err) - } - if err := tmp.Sync(); err != nil { - return fmt.Errorf("uploader: sync: %w", err) - } - if err := tmp.Close(); err != nil { - return fmt.Errorf("uploader: close: %w", err) - } - if err := os.Rename(tmpPath, final); err != nil { - return fmt.Errorf("uploader: rename: %w", err) - } - committed = true - return nil -} - -func (d *Disk) Flush(context.Context) error { return nil } -func (d *Disk) Close(context.Context) error { return nil } - -// fileName produces a human-recognizable filename for a CAR, derived -// from the first root and the total root count for multi-root batches. -func (d *Disk) fileName(roots []cid.Cid) string { - first := roots[0].String() - if len(roots) == 1 { - return first + ".car" - } - d.mu.Lock() - d.count++ - n := d.count - d.mu.Unlock() - return fmt.Sprintf("%s+%d-%d.car", first, len(roots)-1, n) -} - -// === Noop === - -// Noop discards all submissions. Useful for tests/benchmarks. 
-type Noop struct{} - -func (Noop) Submit(context.Context, []cid.Cid, []block.Block) error { return nil } -func (Noop) Flush(context.Context) error { return nil } -func (Noop) Close(context.Context) error { return nil } - -// === Batched === - -// BatchedOptions configures a Batched uploader. -type BatchedOptions struct { - // MaxBytes triggers a flush when the buffered block bytes exceed - // this size. 0 → 64 MiB. - MaxBytes int64 - // MaxAge triggers a flush when the time since the last submit - // exceeds this duration. 0 → 5 seconds. - MaxAge time.Duration - // CheckInterval is how often the background loop wakes to evaluate - // the time-based threshold. 0 → MaxAge / 4 (clamped to a minimum). - CheckInterval time.Duration -} - -func (o *BatchedOptions) defaults() { - if o.MaxBytes <= 0 { - o.MaxBytes = 64 << 20 - } - if o.MaxAge <= 0 { - o.MaxAge = 5 * time.Second - } - if o.CheckInterval <= 0 { - o.CheckInterval = o.MaxAge / 4 - if o.CheckInterval < 100*time.Millisecond { - o.CheckInterval = 100 * time.Millisecond - } - } -} - -// Batched buffers Submit calls in memory and flushes them to an inner -// Uploader as one combined batch when a size or time threshold is hit. -// Multiple roots accumulate; the eventual CAR has all of them. -// -// Crash recovery: Batched does not persist its in-memory queue. If the -// process dies, blocks that were Submitted but not yet Flushed remain -// in the underlying blockstore (canonical) but were not shipped via -// the inner Uploader. Recovery is the responsibility of the caller — -// see bucket.Service.Recover. -type Batched struct { - inner Uploader - opts BatchedOptions - - mu sync.Mutex - rootSet map[cid.Cid]struct{} - roots []cid.Cid - blockSet map[cid.Cid]struct{} - blocks []block.Block - pendingBytes int64 - lastSubmit time.Time - - stop chan struct{} - done chan struct{} -} - -// NewBatched wraps inner with size+time-driven flushing. 
-func NewBatched(inner Uploader, opts BatchedOptions) *Batched { - opts.defaults() - b := &Batched{ - inner: inner, - opts: opts, - rootSet: map[cid.Cid]struct{}{}, - blockSet: map[cid.Cid]struct{}{}, - stop: make(chan struct{}), - done: make(chan struct{}), - } - go b.loop() - return b -} - -func (b *Batched) Submit(ctx context.Context, roots []cid.Cid, blocks []block.Block) error { - if len(roots) == 0 { - return errors.New("uploader: at least one root required") - } - - b.mu.Lock() - for _, r := range roots { - if _, ok := b.rootSet[r]; !ok { - b.rootSet[r] = struct{}{} - b.roots = append(b.roots, r) - } - } - for _, blk := range blocks { - c := blk.Cid() - if _, ok := b.blockSet[c]; !ok { - b.blockSet[c] = struct{}{} - b.blocks = append(b.blocks, blk) - b.pendingBytes += int64(len(blk.RawData())) - } - } - b.lastSubmit = time.Now() - overSize := b.pendingBytes >= b.opts.MaxBytes - b.mu.Unlock() - - if overSize { - return b.Flush(ctx) - } - return nil -} - -func (b *Batched) Flush(ctx context.Context) error { - b.mu.Lock() - if len(b.blocks) == 0 { - b.mu.Unlock() - return nil - } - roots := b.roots - blocks := b.blocks - b.roots = nil - b.blocks = nil - b.rootSet = map[cid.Cid]struct{}{} - b.blockSet = map[cid.Cid]struct{}{} - b.pendingBytes = 0 - b.mu.Unlock() - - return b.inner.Submit(ctx, roots, blocks) -} - -func (b *Batched) Close(ctx context.Context) error { - close(b.stop) - <-b.done - if err := b.Flush(ctx); err != nil { - return err - } - return b.inner.Close(ctx) -} - -func (b *Batched) loop() { - defer close(b.done) - ticker := time.NewTicker(b.opts.CheckInterval) - defer ticker.Stop() - for { - select { - case <-b.stop: - return - case <-ticker.C: - b.mu.Lock() - shouldFlush := len(b.blocks) > 0 && !b.lastSubmit.IsZero() && - time.Since(b.lastSubmit) >= b.opts.MaxAge - b.mu.Unlock() - if shouldFlush { - _ = b.Flush(context.Background()) - } - } - } -} - -// === Compile-time assertions === - -var ( - _ Uploader = (*Disk)(nil) - _ Uploader = Noop{} - 
_ Uploader = (*Batched)(nil) -) diff --git a/pkg/ms3t/util.go b/pkg/ms3t/util.go new file mode 100644 index 0000000..133b821 --- /dev/null +++ b/pkg/ms3t/util.go @@ -0,0 +1,42 @@ +package ms3t + +import ( + "fmt" + "os" + + "github.com/storacha/go-ucanto/principal" + "github.com/storacha/go-ucanto/principal/ed25519/signer" +) + +// LoadOrCreateSigner reads a persisted principal.Signer from path or +// generates and writes a fresh one if the file does not exist. The +// on-disk format is the canonical did:key string representation +// (signer.Format). +// +// The returned signer's DID is what the operator passes to a delegator +// when requesting a `space/blob/add` + `space/index/add` delegation. +func LoadOrCreateSigner(path string) (principal.Signer, error) { + data, err := os.ReadFile(path) + if os.IsNotExist(err) { + s, err := signer.Generate() + if err != nil { + return nil, fmt.Errorf("uploader: generate signer: %w", err) + } + formatted, err := signer.Format(s) + if err != nil { + return nil, fmt.Errorf("uploader: format signer: %w", err) + } + if err := os.WriteFile(path, []byte(formatted), 0o600); err != nil { + return nil, fmt.Errorf("uploader: persist signer: %w", err) + } + return s, nil + } + if err != nil { + return nil, fmt.Errorf("uploader: read signer: %w", err) + } + s, err := signer.Parse(string(data)) + if err != nil { + return nil, fmt.Errorf("uploader: parse signer: %w", err) + } + return s, nil +} From afb7f978e93603ea309e077bcd232f174e28110b Mon Sep 17 00:00:00 2001 From: frrist Date: Fri, 1 May 2026 19:07:55 -0700 Subject: [PATCH 3/3] docs: update architectural.md --- pkg/ms3t/architectural.md | 1011 +++++++++++++++++-------------------- 1 file changed, 470 insertions(+), 541 deletions(-) diff --git a/pkg/ms3t/architectural.md b/pkg/ms3t/architectural.md index 1678c26..82f340f 100644 --- a/pkg/ms3t/architectural.md +++ b/pkg/ms3t/architectural.md @@ -1,583 +1,512 @@ -# ms3t — S3 over Forge (MVP / prototype) +# ms3t — S3 over Forge (current 
state) + +This document describes the implementation under `sprue/pkg/ms3t/` +as it stands today. ms3t is an embedded S3 protocol listener that +runs in-process inside sprue (or, in tests, against an in-memory +harness) and translates S3 requests into mutations of a per-bucket +Merkle Search Tree, durably journaled to a local LSM-style log and +asynchronously shipped to Forge. + +It is still a prototype: many S3 features are unimplemented (see +"Not implemented" near the end), and several knobs that future +production work will tighten are noted as TODOs in code. + +## At a glance + +- **Protocol layer** — `github.com/versity/versitygw`. We get a + near-complete S3 REST front end (sigv4, path-style addressing, + the standard verb shapes) by implementing versitygw's + `backend.Backend` interface. +- **Backend adapter** — `pkg/ms3t/s3frontend.Backend`. Wires every + served verb into ms3t's domain primitives. Anything we haven't + implemented inherits `ErrNotImplemented` from versitygw's + `backend.BackendUnsupported`. +- **Per-op transaction** — `pkg/ms3t/bucketop.Tx`. Acquires the + per-bucket lock, snapshots the bucket's published Root from the + registry, and exposes a per-tx staging buffer + CBOR-typed view + over it. On Commit it fsyncs the batch into the log and + CAS-advances the registry Root in one shot. +- **Storage tiers** — an LSM-style local log: + - *Hot* — current open segment (CAR + .ops sidecar) on local + disk. AppendBatch fsyncs both files before returning. + - *Warm* — sealed segments retained on local disk for fast reads. + - *Cold* — segments shipped off-host to Forge (piri CAR + index + claim). The layered read tier falls through to Forge on misses. +- **Persistent metadata** — Postgres, under the `ms3t` schema. + Per-bucket Root + per-segment lifecycle live in the same database. +- **Identity** — ms3t owns its own ed25519 keypair (the *space*) and + is the root UCAN authority for self-issued + `space/content/retrieve` delegations. 
Sprue's identity is the + audience for piri allocate/accept invocations. + +## On-disk layout -This document describes what the code in `sprue/pkg/ms3t/` actually -does today, running in smelt with the deployed wiring. - -It is **not** an architecture spec for a production system. ms3t is a -prototype for "expose an S3 API on top of the Forge stack." Every -choice baked into the current shape is up for debate. The job of -this doc is to give the team enough of a map to read the code, ask -"why this and not that?", and weigh in on where to go next. - -If you're looking for what isn't built yet or what was considered -and dropped, see "Choices we made (and the alternatives)" and -"Open questions" near the end. - -## What ms3t is - -A goroutine inside sprue that: +``` +<data-dir>/ +├── space.key # ed25519 keypair (UCAN identity) +└── segments/ + ├── seg-NNNNNNNNNNNNNNNNNNNN.car # one CAR per segment + ├── seg-NNNNNNNNNNNNNNNNNNNN.ops # per-batch (bucket, root) records + └── seg-NNNNNNNNNNNNNNNNNNNN.idx # JSON sidecar (sealed only) +``` -- Listens on a configured `host:port` and speaks the AWS S3 REST - protocol (path-style; subset of operations: bucket CRUD, object - PUT/GET/HEAD/DELETE, ListObjectsV2, range GETs) -- Stores object data as content-addressed CAR files in piri (via - sprue's existing piriclient + routing + indexerclient — same - packages sprue's own UCAN handlers use) -- Stores per-bucket "what's the current MST root?" in a small SQLite - file alongside sprue -- Stores its own UCAN identity (a generated did:key) in a file - alongside sprue +- `.car` — CAR v1 with a placeholder root in the header. Block + frames are appended via `cars.WriteBlocksAt`. Per-batch fsync. +- `.ops` — append-only sidecar of `[bucket: text, root: bytes]` + CBOR records, each prefixed by a 4-byte big-endian length. One + record per AppendBatch (one S3 op). +- `.idx` — written atomically (tmp + rename) at seal time.
JSON: + `{seq, size_bytes, sha256_hex, sealed_at, blocks: [{cid, + offset, length}], op_roots: [{bucket, root}]}`. The post-crash + source of truth for sealed segments. -There is no other persistent state. ms3t holds no canonical block -data — every block read goes to the network. +## Postgres schema -## Local state +Migrations are in `pkg/ms3t/migrations/sql/`, applied via goose +against the caller-provided `*pgxpool.Pool` at startup. All ms3t +tables live under the `ms3t` schema; goose's bookkeeping is at +`ms3t.goose_db_version`, so it never collides with other migrations +on the same database. -``` -/ -├── space.key # ed25519 keypair, ms3t's UCAN identity / space root -└── ms3t-registry.db # SQLite, one row per bucket -``` +```sql +CREATE TABLE ms3t.buckets ( + name TEXT PRIMARY KEY, + root_cid BYTEA, -- current MST root, NULL for empty bucket + forge_root_cid BYTEA, -- last MST root whose blocks shipped to Forge + created_at BIGINT NOT NULL +); -The SQLite schema (`pkg/ms3t/registry/sqlite.go`): +CREATE TABLE ms3t.segments ( + seq BIGINT PRIMARY KEY, + state TEXT CHECK (state IN ('open','sealed','flushed')), + sealed_at BIGINT, + flushed_at BIGINT, + size_bytes BIGINT DEFAULT 0, + car_sha256 BYTEA +); -```sql -CREATE TABLE buckets ( - name TEXT PRIMARY KEY, - root_cid BLOB, -- current MST root, NULL for empty bucket - forge_root_cid BLOB, -- last root known to be in Forge - created_at INTEGER NOT NULL +CREATE TABLE ms3t.segment_op_roots ( + seq BIGINT, + seq_within INT, + bucket TEXT NOT NULL, + root_cid BYTEA NOT NULL, + PRIMARY KEY (seq, seq_within), + FOREIGN KEY (seq) REFERENCES ms3t.segments(seq) ON DELETE CASCADE ); +CREATE INDEX ON ms3t.segment_op_roots (bucket, seq); + +CREATE SEQUENCE ms3t.segment_seq; ``` -`forge_root_cid` is plumbing for a batched-writes mode that isn't -currently active; in the deployed sync-writes mode it always equals -`root_cid` after each PUT/DELETE. 
+`forge_root_cid` is the per-bucket high-water mark of "what's +durably in Forge." When the flusher succeeds, it advances +`forge_root_cid` for every op-root the segment carried in the same +transaction that flips the segment's state to `flushed`. -## How the data is shaped +## Per-object data shape -Each S3 object's bytes get chunked into raw IPLD blocks (default 1 -MiB, raw codec, sha256 multihash) and pointed at by an -`ObjectManifest` (DAG-CBOR): +Each S3 object is represented by an `ObjectManifest` block whose +`Body` field describes how the bytes are framed. The Body shape is +polymorphic via the `Format` string; the only codec today is +`fixed-v1`. ```go type ObjectManifest struct { - Key string - ContentType string - Created int64 - Body Body + Key string `cborgen:"k"` + ContentType string `cborgen:"ct"` + Created int64 `cborgen:"t"` + Body Body `cborgen:"b"` } type Body struct { - Size int64 - ChunkSize int64 - Chunks []cid.Cid - SHA256 []byte // for ETag + Size int64 `cborgen:"s"` + SHA256 []byte `cborgen:"h"` // hex of this is the ETag we serve today + Content cid.Cid `cborgen:"c"` // points at format-specific DAG root + Format string `cborgen:"f"` } -``` - -The bucket itself is an MST keyed by S3 key, with leaves pointing at -manifest CIDs. The "current state" of a bucket is a single CID — the -MST root — held in the registry. - -Every PUT/DELETE produces a new MST root via the -forked-from-atproto MST in `pkg/ms3t/mst/`, which is content-addressed -all the way down. The MST itself is fully described in its package -docs. - -## How the data lives in Forge -For every S3 PUT, ms3t produces **one CAR file** containing: +const FormatFixed = "fixed-v1" -- the new body chunks (raw blocks) -- the new ObjectManifest -- the mutated MST nodes (the path from leaf to root) - -Plus a small **index blob** (also a CAR) describing where each inner -block sits within the data CAR, byte-offset-and-length, encoded as a -`blobindex.ShardedDagIndexView`. 
- -Both blobs are uploaded to piri. The index is registered with the -indexing-service via `assert/index`. From that point onward, any -inner CID (an MST node, a manifest, a body chunk) is resolvable via: - -1. Indexer query: `multihash → (CAR multihash, byte offset, length)` -2. Piri ranged GET: read `[offset, offset+length)` of the CAR - -The indexer + piri retrieval flow are how reads find anything. - -## The PUT flow - -``` -S3 client ms3t sprue services piri indexer - │ │ │ │ │ - │ PUT k=v │ │ │ │ - ├────────────▶│ │ │ │ - │ │ load HEAD root_cid │ │ │ - │ │ from registry │ │ │ - │ │ │ │ │ - │ │ chunk body into │ │ │ - │ │ raw blocks (in mem) │ │ │ - │ │ │ │ │ - │ │ mst.Add(key, mfCid): │ │ - │ │ reads existing nodes via Forge ─────┤ │ - │ │ ◀─ indexer + ranged piri GETs │ │ - │ │ produces new path nodes (in mem) │ │ - │ │ │ - │ │ pack body + manifest + mst nodes into one CAR │ - │ │ │ - │ │ piriclient.Allocate(carHash, carSize) │ - │ ├──────────────────────▶│ │ │ - │ │◀── presigned URL ─────┤ │ │ - │ │ │ - │ │ HTTP PUT carBytes ────────────────▶│ │ - │ │ │ - │ │ piriclient.Accept │ - │ ├──────────────────────▶│ │ │ - │ │ │ - │ │ build ShardedDagIndexView over CAR offsets │ - │ │ │ - │ │ Allocate + PUT + Accept the index blob ──┐ │ - │ │ ▼ │ - │ │ │ - │ │ self-issue space/content/retrieve │ - │ │ delegation (space → sprue) for the index blob │ - │ │ │ - │ │ indexerclient.PublishIndexClaim ─────────────────▶ - │ │ │ - │ │ registry: CAS root_cid old → new │ - │ │ │ - │ 200 OK + ETag │ - │◀────────────┤ │ +type FixedChunkerIndex struct { + ChunkSize int64 `cborgen:"cs"` + Chunks []cid.Cid `cborgen:"c"` +} ``` -This is **synchronous**: every step blocks the client's PUT. Three -piri round trips per PUT (data CAR allocate+PUT+accept, index -allocate+PUT+accept, then index claim publication). Read-after-write -is correct because the assert/index has been published before 200 is -returned. 
- -## The GET flow +The `BodyCodec` interface (`pkg/ms3t/bucket/chunker.go`) is the +seam: -``` -S3 client ms3t indexer piri - │ │ │ │ - │ GET k │ │ │ - ├────────────▶│ │ │ - │ │ load HEAD root from registry │ - │ │ │ - │ │ for each MST node walked from │ - │ │ root toward the leaf: │ - │ │ 1. indexer query for cid ────▶│ - │ │ 2. self-issue retrieve UCAN │ - │ │ 3. rclient.Execute on piri ──────────▶│ - │ │ ◀── block bytes (Range) ─────────┤ - │ │ 4. parse, follow next link │ - │ │ │ - │ │ once at the leaf manifest: │ - │ │ for each body chunk: same dance │ - │ │ │ - │ │ stream reassembled body to client │ - │ 200 + bytes │ │ - │◀────────────┤ │ +```go +type BodyWriter interface { + Chunk(ctx context.Context, w blockstore.WriteStore, r io.Reader) (Body, error) +} +type BodyReader interface { + Format() string + Open(ctx context.Context, bs blockstore.ReadStore, body Body) io.ReadCloser + OpenRange(ctx context.Context, bs blockstore.ReadStore, body Body, start, end int64) io.ReadCloser +} +type BodyCodec interface { BodyWriter; BodyReader } ``` -Every block read is a network round trip. There is no local cache -serving any of these reads. +`FixedChunker` reads the body in `ChunkSize`-byte (default 1 MiB) +segments, writes each as a raw IPLD block, and finishes with a +`FixedChunkerIndex` CBOR block listing the chunk CIDs in order. +`Body.Content` points at the index. Reads lazily fetch the index on +first call and stream chunks; ranged reads translate the absolute +range into `(chunkIndex, in-chunk-offset)` and skip ahead. -The `rclient.Execute` call wraps the GET with a UCAN auth header -(`X-Agent-Message`) carrying a `space/content/retrieve` invocation -chained back to the space root — piri rejects unauthenticated -retrievals. +Adding a new codec is a new `BodyCodec` implementation plus a new +`Format` constant; the Body / Manifest shape stays stable. 
-## Where the UCAN identity comes from +## Bucket as MST -ms3t generates and persists its own ed25519 keypair on first run. -That keypair is the **space**: a `did:key` whose private half is in -`/space.key`. ms3t is the root UCAN authority over its own -space, which lets it self-issue all the delegations it needs: +The bucket is a Merkle Search Tree (forked from the atproto MST in +`pkg/ms3t/mst/`, with relaxed key validation) keyed by S3 object +key. Each leaf points at an ObjectManifest CID. The bucket's +"current state" is a single MST root CID held at +`ms3t.buckets.root_cid`. -- For the indexer: a blanket `space/content/retrieve` with - `NoCaveats` so the indexer can fetch any blob in the space when - validating an index claim -- For piri retrievals: a 60-second `space/content/retrieve` proof - per Get, attached to a typed retrieve invocation -- For PublishIndexClaim: a per-call retrieval delegation scoped to - the specific index blob +Public MST methods used by the backend: `Add`, `Update`, `Delete`, +`Get`, `GetPointer`, `WalkLeavesFromNocache`. The MST is +content-addressed all the way down — every mutation produces a new +root CID. Mutated nodes are written through the staging buffer +(which feeds the log on Commit) via `tx.Put`/`tx.PutBlock`. -Sprue uses its own identity (`upload.pem` in smelt) for the piri -allocate/accept invocations and as the audience of ms3t's -self-issued retrieval delegations. 
So: - -- **Sprue identity**: signs piri-side blob lifecycle invocations -- **ms3t space keypair**: signs anything that needs to chain back to - "the owner of this space" - -## Components map +## Storage tiers (LSM) ``` - ┌────────────────────────┐ -S3 client ──────▶ │ ms3t HTTP listener │ pkg/ms3t/server/ - │ (S3 protocol → service)│ - └───────────┬────────────┘ - │ - ┌───────────┴────────────┐ - │ bucket.Service │ pkg/ms3t/bucket/ - │ load HEAD, mutate MST, │ - │ build CAR, commit │ - └─────┬───────────┬──────┘ - │ │ - ┌────────────────┘ └─────────────────┐ - │ │ - ┌───▼─────────────┐ ┌────────▼────────────┐ - │ registry.SQL │ │ blockstore.Forge │ - │ SQLite, HEAD │ │ reads via indexer │ - │ pointer per │ │ + piri rclient. │ - │ bucket │ │ Put: no-op. │ - └─────────────────┘ └─────────┬───────────┘ - │ - ┌────────────────────────┐ │ - │ uploader.Internal │ ◀────────┘ writes side - │ Submit: encode CAR, │ - │ piriclient + indexer- │ - │ client per call │ - └────────────┬───────────┘ - │ - ┌──────────────────┼──────────────────────┐ - ▼ ▼ ▼ - ┌──────────┐ ┌──────────┐ ┌────────────────┐ - │ sprue │ │ piri │ │ indexing- │ - │ routing │ │ blob │ │ service │ - │ piriclient │ store │ │ assert/index │ - └──────────┘ └──────────┘ └────────────────┘ - (in-process (HTTP w/ UCAN auth) - Go calls) + ┌────────────────────────────────────────────┐ + │ HOT open segment │ + │ AppendBatch fsyncs CAR + .ops sidecar│ ◀─┐ + │ before returning │ │ + └──────────────┬─────────────────────────────┘ │ + │ │ + seal-on-bytes / seal-on-age │ reads + │ │ fall + ┌──────────────▼─────────────────────────────┐ │ through + │ WARM sealed segments on local disk │ │ here + │ .idx sidecar persisted │ │ + │ (atomic tmp+rename) │ ◀─┤ + │ MarkSegmentSealed in Postgres │ │ + └──────────────┬─────────────────────────────┘ │ + │ │ + Flush callback │ + │ │ + ┌──────────────▼─────────────────────────────┐ │ + │ COLD shipped to Forge (piri + indexer) │ │ + │ per-bucket forge_root_cid advanced │ ◀─┤ + │ 
retention sweeps after cfg.Retain │ │ + └──────────────┬─────────────────────────────┘ │ + │ │ + network reads │ + ▼ ▼ + ┌──────────────────┐ ┌──────────────────┐ + │ blockstore.Forge │ │ Layered.GetBlock │ + │ indexer + piri │ │ open → sealed → │ + │ ranged GETs │ │ Forge fall-through│ + └──────────────────┘ └──────────────────┘ ``` -ms3t calls **sprue's services in-process** (Go function calls into -`pkg/piriclient`, `pkg/routing`, `pkg/indexerclient`). It does not -loopback through sprue's HTTP/UCAN handler. sprue's own UCAN -endpoint and ms3t's S3 endpoint are two unrelated listeners in the -same process. - -## Choices we made (and the alternatives) - -These are **prototype decisions**, made to ship something working. -Each is a place the team should weigh in on whether the choice -holds up. - -### Sync writes, no local block cache - -Every PUT blocks on three Forge round trips. Every GET hits the -network for every block. There is no local SQLite blockstore active -in this mode. - -- **Why we picked this**: forces the read path to actually work - end-to-end against real Forge. Closes the read-after-write race - by construction. Simplest possible state model: only the registry - is mutable. -- **Why it's awkward**: `aws s3 sync` of many small files is slow. - An MST traversal during a PUT pays N network round trips for N - existing nodes on the path, even though those nodes are - deterministic. -- **Alternative we have code for**: `Batched(Internal)` uploader + - SQLite blockstore as a read-through cache. This is the default - when `ms3t.forge.no_cache: false`. Faster, but the - `forge_root_cid` machinery has to actually do something — and the - read-after-write window opens. - -### ms3t owns its space - -ms3t generates its own ed25519 keypair and is the root UCAN -authority over its own space. Self-issues every delegation it needs. - -- **Why we picked this**: zero out-of-band provisioning. 
The first - time sprue starts with `forge.enabled`, ms3t writes a key and - uses it. No "go ask the delegator for a delegation, paste it - here." -- **Why it's awkward**: ms3t-as-space-root is unusual. In a real - multi-tenant deployment this doesn't model what we'd want — each - S3 customer would presumably have their own space, with ms3t - acting as a tenant-aware orchestrator. -- **Alternative we considered**: ms3t holds an externally-issued - delegation chain into a pre-provisioned space. Better tenant - story, requires delegation provisioning machinery. - -### One CAR per S3 op (body + structural) - -Body chunks ride in the same CAR as the structural blocks. The -indexer maps inner CIDs to byte ranges within the outer CAR. One -data-CAR upload + one index-blob upload per PUT. - -- **Why we picked this**: matches what guppy does for filesystem - uploads — minimum number of piri round trips per PUT. Body - retrievals work via ranged GETs against the outer CAR. -- **Why it's awkward**: rules out direct-passthrough of body bytes - (we'd want body chunks as their own piri blobs so a 307 redirect - has a stable URL target). -- **Alternative we considered**: separate piri blobs per body - chunk, smaller structural CAR for the MST + manifest. Doubles - the per-PUT round trip count but enables passthrough. - -### ms3t in the data path - -The S3 client uploads body bytes to ms3t; ms3t uploads to piri. -Same on the read side. ms3t pays the bandwidth. - -- **Why we picked this**: the alternative (direct passthrough) - needs a Forge feature we don't have — see "Direct passthrough" - under future directions. -- **Why it's awkward**: the operator running sprue + piri pays - bandwidth twice (client→sprue, sprue→piri) when conceptually - the bytes only need to move once. In a federated model where - piri storage is run by different operators, this becomes - structurally wrong (sprue's operator pays to deposit bytes onto - someone else's hardware). 
- -### Embedded in sprue - -ms3t lives at `sprue/pkg/ms3t/` and is wired by sprue's fx graph. -No deployment artifact distinct from sprue. - -- **Why we picked this**: zero auth coordination — ms3t is sprue, - it has all sprue's identities and clients in-process. One binary - to ship, one config file. -- **Why it's awkward**: every sprue release ships ms3t, every ms3t - change requires a sprue release. Sprue maintainers inherit MST - + S3 protocol surface area. -- **Alternative**: standalone ms3t binary, talks to sprue/piri via - external UCAN-over-HTTP. (This exists at - github.com/frrist/ms3t — a separate repo that was the original - prototype before we copied into sprue.) - -### Sticky-bucket routing (assumed but not built) - -The current code assumes a single ms3t instance per bucket, via the -in-process `sync.Mutex` per-bucket lock. There is no cross-instance -coordination. - -- **Why we picked this**: works for a single-process MVP. -- **What's needed for HA**: either sticky-bucket routing at a load - balancer (hash bucket name → ms3t instance) or multi-writer with - CAS retry and cache invalidation. Not implemented. - -## Operational characteristics observed - -These are observations from smelt, not promises: - -- `aws s3 cp small.txt s3://demo/k` (small file): a few hundred - milliseconds inside the docker network, dominated by the three - Forge round trips -- `aws s3 cp s3://demo/k -` immediately after: works (sync writes - close the race) -- `aws s3 sync` of many small files: visibly slow — each file pays - the full Forge round-trip cost serially per S3 PUT -- `aws s3 ls`: walks MST through the network; cost grows with - bucket size - -We have not measured anything precisely. These are rough impressions. - -## Known limitations - -- **Slow.** Sync writes + no read cache. No effort has gone into - performance. -- **No GC.** S3 DELETE removes the leaf from the MST. 
Body chunks - become unreferenced from the current root, but we don't tell - Forge to expire them. Storage grows monotonically. -- **No multipart upload.** S3 client splits files >8 MB into - multipart by default; we don't implement it. Operators have to - set `multipart_threshold = 5GB` in their AWS profile. -- **No `aws-chunked` body decoding.** The current AWS CLI default - upload format. Operators have to set - `request_checksum_calculation = when_required` to disable it. -- **Single-tenant.** One ms3t = one space. -- **Single-instance.** No HA story. -- **Disk and Guppy uploaders are dead code in sprue's wiring.** - They exist in `pkg/ms3t/uploader/` for the standalone-ms3t use - case; sprue only wires `Internal` (when forge enabled) or `Disk` - (when forge disabled, in the cache mode that isn't currently - deployed). - -## Aligning with the RFCs - -There's a parallel design effort for the per-object data layout that -predates this prototype: - -- `shard.rfc` (in this repo) — Forge S3 Facade sharding strategy -- [storacha/RFC #65](https://github.com/storacha/RFC/pull/65) — Filepack archive format -- [storacha/RFC #66](https://github.com/storacha/RFC/pull/66) — Virtual DAG in Sharded DAG Index - -Together these propose: shard at 256 MB; each shard is a Filepack -data archive (raw concatenated bytes, no CAR overhead); a UnixFS -File root links the shards in order; a v0.2 Sharded DAG Index -inlines that UnixFS root via its new `blocks` property. - -The MST-as-bucket idea is **orthogonal to all three RFCs** — they -address per-object data layout, not how a bucket is structured. So -the MST work in this PR is independent of whether we adopt the -RFCs' direction. 
- -The per-object layer of this prototype diverges from the RFCs: - -| | RFCs | this PR | -|---|---|---| -| shard format | Filepack (raw bytes) | raw IPLD blocks inside one CAR | -| per-object root | UnixFS File node | `ObjectManifest` (CBOR) | -| SDI version | v0.2 with inline `blocks` | v0.1 | -| chunk/shard size | 256 MB | 1 MiB | - -Aligning would mean replacing the body fields of `ObjectManifest` -with a single `cid.Cid` pointing at the UnixFS root, and producing -Filepack shards instead of raw blocks inside a CAR. The MST -machinery is unaffected. +The read path (`blockstore.Layered`): + +1. Open segment's in-memory index (CIDs from blocks just appended). +2. Sealed segments on local disk, newest-first by seq. +3. Forge — only reached on local miss. `blockstore.Forge` queries + the indexer for the block's `(CAR multihash, offset, length)`, + self-issues a scoped retrieval delegation, and does a ranged + GET against piri. + +The write path (per S3 op): + +1. `bucketop.Coordinator.Begin(bucket)` — clones the bucket name + (defends against fiber's recycled request buffer), acquires the + per-bucket lock, snapshots the bucket's State from the registry. +2. `BodyCodec.Chunk(ctx, tx, body)` — writes body chunks + + FixedChunkerIndex through `tx.PutBlock`/`tx.Put` (which buffer + in `OpStaging`). +3. `tx.Put(manifest)` — writes the ObjectManifest block. +4. `t.Add(key, mfCid)` (or Update / Delete) → `t.GetPointer(tx)` — + serializes the new MST nodes through the same staging buffer, + returns the new root CID. +5. `tx.Commit(newRoot)`: + - `staging.Commit` calls `log.AppendBatch(blocks, OpRoot{bucket, + root})`. Segment.append fsyncs CAR + .ops before returning. + - `reg.CASRoot(bucket, expect, next)` advances the bucket Root + in Postgres. + - Releases the per-bucket lock. +6. Return 200 to the client. + +The flush path (background goroutine in `logstore.Store`): + +1. Pick a sealed segment off the queue. +2. 
Build a `uploader.CARSource` from segment metadata + (`{Path, Size, SHA256, Positions}` — every field already on the + segment, no rescan). +3. `uploader.Forge.SubmitCAR`: + - Allocate + HTTP PUT (streaming straight from `CARSource.Path`) + + Accept the data CAR via a piri selected by routing. + - Build a `ShardedDagIndexView` from `CARSource.Positions`, + archive it, allocate + PUT + Accept the index blob. + - Self-issue a `space/content/retrieve` delegation scoped to the + index blob. + - Publish the index claim against the indexing-service. +4. `meta.MarkSegmentFlushed(seq, flushedAt, opRoots)` in one + Postgres transaction — flips state to `flushed`, writes + `flushed_at`, advances `forge_root_cid` for every op-root the + segment carried. +5. Retention: if there are more than `Retain` flushed segments on + disk, retire the oldest (close fds, unlink files, delete the + Postgres row). + +The default seal triggers (set in `pkg/ms3t/logstore/config.go`) +are 64 MiB or 5s; both can be overridden via `ServerConfig`. + +## Module map -### ObjectManifest still has a place under the RFCs - -Even after aligning with the RFCs, we'd still want a small per-object -manifest block: - -```go -type ObjectManifest struct { - Content cid.Cid // → UnixFS File root - ContentType string // S3 needs this on GET - Created int64 // S3 needs this for Last-Modified - // user metadata, cache-control, etc. 
as needed -} +``` +pkg/ms3t/ +├── server.go — Server, ServerConfig, ServerDeps, New, newFlushFunc +├── module.go — fx Module + registerLifecycle (production wiring) +├── util.go — LoadOrCreateSigner (space.key) +│ +├── s3frontend/ — versitygw backend.Backend implementation +│ ├── backend.go — Backend, Recover (no-op), Drain (Coordinator.Close) +│ ├── bucket.go — bucket-level handlers + ACL/policy/lock/versioning stubs +│ └── object.go — object-level handlers + listWalk + lookupManifest +│ +├── bucketop/ — per-bucket write transaction primitive +│ └── bucketop.go — Coordinator, Tx, WithTx, WithLock, MutateFn +│ +├── blockstore/ — read/write contracts + impls + Log seam +│ ├── store.go — Reader, Writer, Store, BlockReader/Writer, etc. +│ ├── log.go — Log interface, OpRoot, BlockLoc +│ ├── staging.go — OpStaging (per-op buffer) +│ ├── layered.go — Layered (composite read tier) +│ └── forge.go — Forge (network base reader; no writes) +│ +├── logstore/ — LSM-style segment-based log +│ ├── store.go — Store, Open, AppendBatch, Get, Close +│ ├── segment.go — Segment lifecycle + on-disk format +│ ├── recovery.go — startup reconciliation +│ ├── config.go — Config (Dir/SealBytes/SealAge/Retain/Flush/Meta) +│ └── types.go — Meta interface, SegmentMeta, State +│ +├── uploader/ — ship sealed segment to Forge +│ └── forge.go — Uploader interface, CARSource, Forge.SubmitCAR +│ +├── registry/ — Postgres-backed bucket and segment metadata +│ ├── registry.go — Registry interface + State +│ ├── postgres.go — Postgres bucket methods +│ └── segments.go — Postgres methods satisfying logstore.Meta +│ +├── bucket/ — per-object data model + body codec +│ ├── manifest.go — ObjectManifest, Body, FormatFixed, FixedChunkerIndex +│ ├── chunker.go — BodyWriter / BodyReader / BodyCodec / FixedChunker +│ └── cbor_gen.go — generated by gen/ +│ +├── mst/ — atproto fork (relaxed key validation) +│ +├── cars/ — CAR encoding / scanning helpers +│ ├── encoder.go +│ └── reader.go +│ +├── migrations/ — 
goose-applied SQL embed +│ └── sql/{00001_init,00002_segments}.sql +│ +├── testing/ — smoke harness + curated suite tests +│ ├── harness.go — StartHarness + in-memory deps fakes +│ ├── integration.go — Run/RunT, upstream Suite values +│ ├── smoke_test.go — TestSmoke_* / TestSmokeXFail_* tables +│ ├── harness_test.go — TestHarnessLifecycle +│ └── listbuckets_test.go — TestListBucketsNamesStable (regression) +│ +└── gen/ — cborgen for bucket/cbor_gen.go ``` -The S3 protocol metadata (Content-Type, Last-Modified, user -`x-amz-meta-*` headers) doesn't have a natural home in UnixFS or -the SDI. UnixFS-Plus extensibility is thin and not well-supported. -Inlining the manifest as a block in the SDI's `blocks` (alongside -the UnixFS root) is possible but mixes layers. - -Decision: keep ObjectManifest as a separate CBOR block in the same -CAR as the MST mutation, with the MST leaf pointing at the manifest -CID — the same shape we have today. Just smaller, with the body -fields replaced by a single Content link to the UnixFS root. - -If GET latency becomes a real concern, inlining the manifest block -in the per-object SDI is a one-line change and saves a network hop. -Defer until needed. - -## Future directions (not implemented) - -### Direct passthrough - -The S3 client uploads body bytes directly to a piri presigned URL -via 307 redirect; ms3t never sees the bytes. Symmetric on reads. - -- ms3t becomes purely control-plane -- Bandwidth shifts to piri's operator (correct in the federated - model) -- Blocked on a Forge feature: piri/sprue must gate the - client-visible 200 on an ms3t-side commit hook so ms3t can - finalize the MST mutation before the client believes the PUT - succeeded. Without this, the PUT-to-MST-commit window is a real - race. - -### Async writes - -`Batched(Internal)` uploader: ack the PUT after local commit, ship -to Forge in the background. Faster, but introduces a window where -PUT-then-immediate-GET fails until the batch flushes. 
Code already -exists; it's the default mode when `no_cache: false`. We just don't -run with it. - -### Read-through cache - -SQLite blockstore populated on writes, consulted before falling -through to Forge. Order-of-magnitude speedup on hot reads at the -cost of cache invalidation complexity (when does ms3t know its -cached version is stale? Probably "never on its own" — would need -inputs from sprue's existing replay/invalidation mechanisms.). - -### Multi-tenant - -One ms3t serving N S3 customers, each in their own space. Requires -either: -- Per-tenant space delegations imported into ms3t (provisioning - machinery), or -- ms3t generating + tracking per-tenant spaces, with some external - authority for tenant identity - -### Multi-instance - -Either sticky-bucket routing at a load balancer (bucket name → ms3t -instance via consistent hash) or proper multi-writer with CAS retry -+ cache invalidation. Both unbuilt. - -### Multipart upload + aws-chunked - -Real S3 compatibility. Both are well-defined extensions of the -current per-PUT model — multipart effectively becomes "many -UploadPart calls accumulate body chunks; CompleteMultipartUpload -fires the MST mutation." - -### GC - -Walk reachable from current HEAD (and any retained snapshots), mark -those CIDs, ask Forge to expire the rest. Forge would need to grow -an `assert/expire`-style claim, and we'd need a retention policy. - -## Open questions for the team - -1. **Sync vs batched writes for MVP**: is `aws s3 sync` slowness - acceptable for now, or should we wire `Batched` and accept the - read-after-write window? - -2. **Tenant model**: when we want N S3 customers, do they share - ms3t's space or each get their own? The latter implies a - provisioning step we currently avoid. - -3. **Where should ms3t actually run?** Embedded in sprue is what - we have. Standalone ms3t-with-Guppy works too (the original - prototype). Embedded-in-piri was discussed and rejected. 
Are - there scenarios where standalone matters more than we've - assumed? - -4. **Direct passthrough's commit-hook feature**: is this on - anyone's roadmap? It's the lever for federated topologies. If - not, the "ms3t in the data path" choice becomes load-bearing - for any deployment beyond a single operator. - -5. **Server-side concat for large GETs**: a multi-chunk body has - no clean direct-passthrough path because there's no single URL - to redirect to. Either large-object reads always go through - ms3t (current behavior), or piri grows a "stream this ordered - list of multihashes as one body" capability. - -6. **MST for buckets, registry for buckets**: the registry - (bucket → root CID) is itself a `string → CID` map. We could - make it an MST too, store the registry MST in Forge, and have - only one mutable pointer (the registry MST root). Discussed - earlier; rejected for now because the registry needs SQL-style - transactional CAS that Forge doesn't provide. +## Interfaces and seams -7. **Should the standalone ms3t repo at github.com/frrist/ms3t - continue to exist?** It has the same code (modulo imports) and - no consumer. The Disk and Guppy uploaders only make sense - there. +| Contract | Production impl | Test impl | +|---|---|---| +| `versitygw/backend.Backend` | `s3frontend.Backend` | (same; harness boots the full server) | +| `blockstore.Log` | `logstore.Store` | (same) | +| `blockstore.BlockReader` | `blockstore.Forge` | `testing.nopBaseReader` | +| `registry.Registry` + `logstore.Meta` | `*registry.Postgres` (one struct, both interfaces) | `testing.memStore` (one struct, both) | +| `uploader.Uploader` | `uploader.Forge` | `testing.nopUploader` | +| `bucket.BodyCodec` | `*bucket.FixedChunker` | (same) | + +`s3frontend.Backend` is constructed with `(reg, rs, log, codec)` — +note that the read seam is a `blockstore.ReadStore` (no Put method), +so write paths can't accidentally route through it. 
Writes go via +`bucketop.Tx` which exposes the staging buffer behind the same +`Reader`/`Writer`/`BlockReader`/`BlockWriter` interfaces. + +## Lifecycle: Server.New → Start → Stop + +`pkg/ms3t/server.go::New(ctx, cfg, deps)`: + +1. Validate inputs (`Addr`, `DataDir`, `RootAccess/RootSecret`, all + `ServerDeps` fields present). +2. Apply defaults (`Region` → `us-east-1`, `ChunkSize` → + `bucket.DefaultChunkSize` = 1 MiB, `MaxConnections` / + `MaxRequests` → 4096). +3. Build a `logstore.FlushFunc` closure capturing the uploader + + meta — this is what runs per sealed segment off the flush + goroutine. +4. `logstore.Open(...)` — runs recovery (see next section), starts + the flush + seal-ticker goroutines. +5. Construct `blockstore.NewLayered(log, deps.BaseBlockReader)`. +6. Construct `s3frontend.New(deps.Registry, layered, log, codec)`. +7. Build the versitygw `s3api.S3ApiServer` with single-account IAM, + no audit/event sinks, generous concurrency limits. + +`Start`: calls `Backend.Recover` (a no-op today; the LSM already +recovered in `logstore.Open`) and spawns the listener goroutine +(`s3api.ServeMultiPort`). + +`Stop`: shuts the listener down and calls `Backend.Drain`, which +calls `Coordinator.Close` → `Log.Close` (force-seal the open +segment, drain the flush queue). Returns the joined error of both +steps. + +## Recovery on startup + +`logstore.Open` runs full reconciliation between disk and Postgres +before accepting writes: + +1. Scan `/segments/` for `.car` files. +2. Query `Meta.ListUnflushedSegments()` for open/sealed rows. +3. Reconcile by `seq`: + - **File + DB open** → rebuild as open via `cars.ScanFile` + + `readAllOps`. Force-seal at startup; we never resume an open + segment from a previous process. + - **File + DB sealed** → load from `.idx`, re-enqueue for flush. + - **File + .idx, no DB row** → rehydrate the DB row (the .idx is + authoritative for sealed state), keep for retention. 
+ - **File only (no .idx, orphan from a torn seal)** → rebuild as + open, seed DB, force-seal. + - **DB row, no file** → log error, delete the DB row. +4. Sealed segments are placed at the head of the read fall-through + list (newest-first by seq) so reads find recent writes first. + +## Identity / Forge wiring + +ms3t generates and persists its own ed25519 keypair on first run at +`/space.key`. That keypair is the **space**: a `did:key` +over which ms3t is the root UCAN authority. + +| Identity | Used for | +|---|---| +| ms3t's space signer | self-issuing `space/content/retrieve` delegations (read path, indexer claim publication, piri retrievals) | +| sprue's identity | piri allocate/accept invocations, audience of ms3t's self-issued retrieval delegations | + +Sprue is the audience for those delegations because `uploader.Forge` +talks to piri *as sprue*. ms3t-as-space-root keeps zero-out-of-band +provisioning at the cost of a not-very-multi-tenant story; that's a +tradeoff to revisit if/when ms3t serves more than one customer. + +## Testing surface + +- **`pkg/ms3t/testing/harness.go`** — `StartHarness(ctx, opts...)` + boots a real `*ms3t.Server` on a random `127.0.0.1` port with + in-memory deps (`memStore` for Registry+Meta, `nopBaseReader` for + the Layered base, `nopUploader` so flush is a no-op). Options: + `WithLogger`, `WithRegion`, `WithCredentials`, `WithChunkSize`, + `WithSealConfig`, `WithReadyTimeout`. Each call gets its own + scratch tempdir; cleanup is registered against the test's `t`. +- **`pkg/ms3t/testing/integration.go`** — wraps versitygw's upstream + `tests/integration` package. `Run(ctx, c, suite) Result` snapshots + versitygw's package-level pass/fail counters before/after and + returns the delta; `RunT(t, c, suite) Result` drives `Run` and + reports failures via `t.Errorf`. Curated `Suite` constants: + `Smoke`, `CRUD`, `Multipart`, `Tagging`, `ObjectLock`, + `Versioning`, `Auth`, `Full`.
+- **`pkg/ms3t/testing/smoke_test.go`** — one top-level `Test` per S3 + group (`TestSmoke_CreateBucket`, `TestSmoke_PutObject`, …) plus + matching `TestSmokeXFail_*` for cases ms3t fails today. Each test + is a table-driven Go test (so GoLand renders one play-icon per + row). XFail tests treat per-case failures as `t.Skip` and only + fail if a case unexpectedly passes — that's the cue to promote + the row to the matching `TestSmoke_*`. + +Today: **66 cases** pass via `TestSmoke_*`, **53 cases** are tracked +as known-failing via `TestSmokeXFail_*` (total 119, matching the +upstream Smoke set). + +## Not implemented + +- **Multipart upload.** Per project decision, multipart in-flight + state will live in service-side storage, NOT folded into the MST. + Out of scope today. +- **ACLs, bucket policy, object lock, versioning, tagging.** The + always-called middleware methods (`GetBucketAcl`, + `GetBucketPolicy`, `GetObjectLockConfiguration`, + `GetBucketVersioning`) return polite empty / "not configured" + responses so PUT/GET don't trip on `ErrNotImplemented`. The + full surface is unimplemented. +- **Standard ETag.** S3 uses `md5(body)` hex for single-part PUTs + (and a different format for multipart). ms3t currently returns + `sha256(body)` hex. Adding md5 tracking to the `Body` record is + the agreed fix; tracked under the `PutObject_success` smoke case. +- **User metadata round-trip** (`x-amz-meta-*`, + `Content-Disposition`, etc.). `ObjectManifest` doesn't carry a + user-metadata map yet. Tracked under `PutObject_with_metadata` / + `HeadObject_success` smoke cases. +- **Range support on HeadObject.** GetObject honors `Range`; + HeadObject doesn't. +- **Conditional reads/writes** (`If-Match`, `If-None-Match`). +- **Server-side checksum surface** (CRC64NVME, etc.). The body's + sha256 is computed, but we don't surface checksum response + headers in `x-amz-checksum-*` form, and we don't validate + client-supplied checksums. 
+- **GC of unreferenced bodies.** `forge_root_cid` is a high-water + mark — anything reachable from `root_cid` but not from + `forge_root_cid` is "in flight" — but there's no expiry path to + Forge yet, so storage grows monotonically. +- **Multi-tenancy.** ms3t is the space owner; one instance ↔ one + space. +- **Multi-instance / HA.** The per-bucket lock is in-process. A + multi-writer story would need cross-process coordination. + +## Known TODOs in code + +- `pkg/ms3t/blockstore/staging.go` — `OpStaging` buffers an entire + S3 op's blocks in memory until Commit. For multi-GB PUTs this + puts peak memory at ≈ payload size. A file-backed alternative + (CAR-shaped temp file + `cid → (offset, length)` index) would + cap the per-tx footprint at one chunk + index. The interface is + unchanged; only the storage backend would swap. +- `pkg/ms3t/registry/segments.go` — `forge_root_cid` is orphaned if + `staging.Commit` succeeds but `reg.CASRoot` fails afterwards. + Proposed fix: conditional `UPDATE … AND root_cid = $newRoot` in + the per-op-root advance, so flush only advances `forge_root_cid` + for buckets whose Root we actually recorded. ## Reading the code -If you're new to ms3t and want to follow a request through: - -- **PUT**: `pkg/ms3t/server/handlers.go::putObject` → - `bucket.Service.PutObject` (in `pkg/ms3t/bucket/bucket.go`) → - `chunker.putBody` → `mst.Add` → `CARBuffer.Commit` → - `uploader.Internal.Submit` (in `pkg/ms3t/uploader/internal.go`) - → registry CAS - -- **GET**: `pkg/ms3t/server/handlers.go::getObject` → - `bucket.Service.GetObject` → `mst.Get` (every node fetched via - `blockstore.Forge.Get` in `pkg/ms3t/blockstore/forge.go`) → - manifest decoded → body chunks fetched the same way → streamed - to client - -- **Where things plug into sprue**: `internal/fx/ms3t.go`. This is - the only sprue-side file that knows about ms3t. - -- **The MST itself**: `pkg/ms3t/mst/`. This is a fork of the - atproto MST with relaxed key validation.
Standalone, no - dependencies on the rest of ms3t. +If you're new and want to follow a request through: + +- **PUT**: `s3frontend.Backend.PutObject` (object.go) → + `bucketop.Coordinator.WithTx` (bucketop.go) → + `bucket.FixedChunker.Chunk` (chunker.go) → + `mst.MerkleSearchTree.Add` + `GetPointer` → `Tx.Commit` → + `OpStaging.Commit` → `logstore.Store.AppendBatch` → + `Segment.append` (fsyncs) → `registry.Postgres.CASRoot` → 200 OK. +- **GET**: `s3frontend.Backend.GetObject` (object.go) → + `lookupManifest` (registry → MST.Get over Layered → manifest + decode) → `FixedChunker.Open[Range]` over Layered → stream to + client. Every miss past the open segment falls through to sealed + segments and finally to `blockstore.Forge` (indexer + piri). +- **Flush**: `logstore.Store.flushLoop` → `cfg.Flush` (which is + `newFlushFunc` from `server.go`) → builds `CARSource` from + `Segment.{CARPath, Size, SHA256, BlockPositions}` → + `uploader.Forge.SubmitCAR` (allocate + PUT + accept + index + + claim) → `meta.MarkSegmentFlushed`. +- **Recovery**: `logstore.recovery.go` reconciles + `/segments/` against `Meta.ListUnflushedSegments`. +- **Where ms3t plugs into sprue**: `pkg/ms3t/module.go::Module` is + the only fx-aware file. `registerLifecycle` builds the + production-only collaborators (Forge reader, Postgres registry, + Forge uploader, space signer, migrations) and hands them to + `New` from `server.go`. +- **The MST itself**: `pkg/ms3t/mst/`. Standalone fork of the + atproto MST with relaxed key validation; no other ms3t deps.