diff --git a/cmd/api.go b/cmd/api.go
index 86f3ddb..a9443b1 100644
--- a/cmd/api.go
+++ b/cmd/api.go
@@ -2,6 +2,7 @@ package cmd
 
 import (
 	"context"
+	_ "embed"
 	"encoding/json"
 	"fmt"
 	"net/http"
@@ -25,6 +26,9 @@ import (
 	"github.com/spf13/viper"
 )
 
+//go:embed openapi.yaml
+var openapiSpec []byte // contents of openapi.yaml from this package's directory, embedded at build time; served as-is by handleOpenAPISpec
+
 var apiCmd = &cobra.Command{
 	Use:   "api",
 	Short: "Start the Distill API server (standalone, no vector DB required)",
@@ -274,6 +278,8 @@ func runAPI(cmd *cobra.Command, args []string) error {
 	mux.HandleFunc("/metrics", func(w http.ResponseWriter, r *http.Request) {
 		m.Handler().ServeHTTP(w, r)
 	})
+	mux.HandleFunc("/openapi.yaml", server.handleOpenAPISpec)
+	mux.HandleFunc("/docs", server.handleDocs)
 	mux.HandleFunc("/", server.handleRoot)
 
 	// CORS middleware
@@ -349,17 +355,53 @@ func (s *APIServer) handleRoot(w http.ResponseWriter, r *http.Request) {
 	w.Header().Set("Content-Type", "application/json")
 	_ = json.NewEncoder(w).Encode(map[string]interface{}{
 		"name":    "Distill API",
-		"version": "1.0.0",
-		"docs":    "https://distill.siddhantkhare.com/docs",
+		"version": "0.9.0",
+		"docs":    "/docs",
+		"openapi": "/openapi.yaml",
 		"endpoints": map[string]string{
 			"dedupe":        "POST /v1/dedupe",
 			"dedupe_stream": "POST /v1/dedupe/stream",
+			"pipeline":      "POST /v1/pipeline",
+			"memory_store":  "POST /v1/memory/store",
+			"memory_recall": "POST /v1/memory/recall",
 			"health":        "GET /health",
 			"metrics":       "GET /metrics",
 		},
 	})
 }
 
+func (s *APIServer) handleOpenAPISpec(w http.ResponseWriter, r *http.Request) { // handleOpenAPISpec writes the embedded OpenAPI document; it does not inspect r, so the method is not checked here.
+	w.Header().Set("Content-Type", "application/yaml")
+	w.Header().Set("Access-Control-Allow-Origin", "*") // allow any origin to read the spec; NOTE(review): a CORS middleware is also installed in runAPI, confirm the header is not emitted twice
+	_, _ = w.Write(openapiSpec) // byte count and write error are intentionally discarded
+}
+
+func (s *APIServer) handleDocs(w http.ResponseWriter, r *http.Request) { // handleDocs writes a fixed HTML page that boots Swagger UI against /openapi.yaml; it does not inspect r.
+	w.Header().Set("Content-Type", "text/html; charset=utf-8") // the page below is a constant literal, nothing is interpolated into it
+	_, _ = w.Write([]byte(`<!DOCTYPE html>
+<html>
+<head>
+  <title>Distill API Docs</title>
+  <meta charset="utf-8"/>
+  <meta name="viewport" content="width=device-width, initial-scale=1">
+  <link rel="stylesheet" href="https://unpkg.com/swagger-ui-dist@5/swagger-ui.css">
+</head>
+<body>
+  <div id="swagger-ui"></div>
+  <script src="https://unpkg.com/swagger-ui-dist@5/swagger-ui-bundle.js"></script>
+  <script>
+    SwaggerUIBundle({
+      url: "/openapi.yaml",
+      dom_id: "#swagger-ui",
+      deepLinking: true,
+      presets: [SwaggerUIBundle.presets.apis, SwaggerUIBundle.SwaggerUIStandalonePreset],
+      layout: "BaseLayout"
+    });
+  </script>
+</body>
+</html>`)) // write error intentionally discarded. NOTE(review): the CSS/JS come from unpkg.com at the floating "@5" tag with no integrity attribute, so the page needs outbound network access in the browser; consider pinning an exact version.
+}
+
 func (s *APIServer) handleDedupe(w http.ResponseWriter, r *http.Request) {
 	if r.Method != http.MethodPost {
 		http.Error(w, "Method not allowed", http.StatusMethodNotAllowed)
diff --git a/cmd/openapi.yaml b/cmd/openapi.yaml
new file mode 100644
index 0000000..d41f7d8
--- /dev/null
+++ b/cmd/openapi.yaml
@@ -0,0 +1,928 @@
+openapi: "3.1.0"
+info:
+  title: Distill API
+  version: 0.9.0
+  description: |
+    Context intelligence layer for LLM agents. Distill deduplicates, compresses,
+    and caches context before it reaches the model, and provides persistent memory
+    with sensitivity tagging and conflict detection.
+  license:
+    name: MIT
+    url: https://github.com/Siddhant-K-code/distill/blob/main/LICENSE
+
+servers:
+  - url: http://localhost:8080
+    description: Local development server
+
+tags:
+  - name: Dedupe
+    description: Semantic deduplication of context chunks
+  - name: Pipeline
+    description: Full dedup + compress + cache pipeline
+  - name: Batch
+    description: Async batch processing
+  - name: Memory
+    description: Persistent context memory store
+  - name: Session
+    description: Stateful context window management
+  - name: Health
+    description: Server health and metrics
+
+paths:
+  /v1/dedupe:
+    post:
+      tags: [Dedupe]
+      summary: Deduplicate chunks
+      description: |
+        Clusters semantically similar chunks and returns one representative per cluster.
+        Supports MMR re-ranking for relevance + diversity balance.
+      requestBody:
+        required: true
+        content:
+          application/json:
+            schema:
+              $ref: "#/components/schemas/DedupeRequest"
+      responses:
+        "200":
+          description: Deduplicated chunks
+          content:
+            application/json:
+              schema:
+                $ref: "#/components/schemas/DedupeResponse"
+        "400":
+          description: Invalid request
+
+  /v1/dedupe/stream:
+    post:
+      tags: [Dedupe]
+      summary: Deduplicate chunks (SSE stream)
+      description: |
+        Same as `/v1/dedupe` but returns results as Server-Sent Events with
+        per-stage progress updates.
+      requestBody:
+        required: true
+        content:
+          application/json:
+            schema:
+              $ref: "#/components/schemas/DedupeRequest"
+      responses:
+        "200":
+          description: SSE stream of dedup progress and results
+          content:
+            text/event-stream: {}
+
+  /v1/pipeline:
+    post:
+      tags: [Pipeline]
+      summary: Run full pipeline
+      description: |
+        Runs the complete dedup → compress → summarize → cache pipeline.
+        Returns processed chunks with per-stage statistics.
+      requestBody:
+        required: true
+        content:
+          application/json:
+            schema:
+              $ref: "#/components/schemas/PipelineRequest"
+      responses:
+        "200":
+          description: Pipeline results
+          content:
+            application/json:
+              schema:
+                $ref: "#/components/schemas/PipelineResponse"
+        "400":
+          description: Invalid request
+
+  /v1/batch:
+    post:
+      tags: [Batch]
+      summary: Submit batch job
+      description: Submit a batch of chunks for async processing.
+      requestBody:
+        required: true
+        content:
+          application/json:
+            schema:
+              $ref: "#/components/schemas/BatchSubmitRequest"
+      responses:
+        "202":
+          description: Job accepted
+          content:
+            application/json:
+              schema:
+                $ref: "#/components/schemas/BatchSubmitResponse"
+
+  /v1/batch/{job_id}:
+    get:
+      tags: [Batch]
+      summary: Get batch job status
+      parameters:
+        - name: job_id
+          in: path
+          required: true
+          schema:
+            type: string
+      responses:
+        "200":
+          description: Job status
+          content:
+            application/json:
+              schema:
+                $ref: "#/components/schemas/BatchStatusResponse"
+        "404":
+          description: Job not found
+
+  /v1/batch/{job_id}/results:
+    get:
+      tags: [Batch]
+      summary: Get batch job results
+      parameters:
+        - name: job_id
+          in: path
+          required: true
+          schema:
+            type: string
+      responses:
+        "200":
+          description: Job results
+          content:
+            application/json:
+              schema:
+                $ref: "#/components/schemas/BatchResultsResponse"
+        "404":
+          description: Job not found
+
+  /v1/memory/store:
+    post:
+      tags: [Memory]
+      summary: Store memories
+      description: |
+        Store one or more memory entries with write-time deduplication.
+        Supports sensitivity tagging (explicit or auto-classified) and
+        conflict detection against existing entries.
+      requestBody:
+        required: true
+        content:
+          application/json:
+            schema:
+              $ref: "#/components/schemas/StoreRequest"
+      responses:
+        "200":
+          description: Store result with conflict information
+          content:
+            application/json:
+              schema:
+                $ref: "#/components/schemas/StoreResult"
+
+  /v1/memory/recall:
+    post:
+      tags: [Memory]
+      summary: Recall memories
+      description: |
+        Retrieve memories ranked by relevance and recency. Supports tag boosting,
+        task context matching, and minimum relevance filtering.
+        Expired entries are excluded by default.
+      requestBody:
+        required: true
+        content:
+          application/json:
+            schema:
+              $ref: "#/components/schemas/RecallRequest"
+      responses:
+        "200":
+          description: Recalled memories with sensitivity metadata
+          content:
+            application/json:
+              schema:
+                $ref: "#/components/schemas/RecallResult"
+
+  /v1/memory/forget:
+    post:
+      tags: [Memory]
+      summary: Forget memories
+      description: Permanently remove memories by ID, tag, or age.
+      requestBody:
+        required: true
+        content:
+          application/json:
+            schema:
+              $ref: "#/components/schemas/ForgetRequest"
+      responses:
+        "200":
+          description: Forget result
+          content:
+            application/json:
+              schema:
+                $ref: "#/components/schemas/ForgetResult"
+
+  /v1/memory/expire:
+    post:
+      tags: [Memory]
+      summary: Expire memories
+      description: |
+        Mark memories as expired. Expired entries are excluded from recall
+        by default but remain in the store for auditing.
+      requestBody:
+        required: true
+        content:
+          application/json:
+            schema:
+              $ref: "#/components/schemas/ExpireRequest"
+      responses:
+        "200":
+          description: Expire result
+          content:
+            application/json:
+              schema:
+                $ref: "#/components/schemas/ExpireResult"
+
+  /v1/memory/supersede:
+    post:
+      tags: [Memory]
+      summary: Supersede a memory
+      description: |
+        Mark a memory as superseded by a newer entry. The old entry is expired
+        and a forward pointer to the replacement is stored.
+      requestBody:
+        required: true
+        content:
+          application/json:
+            schema:
+              $ref: "#/components/schemas/SupersedeRequest"
+      responses:
+        "200":
+          description: Supersede result
+          content:
+            application/json:
+              schema:
+                $ref: "#/components/schemas/SupersedeResult"
+        "404":
+          description: Old entry not found
+        "409":
+          description: Old entry already expired
+
+  /v1/memory/stats:
+    get:
+      tags: [Memory]
+      summary: Memory store statistics
+      responses:
+        "200":
+          description: Store statistics
+          content:
+            application/json:
+              schema:
+                $ref: "#/components/schemas/MemoryStats"
+
+  /v1/session/create:
+    post:
+      tags: [Session]
+      summary: Create a session
+      description: Create a new context window session with a token budget.
+      requestBody:
+        required: true
+        content:
+          application/json:
+            schema:
+              $ref: "#/components/schemas/SessionCreateRequest"
+      responses:
+        "200":
+          description: Created session
+          content:
+            application/json:
+              schema:
+                $ref: "#/components/schemas/Session"
+        "409":
+          description: Session already exists
+
+  /v1/session/push:
+    post:
+      tags: [Session]
+      summary: Push entries to a session
+      description: |
+        Add context entries to a session. Distill deduplicates and compresses
+        to stay within the token budget.
+      requestBody:
+        required: true
+        content:
+          application/json:
+            schema:
+              $ref: "#/components/schemas/SessionPushRequest"
+      responses:
+        "200":
+          description: Push result
+          content:
+            application/json:
+              schema:
+                $ref: "#/components/schemas/SessionPushResult"
+        "404":
+          description: Session not found
+        "413":
+          description: Over token budget
+
+  /v1/session/context:
+    post:
+      tags: [Session]
+      summary: Get session context
+      description: Retrieve the current context window for a session.
+      requestBody:
+        required: true
+        content:
+          application/json:
+            schema:
+              $ref: "#/components/schemas/SessionContextRequest"
+      responses:
+        "200":
+          description: Session context
+          content:
+            application/json:
+              schema:
+                $ref: "#/components/schemas/SessionContextResult"
+        "404":
+          description: Session not found
+
+  /v1/session/get:
+    get:
+      tags: [Session]
+      summary: Get session metadata
+      parameters:
+        - name: session_id
+          in: query
+          required: true
+          schema:
+            type: string
+      responses:
+        "200":
+          description: Session metadata
+          content:
+            application/json:
+              schema:
+                $ref: "#/components/schemas/Session"
+        "404":
+          description: Session not found
+
+  /v1/session/delete:
+    post:
+      tags: [Session]
+      summary: Delete a session
+      requestBody:
+        required: true
+        content:
+          application/json:
+            schema:
+              type: object
+              required: [session_id]
+              properties:
+                session_id:
+                  type: string
+      responses:
+        "200":
+          description: Delete result
+        "404":
+          description: Session not found
+
+  /health:
+    get:
+      tags: [Health]
+      summary: Health check
+      responses:
+        "200":
+          description: Server is healthy
+          content:
+            application/json:
+              schema:
+                type: object
+                properties:
+                  status:
+                    type: string
+                    example: ok
+
+  /metrics:
+    get:
+      tags: [Health]
+      summary: Prometheus metrics
+      responses:
+        "200":
+          description: Prometheus-formatted metrics
+          content:
+            text/plain: {}
+
+components:
+  schemas:
+    DedupeChunk:
+      type: object
+      required: [text]
+      properties:
+        id:
+          type: string
+        text:
+          type: string
+        embedding:
+          type: array
+          items:
+            type: number
+            format: float
+        score:
+          type: number
+          format: float
+        cache_control:
+          type: string
+          description: Anthropic cache_control marker
+
+    DedupeRequest:
+      type: object
+      required: [chunks]
+      properties:
+        chunks:
+          type: array
+          items:
+            $ref: "#/components/schemas/DedupeChunk"
+        threshold:
+          type: number
+          format: double
+          description: Cosine distance threshold for clustering
+        lambda:
+          type: number
+          format: double
+          description: MMR lambda (0=diversity, 1=relevance)
+        target_k:
+          type: integer
+          description: Target number of output chunks
+        options:
+          type: object
+          properties:
+            preserve_cache_prefix:
+              type: boolean
+              description: Freeze chunks before the last cache_control marker
+
+    DedupeResponse:
+      type: object
+      properties:
+        chunks:
+          type: array
+          items:
+            type: object
+            properties:
+              id:
+                type: string
+              text:
+                type: string
+              score:
+                type: number
+                format: float
+              cluster_id:
+                type: integer
+              cache_control:
+                type: string
+        stats:
+          type: object
+          properties:
+            input_chunks:
+              type: integer
+            output_chunks:
+              type: integer
+            reduction_pct:
+              type: number
+            clusters:
+              type: integer
+            latency_ms:
+              type: number
+
+    PipelineRequest:
+      type: object
+      required: [chunks]
+      properties:
+        chunks:
+          type: array
+          items:
+            $ref: "#/components/schemas/DedupeChunk"
+        options:
+          type: object
+          properties:
+            dedup:
+              type: boolean
+            compress:
+              type: boolean
+            summarize:
+              type: boolean
+            cache:
+              type: boolean
+
+    PipelineResponse:
+      type: object
+      properties:
+        chunks:
+          type: array
+          items:
+            $ref: "#/components/schemas/DedupeChunk"
+        stats:
+          type: object
+          properties:
+            total_input_tokens:
+              type: integer
+            total_output_tokens:
+              type: integer
+            total_reduction:
+              type: number
+            total_latency_ms:
+              type: number
+            stages:
+              type: object
+              additionalProperties:
+                type: object
+                properties:
+                  enabled:
+                    type: boolean
+                  input_tokens:
+                    type: integer
+                  output_tokens:
+                    type: integer
+                  reduction:
+                    type: number
+                  latency_ms:
+                    type: number
+
+    BatchSubmitRequest:
+      type: object
+      required: [chunks]
+      properties:
+        chunks:
+          type: array
+          items:
+            $ref: "#/components/schemas/DedupeChunk"
+        options:
+          $ref: "#/components/schemas/PipelineRequest/properties/options"
+
+    BatchSubmitResponse:
+      type: object
+      properties:
+        job_id:
+          type: string
+        status:
+          type: string
+
+    BatchStatusResponse:
+      type: object
+      properties:
+        job_id:
+          type: string
+        status:
+          type: string
+          enum: [pending, running, completed, failed]
+        progress:
+          type: number
+        error:
+          type: string
+        created_at:
+          type: string
+        started_at:
+          type: string
+        completed_at:
+          type: string
+
+    BatchResultsResponse:
+      type: object
+      properties:
+        job_id:
+          type: string
+        status:
+          type: string
+        chunks:
+          type: array
+          items:
+            $ref: "#/components/schemas/DedupeChunk"
+        stats:
+          $ref: "#/components/schemas/PipelineResponse/properties/stats"
+
+    StoreRequest:
+      type: object
+      required: [entries]
+      properties:
+        session_id:
+          type: string
+        entries:
+          type: array
+          items:
+            type: object
+            required: [text]
+            properties:
+              text:
+                type: string
+              embedding:
+                type: array
+                items:
+                  type: number
+                  format: float
+              source:
+                type: string
+              tags:
+                type: array
+                items:
+                  type: string
+              metadata:
+                type: object
+                additionalProperties: true
+              expires_at:
+                type: string
+                format: date-time
+              sensitivity:
+                type: integer
+                description: "0=none, 1=pii, 2=internal, 3=credentials"
+              auto_classify:
+                type: boolean
+                description: Run pattern-based sensitivity classification
+
+    StoreResult:
+      type: object
+      properties:
+        stored:
+          type: integer
+        merged:
+          type: integer
+        deduplicated:
+          type: integer
+        total_memories:
+          type: integer
+        conflicts:
+          type: array
+          items:
+            $ref: "#/components/schemas/Conflict"
+
+    Conflict:
+      type: object
+      properties:
+        new_id:
+          type: string
+        new_text:
+          type: string
+        existing_id:
+          type: string
+        existing_text:
+          type: string
+        distance:
+          type: number
+          format: double
+
+    RecallRequest:
+      type: object
+      required: [query]
+      properties:
+        query:
+          type: string
+        query_embedding:
+          type: array
+          items:
+            type: number
+            format: float
+        tags:
+          type: array
+          items:
+            type: string
+        max_tokens:
+          type: integer
+        max_results:
+          type: integer
+        recency_weight:
+          type: number
+          format: double
+          description: Weight for recency vs relevance (0-1)
+        include_expired:
+          type: boolean
+        task_context:
+          type: string
+          description: Task description for source-matching boost
+        boost_tags:
+          type: array
+          items:
+            type: string
+          description: Tags that receive a relevance boost
+        min_relevance:
+          type: number
+          format: double
+          description: Filter out memories below this score (0-1)
+
+    RecallResult:
+      type: object
+      properties:
+        memories:
+          type: array
+          items:
+            type: object
+            properties:
+              id:
+                type: string
+              text:
+                type: string
+              source:
+                type: string
+              tags:
+                type: array
+                items:
+                  type: string
+              relevance:
+                type: number
+                format: double
+              decay_level:
+                type: integer
+              sensitivity:
+                type: integer
+              last_referenced:
+                type: string
+                format: date-time
+        stats:
+          type: object
+          properties:
+            candidates:
+              type: integer
+            deduplicated:
+              type: integer
+            returned:
+              type: integer
+            token_count:
+              type: integer
+        max_sensitivity:
+          type: integer
+          description: Highest sensitivity level across returned memories
+        sensitive_chunks:
+          type: array
+          items:
+            type: object
+            properties:
+              chunk_id:
+                type: string
+              sensitivity:
+                type: integer
+
+    ForgetRequest:
+      type: object
+      properties:
+        ids:
+          type: array
+          items:
+            type: string
+        tags:
+          type: array
+          items:
+            type: string
+        before:
+          type: string
+          format: date-time
+
+    ForgetResult:
+      type: object
+      properties:
+        forgotten:
+          type: integer
+
+    ExpireRequest:
+      type: object
+      required: [ids]
+      properties:
+        ids:
+          type: array
+          items:
+            type: string
+
+    ExpireResult:
+      type: object
+      properties:
+        expired:
+          type: integer
+
+    SupersedeRequest:
+      type: object
+      required: [old_id]
+      properties:
+        old_id:
+          type: string
+        new_id:
+          type: string
+
+    SupersedeResult:
+      type: object
+      properties:
+        superseded:
+          type: boolean
+
+    MemoryStats:
+      type: object
+      properties:
+        total_memories:
+          type: integer
+        expired_count:
+          type: integer
+        active_count:
+          type: integer
+        by_decay_level:
+          type: object
+          additionalProperties:
+            type: integer
+        by_source:
+          type: object
+          additionalProperties:
+            type: integer
+        oldest_memory:
+          type: string
+          format: date-time
+        newest_memory:
+          type: string
+          format: date-time
+
+    SessionCreateRequest:
+      type: object
+      required: [max_tokens]
+      properties:
+        session_id:
+          type: string
+          description: Auto-generated if empty
+        max_tokens:
+          type: integer
+        dedup_threshold:
+          type: number
+          format: double
+        preserve_recent:
+          type: integer
+          description: Always keep last N entries at full fidelity
+
+    Session:
+      type: object
+      properties:
+        id:
+          type: string
+        max_tokens:
+          type: integer
+        used_tokens:
+          type: integer
+        entry_count:
+          type: integer
+        created_at:
+          type: string
+          format: date-time
+
+    SessionPushRequest:
+      type: object
+      required: [session_id, entries]
+      properties:
+        session_id:
+          type: string
+        entries:
+          type: array
+          items:
+            type: object
+            required: [role, content]
+            properties:
+              role:
+                type: string
+              content:
+                type: string
+              embedding:
+                type: array
+                items:
+                  type: number
+                  format: float
+
+    SessionPushResult:
+      type: object
+      properties:
+        added:
+          type: integer
+        deduplicated:
+          type: integer
+        compressed:
+          type: integer
+        tokens_used:
+          type: integer
+        tokens_remaining:
+          type: integer
+
+    SessionContextRequest:
+      type: object
+      required: [session_id]
+      properties:
+        session_id:
+          type: string
+        max_tokens:
+          type: integer
+          description: "0 = return full window"
+        role:
+          type: string
+          description: Filter by role
+
+    SessionContextResult:
+      type: object
+      properties:
+        entries:
+          type: array
+          items:
+            type: object
+            properties:
+              role:
+                type: string
+              content:
+                type: string
+              compression_level:
+                type: string
+              tokens:
+                type: integer
+        total_tokens:
+          type: integer
diff --git a/openapi.yaml b/openapi.yaml
new file mode 100644
index 0000000..d41f7d8
--- /dev/null
+++ b/openapi.yaml
@@ -0,0 +1,928 @@
+openapi: "3.1.0"
+info:
+  title: Distill API
+  version: 0.9.0
+  description: |
+    Context intelligence layer for LLM agents. Distill deduplicates, compresses,
+    and caches context before it reaches the model, and provides persistent memory
+    with sensitivity tagging and conflict detection.
+  license:
+    name: MIT
+    url: https://github.com/Siddhant-K-code/distill/blob/main/LICENSE
+
+servers:
+  - url: http://localhost:8080
+    description: Local development server
+
+tags:
+  - name: Dedupe
+    description: Semantic deduplication of context chunks
+  - name: Pipeline
+    description: Full dedup + compress + summarize + cache pipeline
+  - name: Batch
+    description: Async batch processing
+  - name: Memory
+    description: Persistent context memory store
+  - name: Session
+    description: Stateful context window management
+  - name: Health
+    description: Server health and metrics
+
+paths:
+  /v1/dedupe:
+    post:
+      tags: [Dedupe]
+      summary: Deduplicate chunks
+      description: |
+        Clusters semantically similar chunks and returns one representative per cluster.
+        Supports MMR re-ranking for relevance + diversity balance.
+      requestBody:
+        required: true
+        content:
+          application/json:
+            schema:
+              $ref: "#/components/schemas/DedupeRequest"
+      responses:
+        "200":
+          description: Deduplicated chunks
+          content:
+            application/json:
+              schema:
+                $ref: "#/components/schemas/DedupeResponse"
+        "400":
+          description: Invalid request
+
+  /v1/dedupe/stream:
+    post:
+      tags: [Dedupe]
+      summary: Deduplicate chunks (SSE stream)
+      description: |
+        Same as `/v1/dedupe` but returns results as Server-Sent Events with
+        per-stage progress updates.
+      requestBody:
+        required: true
+        content:
+          application/json:
+            schema:
+              $ref: "#/components/schemas/DedupeRequest"
+      responses:
+        "200":
+          description: SSE stream of dedup progress and results
+          content:
+            text/event-stream: {}
+
+  /v1/pipeline:
+    post:
+      tags: [Pipeline]
+      summary: Run full pipeline
+      description: |
+        Runs the complete dedup → compress → summarize → cache pipeline.
+        Returns processed chunks with per-stage statistics.
+      requestBody:
+        required: true
+        content:
+          application/json:
+            schema:
+              $ref: "#/components/schemas/PipelineRequest"
+      responses:
+        "200":
+          description: Pipeline results
+          content:
+            application/json:
+              schema:
+                $ref: "#/components/schemas/PipelineResponse"
+        "400":
+          description: Invalid request
+
+  /v1/batch:
+    post:
+      tags: [Batch]
+      summary: Submit batch job
+      description: Submit a batch of chunks for async processing.
+      requestBody:
+        required: true
+        content:
+          application/json:
+            schema:
+              $ref: "#/components/schemas/BatchSubmitRequest"
+      responses:
+        "202":
+          description: Job accepted
+          content:
+            application/json:
+              schema:
+                $ref: "#/components/schemas/BatchSubmitResponse"
+
+  /v1/batch/{job_id}:
+    get:
+      tags: [Batch]
+      summary: Get batch job status
+      parameters:
+        - name: job_id
+          in: path
+          required: true
+          schema:
+            type: string
+      responses:
+        "200":
+          description: Job status
+          content:
+            application/json:
+              schema:
+                $ref: "#/components/schemas/BatchStatusResponse"
+        "404":
+          description: Job not found
+
+  /v1/batch/{job_id}/results:
+    get:
+      tags: [Batch]
+      summary: Get batch job results
+      parameters:
+        - name: job_id
+          in: path
+          required: true
+          schema:
+            type: string
+      responses:
+        "200":
+          description: Job results
+          content:
+            application/json:
+              schema:
+                $ref: "#/components/schemas/BatchResultsResponse"
+        "404":
+          description: Job not found
+
+  /v1/memory/store:
+    post:
+      tags: [Memory]
+      summary: Store memories
+      description: |
+        Store one or more memory entries with write-time deduplication.
+        Supports sensitivity tagging (explicit or auto-classified) and
+        conflict detection against existing entries.
+      requestBody:
+        required: true
+        content:
+          application/json:
+            schema:
+              $ref: "#/components/schemas/StoreRequest"
+      responses:
+        "200":
+          description: Store result with conflict information
+          content:
+            application/json:
+              schema:
+                $ref: "#/components/schemas/StoreResult"
+
+  /v1/memory/recall:
+    post:
+      tags: [Memory]
+      summary: Recall memories
+      description: |
+        Retrieve memories ranked by relevance and recency. Supports tag boosting,
+        task context matching, and minimum relevance filtering.
+        Expired entries are excluded by default (see `include_expired`).
+      requestBody:
+        required: true
+        content:
+          application/json:
+            schema:
+              $ref: "#/components/schemas/RecallRequest"
+      responses:
+        "200":
+          description: Recalled memories with sensitivity metadata
+          content:
+            application/json:
+              schema:
+                $ref: "#/components/schemas/RecallResult"
+
+  /v1/memory/forget:
+    post:
+      tags: [Memory]
+      summary: Forget memories
+      description: Permanently remove memories by ID, tag, or age (see `ids`, `tags`, `before`).
+      requestBody:
+        required: true
+        content:
+          application/json:
+            schema:
+              $ref: "#/components/schemas/ForgetRequest"
+      responses:
+        "200":
+          description: Forget result
+          content:
+            application/json:
+              schema:
+                $ref: "#/components/schemas/ForgetResult"
+
+  /v1/memory/expire:
+    post:
+      tags: [Memory]
+      summary: Expire memories
+      description: |
+        Mark memories as expired. Expired entries are excluded from recall
+        by default but remain in the store for auditing.
+      requestBody:
+        required: true
+        content:
+          application/json:
+            schema:
+              $ref: "#/components/schemas/ExpireRequest"
+      responses:
+        "200":
+          description: Expire result
+          content:
+            application/json:
+              schema:
+                $ref: "#/components/schemas/ExpireResult"
+
+  /v1/memory/supersede:
+    post:
+      tags: [Memory]
+      summary: Supersede a memory
+      description: |
+        Mark a memory as superseded by a newer entry. The old entry is expired
+        and a forward pointer to the replacement is stored.
+      requestBody:
+        required: true
+        content:
+          application/json:
+            schema:
+              $ref: "#/components/schemas/SupersedeRequest"
+      responses:
+        "200":
+          description: Supersede result
+          content:
+            application/json:
+              schema:
+                $ref: "#/components/schemas/SupersedeResult"
+        "404":
+          description: Old entry not found
+        "409":
+          description: Old entry already expired
+
+  /v1/memory/stats:
+    get:
+      tags: [Memory]
+      summary: Memory store statistics
+      responses:
+        "200":
+          description: Store statistics
+          content:
+            application/json:
+              schema:
+                $ref: "#/components/schemas/MemoryStats"
+
+  /v1/session/create:
+    post:
+      tags: [Session]
+      summary: Create a session
+      description: Create a new context window session with a token budget.
+      requestBody:
+        required: true
+        content:
+          application/json:
+            schema:
+              $ref: "#/components/schemas/SessionCreateRequest"
+      responses:
+        "200":
+          description: Created session
+          content:
+            application/json:
+              schema:
+                $ref: "#/components/schemas/Session"
+        "409":
+          description: Session already exists
+
+  /v1/session/push:
+    post:
+      tags: [Session]
+      summary: Push entries to a session
+      description: |
+        Add context entries to a session. Distill deduplicates and compresses
+        to stay within the token budget.
+      requestBody:
+        required: true
+        content:
+          application/json:
+            schema:
+              $ref: "#/components/schemas/SessionPushRequest"
+      responses:
+        "200":
+          description: Push result
+          content:
+            application/json:
+              schema:
+                $ref: "#/components/schemas/SessionPushResult"
+        "404":
+          description: Session not found
+        "413":
+          description: Over token budget
+
+  /v1/session/context:
+    post:
+      tags: [Session]
+      summary: Get session context
+      description: Retrieve the current context window for a session.
+      requestBody:
+        required: true
+        content:
+          application/json:
+            schema:
+              $ref: "#/components/schemas/SessionContextRequest"
+      responses:
+        "200":
+          description: Session context
+          content:
+            application/json:
+              schema:
+                $ref: "#/components/schemas/SessionContextResult"
+        "404":
+          description: Session not found
+
+  /v1/session/get:
+    get:
+      tags: [Session]
+      summary: Get session metadata
+      parameters:
+        - name: session_id
+          in: query
+          required: true
+          schema:
+            type: string
+      responses:
+        "200":
+          description: Session metadata
+          content:
+            application/json:
+              schema:
+                $ref: "#/components/schemas/Session"
+        "404":
+          description: Session not found
+
+  /v1/session/delete:
+    post:
+      tags: [Session]
+      summary: Delete a session
+      requestBody:
+        required: true
+        content:
+          application/json:
+            schema:
+              type: object
+              required: [session_id]
+              properties:
+                session_id:
+                  type: string
+      responses:
+        "200":
+          description: Delete result
+        "404":
+          description: Session not found
+
+  /health:
+    get:
+      tags: [Health]
+      summary: Health check
+      responses:
+        "200":
+          description: Server is healthy
+          content:
+            application/json:
+              schema:
+                type: object
+                properties:
+                  status:
+                    type: string
+                    example: ok
+
+  /metrics:
+    get:
+      tags: [Health]
+      summary: Prometheus metrics
+      responses:
+        "200":
+          description: Prometheus-formatted metrics
+          content:
+            text/plain: {}
+
+components:
+  schemas:
+    DedupeChunk:
+      type: object
+      required: [text]
+      properties:
+        id:
+          type: string
+        text:
+          type: string
+        embedding:
+          type: array
+          items:
+            type: number
+            format: float
+        score:
+          type: number
+          format: float
+        cache_control:
+          type: string
+          description: Anthropic cache_control marker
+
+    DedupeRequest:
+      type: object
+      required: [chunks]
+      properties:
+        chunks:
+          type: array
+          items:
+            $ref: "#/components/schemas/DedupeChunk"
+        threshold:
+          type: number
+          format: double
+          description: Cosine distance threshold for clustering
+        lambda:
+          type: number
+          format: double
+          description: MMR lambda (0=diversity, 1=relevance)
+        target_k:
+          type: integer
+          description: Target number of output chunks
+        options:
+          type: object
+          properties:
+            preserve_cache_prefix:
+              type: boolean
+              description: Freeze chunks before the last cache_control marker
+
+    DedupeResponse:
+      type: object
+      properties:
+        chunks:
+          type: array
+          items:
+            type: object
+            properties:
+              id:
+                type: string
+              text:
+                type: string
+              score:
+                type: number
+                format: float
+              cluster_id:
+                type: integer
+              cache_control:
+                type: string
+        stats:
+          type: object
+          properties:
+            input_chunks:
+              type: integer
+            output_chunks:
+              type: integer
+            reduction_pct:
+              type: number
+            clusters:
+              type: integer
+            latency_ms:
+              type: number
+
+    PipelineRequest:
+      type: object
+      required: [chunks]
+      properties:
+        chunks:
+          type: array
+          items:
+            $ref: "#/components/schemas/DedupeChunk"
+        options:
+          type: object
+          properties:
+            dedup:
+              type: boolean
+            compress:
+              type: boolean
+            summarize:
+              type: boolean
+            cache:
+              type: boolean
+
+    PipelineResponse:
+      type: object
+      properties:
+        chunks:
+          type: array
+          items:
+            $ref: "#/components/schemas/DedupeChunk"
+        stats:
+          type: object
+          properties:
+            total_input_tokens:
+              type: integer
+            total_output_tokens:
+              type: integer
+            total_reduction:
+              type: number
+            total_latency_ms:
+              type: number
+            stages:
+              type: object
+              additionalProperties:
+                type: object
+                properties:
+                  enabled:
+                    type: boolean
+                  input_tokens:
+                    type: integer
+                  output_tokens:
+                    type: integer
+                  reduction:
+                    type: number
+                  latency_ms:
+                    type: number
+
+    BatchSubmitRequest:
+      type: object
+      required: [chunks]
+      properties:
+        chunks:
+          type: array
+          items:
+            $ref: "#/components/schemas/DedupeChunk"
+        options:
+          $ref: "#/components/schemas/PipelineRequest/properties/options"
+
+    BatchSubmitResponse:
+      type: object
+      properties:
+        job_id:
+          type: string
+        status:
+          type: string
+
+    BatchStatusResponse:
+      type: object
+      properties:
+        job_id:
+          type: string
+        status:
+          type: string
+          enum: [pending, running, completed, failed]
+        progress:
+          type: number
+        error:
+          type: string
+        created_at:
+          type: string
+        started_at:
+          type: string
+        completed_at:
+          type: string
+
+    BatchResultsResponse:
+      type: object
+      properties:
+        job_id:
+          type: string
+        status:
+          type: string
+        chunks:
+          type: array
+          items:
+            $ref: "#/components/schemas/DedupeChunk"
+        stats:
+          $ref: "#/components/schemas/PipelineResponse/properties/stats"
+
+    StoreRequest:
+      type: object
+      required: [entries]
+      properties:
+        session_id:
+          type: string
+        entries:
+          type: array
+          items:
+            type: object
+            required: [text]
+            properties:
+              text:
+                type: string
+              embedding:
+                type: array
+                items:
+                  type: number
+                  format: float
+              source:
+                type: string
+              tags:
+                type: array
+                items:
+                  type: string
+              metadata:
+                type: object
+                additionalProperties: true
+              expires_at:
+                type: string
+                format: date-time
+              sensitivity:
+                type: integer
+                description: "0=none, 1=pii, 2=internal, 3=credentials"
+              auto_classify:
+                type: boolean
+                description: Run pattern-based sensitivity classification
+
+    StoreResult:
+      type: object
+      properties:
+        stored:
+          type: integer
+        merged:
+          type: integer
+        deduplicated:
+          type: integer
+        total_memories:
+          type: integer
+        conflicts:
+          type: array
+          items:
+            $ref: "#/components/schemas/Conflict"
+
+    Conflict:
+      type: object
+      properties:
+        new_id:
+          type: string
+        new_text:
+          type: string
+        existing_id:
+          type: string
+        existing_text:
+          type: string
+        distance:
+          type: number
+          format: double
+
+    RecallRequest:
+      type: object
+      required: [query]
+      properties:
+        query:
+          type: string
+        query_embedding:
+          type: array
+          items:
+            type: number
+            format: float
+        tags:
+          type: array
+          items:
+            type: string
+        max_tokens:
+          type: integer
+        max_results:
+          type: integer
+        recency_weight:
+          type: number
+          format: double
+          description: Weight for recency vs relevance (0-1)
+        include_expired:
+          type: boolean
+        task_context:
+          type: string
+          description: Task description for source-matching boost
+        boost_tags:
+          type: array
+          items:
+            type: string
+          description: Tags that receive a relevance boost
+        min_relevance:
+          type: number
+          format: double
+          description: Filter out memories below this score (0-1)
+
+    RecallResult:
+      type: object
+      properties:
+        memories:
+          type: array
+          items:
+            type: object
+            properties:
+              id:
+                type: string
+              text:
+                type: string
+              source:
+                type: string
+              tags:
+                type: array
+                items:
+                  type: string
+              relevance:
+                type: number
+                format: double
+              decay_level:
+                type: integer
+              sensitivity:
+                type: integer
+              last_referenced:
+                type: string
+                format: date-time
+        stats:
+          type: object
+          properties:
+            candidates:
+              type: integer
+            deduplicated:
+              type: integer
+            returned:
+              type: integer
+            token_count:
+              type: integer
+        max_sensitivity:
+          type: integer
+          description: Highest sensitivity level across returned memories
+        sensitive_chunks:
+          type: array
+          items:
+            type: object
+            properties:
+              chunk_id:
+                type: string
+              sensitivity:
+                type: integer
+
+    ForgetRequest:
+      type: object
+      properties:
+        ids:
+          type: array
+          items:
+            type: string
+        tags:
+          type: array
+          items:
+            type: string
+        before:
+          type: string
+          format: date-time
+
+    ForgetResult:
+      type: object
+      properties:
+        forgotten:
+          type: integer
+
+    ExpireRequest:
+      type: object
+      required: [ids]
+      properties:
+        ids:
+          type: array
+          items:
+            type: string
+
+    ExpireResult:
+      type: object
+      properties:
+        expired:
+          type: integer
+
+    SupersedeRequest:
+      type: object
+      required: [old_id]
+      properties:
+        old_id:
+          type: string
+        new_id:
+          type: string
+
+    SupersedeResult:
+      type: object
+      properties:
+        superseded:
+          type: boolean
+
+    MemoryStats:
+      type: object
+      properties:
+        total_memories:
+          type: integer
+        expired_count:
+          type: integer
+        active_count:
+          type: integer
+        by_decay_level:
+          type: object
+          additionalProperties:
+            type: integer
+        by_source:
+          type: object
+          additionalProperties:
+            type: integer
+        oldest_memory:
+          type: string
+          format: date-time
+        newest_memory:
+          type: string
+          format: date-time
+
+    SessionCreateRequest:
+      type: object
+      required: [max_tokens]
+      properties:
+        session_id:
+          type: string
+          description: Auto-generated if empty
+        max_tokens:
+          type: integer
+        dedup_threshold:
+          type: number
+          format: double
+        preserve_recent:
+          type: integer
+          description: Always keep last N entries at full fidelity
+
+    Session:
+      type: object
+      properties:
+        id:
+          type: string
+        max_tokens:
+          type: integer
+        used_tokens:
+          type: integer
+        entry_count:
+          type: integer
+        created_at:
+          type: string
+          format: date-time
+
+    SessionPushRequest:
+      type: object
+      required: [session_id, entries]
+      properties:
+        session_id:
+          type: string
+        entries:
+          type: array
+          items:
+            type: object
+            required: [role, content]
+            properties:
+              role:
+                type: string
+              content:
+                type: string
+              embedding:
+                type: array
+                items:
+                  type: number
+                  format: float
+
+    SessionPushResult:
+      type: object
+      properties:
+        added:
+          type: integer
+        deduplicated:
+          type: integer
+        compressed:
+          type: integer
+        tokens_used:
+          type: integer
+        tokens_remaining:
+          type: integer
+
+    SessionContextRequest:
+      type: object
+      required: [session_id]
+      properties:
+        session_id:
+          type: string
+        max_tokens:
+          type: integer
+          description: "0 = return full window"
+        role:
+          type: string
+          description: Filter by role
+
+    SessionContextResult:
+      type: object
+      properties:
+        entries:
+          type: array
+          items:
+            type: object
+            properties:
+              role:
+                type: string
+              content:
+                type: string
+              compression_level:
+                type: string
+              tokens:
+                type: integer
+        total_tokens:
+          type: integer