# Workflow file captured from run for PR #262:
# "feat(graph): extend QueryBuilder and migrate production queries"

name: Deploy

on:
  push:
    branches: [main]
  workflow_dispatch:
    inputs:
      force_deploy_all:
        description: 'Force deploy all services'
        required: false
        # type: boolean inputs take a boolean default, not the string 'false'
        default: false
        type: boolean
      force_infra:
        description: 'Force infrastructure apply'
        required: false
        default: false
        type: boolean

# Serialize deploys per ref; never cancel an in-flight production deploy.
concurrency:
  group: deploy-${{ github.ref }}
  cancel-in-progress: false

env:
  DOMAIN: engram.rawcontext.com
  ROOT_DOMAIN: rawcontext.com # For Vercel DNS (must be root domain)

jobs:
  # Wait for CI to pass
  ci:
    uses: ./.github/workflows/ci.yml
    secrets: inherit

  # Detect which components changed
  changes:
    runs-on: ubuntu-latest
    needs: ci
    outputs:
      infra: ${{ steps.filter.outputs.infra }}
      api: ${{ steps.filter.outputs.api }}
      search: ${{ steps.filter.outputs.search }}
      tuner: ${{ steps.filter.outputs.tuner }}
      observatory: ${{ steps.filter.outputs.observatory }}
      console: ${{ steps.filter.outputs.console }}
      ingestion: ${{ steps.filter.outputs.ingestion }}
      memory: ${{ steps.filter.outputs.memory }}
      docker_compose: ${{ steps.filter.outputs.docker_compose }}
      caddy: ${{ steps.filter.outputs.caddy }}
      any_service: ${{ steps.any.outputs.result }}
    steps:
      - uses: actions/checkout@v4
      - uses: dorny/paths-filter@v3
        id: filter
        with:
          filters: |
            infra:
              - 'packages/infra/**'
            api:
              - 'apps/api/**'
              - 'packages/common/**'
              - 'packages/logger/**'
              - 'packages/events/**'
              - 'packages/storage/**'
              - 'packages/graph/**'
            search:
              - 'apps/search/**'
            tuner:
              - 'apps/tuner/**'
            observatory:
              - 'apps/observatory/**'
            console:
              - 'apps/console/**'
            ingestion:
              - 'apps/ingestion/**'
              - 'packages/common/**'
              - 'packages/logger/**'
              - 'packages/events/**'
              - 'packages/storage/**'
              - 'packages/parser/**'
            memory:
              - 'apps/memory/**'
              - 'packages/common/**'
              - 'packages/logger/**'
              - 'packages/events/**'
              - 'packages/storage/**'
              - 'packages/graph/**'
            docker_compose:
              - 'docker-compose.prod.yml'
            caddy:
              - 'configs/Caddyfile'
      - name: Check if any service changed
        id: any
        run: |
          if [[ "${{ steps.filter.outputs.api }}" == "true" ]] || \
             [[ "${{ steps.filter.outputs.search }}" == "true" ]] || \
             [[ "${{ steps.filter.outputs.tuner }}" == "true" ]] || \
             [[ "${{ steps.filter.outputs.observatory }}" == "true" ]] || \
             [[ "${{ steps.filter.outputs.console }}" == "true" ]] || \
             [[ "${{ steps.filter.outputs.ingestion }}" == "true" ]] || \
             [[ "${{ steps.filter.outputs.memory }}" == "true" ]] || \
             [[ "${{ steps.filter.outputs.docker_compose }}" == "true" ]] || \
             [[ "${{ inputs.force_deploy_all }}" == "true" ]]; then
            echo "result=true" >> $GITHUB_OUTPUT
          else
            echo "result=false" >> $GITHUB_OUTPUT
          fi

  # Infrastructure deployment with OpenTofu
  # Runs after deploy-api because it uses the API as the Terraform state backend
  infrastructure:
    runs-on: ubuntu-latest
    needs: [changes, sync, deploy-api]
    if: |
      always() &&
      needs.sync.result == 'success' &&
      (needs.deploy-api.result == 'success' || needs.deploy-api.result == 'skipped') &&
      (needs.changes.outputs.infra == 'true' || inputs.force_infra == true)
    environment: production
    permissions:
      contents: read
      pull-requests: write
    outputs:
      server_ip: ${{ steps.server.outputs.ip }}
    steps:
      - uses: actions/checkout@v4
      - name: Setup OpenTofu
        uses: opentofu/setup-opentofu@v1
        with:
          tofu_version: "1.8.0"
      - name: Verify Secrets
        # Pass the secret through env rather than interpolating it into the
        # command line, so it is never substituted into shell source text.
        env:
          TF_HTTP_PASSWORD: ${{ secrets.TF_HTTP_PASSWORD }}
        run: |
          if [ -z "$TF_HTTP_PASSWORD" ]; then
            echo "::error::TF_HTTP_PASSWORD secret is not set!"
            exit 1
          fi
          echo "TF_HTTP_PASSWORD is set (length: ${#TF_HTTP_PASSWORD})"
      - name: Wait for API State Backend
        run: |
          echo "Checking if API state backend is available..."
          # Quick check - if API responds, great. If not, continue anyway
          # (state backend will fail later if truly unavailable, but we can
          # use -lock=false and -migrate-state to bootstrap)
          for i in {1..6}; do
            if curl -sf --max-time 5 "https://api.engram.rawcontext.com/v1/health" > /dev/null 2>&1; then
              echo "API is healthy, proceeding with OpenTofu init"
              exit 0
            fi
            echo "Attempt $i/6: API not ready, waiting 10s..."
            sleep 10
          done
          echo "::warning::API state backend not available - will attempt to continue anyway"
      - name: OpenTofu Init
        working-directory: packages/infra
        run: |
          tofu init -reconfigure \
            -backend-config="username=tofu" \
            -backend-config="password=${{ secrets.TF_HTTP_PASSWORD }}"
      - name: OpenTofu Validate
        working-directory: packages/infra
        run: tofu validate
      - name: Import Existing Resources
        working-directory: packages/infra
        env:
          TF_HTTP_USERNAME: tofu
          TF_HTTP_PASSWORD: ${{ secrets.TF_HTTP_PASSWORD }}
          TF_VAR_hcloud_token: ${{ secrets.HCLOUD_TOKEN }}
          TF_VAR_vercel_api_token: ${{ secrets.VERCEL_API_TOKEN }}
          TF_VAR_domain: ${{ env.ROOT_DOMAIN }}
          TF_VAR_ssh_public_key: ${{ secrets.HETZNER_SSH_PUBLIC_KEY }}
          HCLOUD_TOKEN: ${{ secrets.HCLOUD_TOKEN }}
          VERCEL_API_TOKEN: ${{ secrets.VERCEL_API_TOKEN }}
        run: |
          # Install hcloud CLI
          curl -sL https://github.com/hetznercloud/cli/releases/download/v1.49.0/hcloud-linux-amd64.tar.gz | tar xz
          chmod +x hcloud
          # Check if all resources exist in state already
          STATE_COUNT=$(tofu state list 2>/dev/null | wc -l || echo "0")
          if [ "$STATE_COUNT" -ge 8 ]; then
            echo "All resources appear to be in state ($STATE_COUNT items), skipping import"
            exit 0
          fi
          echo "Found $STATE_COUNT items in state, checking for missing resources..."
          # Import Hetzner resources
          echo "=== Importing Hetzner resources ==="
          # Get SSH key ID
          SSH_KEY_ID=$(./hcloud ssh-key list -o noheader | grep "engram-key" | awk '{print $1}' || true)
          if [ -n "$SSH_KEY_ID" ]; then
            if ! tofu state list 2>/dev/null | grep -q "hcloud_ssh_key.engram"; then
              echo "Importing SSH key (ID: $SSH_KEY_ID)..."
              tofu import -lock=false hcloud_ssh_key.engram "$SSH_KEY_ID" || echo "SSH key import failed or already exists"
            fi
          fi
          # Get server ID
          SERVER_ID=$(./hcloud server list -o noheader | grep "engram" | awk '{print $1}' || true)
          if [ -n "$SERVER_ID" ]; then
            if ! tofu state list 2>/dev/null | grep -q "hcloud_server.engram"; then
              echo "Importing server (ID: $SERVER_ID)..."
              tofu import -lock=false hcloud_server.engram "$SERVER_ID" || echo "Server import failed or already exists"
            fi
          fi
          # Get firewall ID
          FIREWALL_ID=$(./hcloud firewall list -o noheader | grep "engram-firewall" | awk '{print $1}' || true)
          if [ -n "$FIREWALL_ID" ]; then
            if ! tofu state list 2>/dev/null | grep -q "hcloud_firewall.engram"; then
              echo "Importing firewall (ID: $FIREWALL_ID)..."
              tofu import -lock=false hcloud_firewall.engram "$FIREWALL_ID" || echo "Firewall import failed or already exists"
            fi
            # Import firewall attachment if both firewall and server exist
            if [ -n "$SERVER_ID" ]; then
              if ! tofu state list 2>/dev/null | grep -q "hcloud_firewall_attachment.engram"; then
                echo "Importing firewall attachment..."
                tofu import -lock=false hcloud_firewall_attachment.engram "$FIREWALL_ID" || echo "Firewall attachment import failed or already exists"
              fi
            fi
          fi
          # Import Vercel DNS records
          echo "=== Importing Vercel DNS records ==="
          DOMAIN="rawcontext.com"
          # Clear any corrupted DNS record states (domain mismatch bug)
          for record in apex api observatory console; do
            if tofu state list 2>/dev/null | grep -q "vercel_dns_record.$record"; then
              echo "Removing potentially corrupted state for vercel_dns_record.$record..."
              tofu state rm -lock=false "vercel_dns_record.$record" 2>/dev/null || true
            fi
          done
          # Fetch all DNS records from Vercel
          DNS_RECORDS=$(curl -s -H "Authorization: Bearer $VERCEL_API_TOKEN" \
            "https://api.vercel.com/v4/domains/$DOMAIN/records" | jq -r '.records // []')
          # Import apex record (engram.rawcontext.com)
          if ! tofu state list 2>/dev/null | grep -q "vercel_dns_record.apex"; then
            APEX_ID=$(echo "$DNS_RECORDS" | jq -r '.[] | select(.name == "engram" and .type == "A") | .id' | head -1)
            if [ -n "$APEX_ID" ] && [ "$APEX_ID" != "null" ]; then
              echo "Importing apex DNS record (ID: $APEX_ID)..."
              tofu import -lock=false vercel_dns_record.apex "$APEX_ID" || echo "Apex DNS import failed"
            fi
          fi
          # Import api record (api.engram.rawcontext.com)
          if ! tofu state list 2>/dev/null | grep -q "vercel_dns_record.api"; then
            API_ID=$(echo "$DNS_RECORDS" | jq -r '.[] | select(.name == "api.engram" and .type == "A") | .id' | head -1)
            if [ -n "$API_ID" ] && [ "$API_ID" != "null" ]; then
              echo "Importing api DNS record (ID: $API_ID)..."
              tofu import -lock=false vercel_dns_record.api "$API_ID" || echo "API DNS import failed"
            fi
          fi
          # Import observatory record (observatory.engram.rawcontext.com)
          if ! tofu state list 2>/dev/null | grep -q "vercel_dns_record.observatory"; then
            OBS_ID=$(echo "$DNS_RECORDS" | jq -r '.[] | select(.name == "observatory.engram" and .type == "A") | .id' | head -1)
            if [ -n "$OBS_ID" ] && [ "$OBS_ID" != "null" ]; then
              echo "Importing observatory DNS record (ID: $OBS_ID)..."
              tofu import -lock=false vercel_dns_record.observatory "$OBS_ID" || echo "Observatory DNS import failed"
            fi
          fi
          # Import console record (console.engram.rawcontext.com)
          if ! tofu state list 2>/dev/null | grep -q "vercel_dns_record.console"; then
            CONSOLE_ID=$(echo "$DNS_RECORDS" | jq -r '.[] | select(.name == "console.engram" and .type == "A") | .id' | head -1)
            if [ -n "$CONSOLE_ID" ] && [ "$CONSOLE_ID" != "null" ]; then
              echo "Importing console DNS record (ID: $CONSOLE_ID)..."
              tofu import -lock=false vercel_dns_record.console "$CONSOLE_ID" || echo "Console DNS import failed"
            fi
          fi
          echo "Import complete. Checking state..."
          tofu state list || true
      - name: OpenTofu Plan
        working-directory: packages/infra
        env:
          TF_HTTP_USERNAME: tofu
          TF_HTTP_PASSWORD: ${{ secrets.TF_HTTP_PASSWORD }}
          TF_VAR_hcloud_token: ${{ secrets.HCLOUD_TOKEN }}
          TF_VAR_vercel_api_token: ${{ secrets.VERCEL_API_TOKEN }}
          TF_VAR_domain: ${{ env.ROOT_DOMAIN }}
          TF_VAR_ssh_public_key: ${{ secrets.HETZNER_SSH_PUBLIC_KEY }}
          TF_VAR_engram_api_client_secret: ${{ secrets.ENGRAM_API_CLIENT_SECRET }}
          TF_VAR_engram_search_client_secret: ${{ secrets.ENGRAM_SEARCH_CLIENT_SECRET }}
          TF_VAR_engram_tuner_client_secret: ${{ secrets.ENGRAM_TUNER_CLIENT_SECRET }}
          TF_VAR_engram_ingestion_client_secret: ${{ secrets.ENGRAM_INGESTION_CLIENT_SECRET }}
          TF_VAR_engram_memory_client_secret: ${{ secrets.ENGRAM_MEMORY_CLIENT_SECRET }}
          TF_VAR_engram_console_client_secret: ${{ secrets.ENGRAM_CONSOLE_CLIENT_SECRET }}
        run: |
          # Disable state locking - workflow concurrency already prevents parallel runs
          tofu plan -out=tfplan -no-color -lock=false
      - name: OpenTofu Apply
        working-directory: packages/infra
        env:
          TF_HTTP_USERNAME: tofu
          TF_HTTP_PASSWORD: ${{ secrets.TF_HTTP_PASSWORD }}
          TF_VAR_hcloud_token: ${{ secrets.HCLOUD_TOKEN }}
          TF_VAR_vercel_api_token: ${{ secrets.VERCEL_API_TOKEN }}
          TF_VAR_domain: ${{ env.ROOT_DOMAIN }}
          TF_VAR_ssh_public_key: ${{ secrets.HETZNER_SSH_PUBLIC_KEY }}
          TF_VAR_engram_api_client_secret: ${{ secrets.ENGRAM_API_CLIENT_SECRET }}
          TF_VAR_engram_search_client_secret: ${{ secrets.ENGRAM_SEARCH_CLIENT_SECRET }}
          TF_VAR_engram_tuner_client_secret: ${{ secrets.ENGRAM_TUNER_CLIENT_SECRET }}
          TF_VAR_engram_ingestion_client_secret: ${{ secrets.ENGRAM_INGESTION_CLIENT_SECRET }}
          TF_VAR_engram_memory_client_secret: ${{ secrets.ENGRAM_MEMORY_CLIENT_SECRET }}
          TF_VAR_engram_console_client_secret: ${{ secrets.ENGRAM_CONSOLE_CLIENT_SECRET }}
        run: |
          tofu apply -auto-approve -lock=false tfplan
      - name: Get Server IP
        id: server
        working-directory: packages/infra
        run: |
          echo "ip=$(tofu output -raw server_ip)" >> $GITHUB_OUTPUT

  # Sync files to server (runs if any service changed)
  sync:
    runs-on: ubuntu-latest
    needs: [changes]
    if: |
      always() &&
      needs.changes.result == 'success' &&
      needs.changes.outputs.any_service == 'true'
    environment: production
    steps:
      - uses: actions/checkout@v4
      - name: Setup SSH
        uses: webfactory/ssh-agent@v0.9.0
        with:
          ssh-private-key: ${{ secrets.HETZNER_SSH_PRIVATE_KEY }}
      - name: Add host to known_hosts
        run: |
          for i in {1..3}; do
            if ssh-keyscan -T 30 -H ${{ env.DOMAIN }} >> ~/.ssh/known_hosts 2>/dev/null; then
              echo "SSH key scan successful"
              exit 0
            fi
            echo "Attempt $i failed, retrying in 5s..."
            sleep 5
          done
          echo "::error::Failed to scan SSH host after 3 attempts"
          exit 1
      - name: Cleanup conflicting paths
        run: |
          ssh -o ServerAliveInterval=30 -o ServerAliveCountMax=20 engram@${{ env.DOMAIN }} << 'EOF'
          # Remove conflicting paths and fix permissions
          sudo rm -rf /opt/engram/configs/Caddyfile 2>/dev/null || true
          # Ensure configs directory exists with correct ownership
          sudo mkdir -p /opt/engram/configs
          sudo chown -R engram:engram /opt/engram/configs
          EOF
      - name: Sync files to server
        run: |
          rsync -avz --delete --no-group --no-owner \
            --exclude '.git' \
            --exclude 'node_modules' \
            --exclude '.venv' \
            --exclude '__pycache__' \
            --exclude '*.pyc' \
            --exclude 'dist' \
            --exclude 'data' \
            --exclude '.turbo' \
            --exclude '.next' \
            --exclude '.terraform' \
            --exclude 'terraform.tfstate*' \
            . engram@${{ env.DOMAIN }}:/opt/engram/
      - name: Create .env file on server
        run: |
          ssh -o ServerAliveInterval=30 -o ServerAliveCountMax=20 engram@${{ env.DOMAIN }} "cat > /opt/engram/.env << 'ENVEOF'
          POSTGRES_USER=${{ secrets.POSTGRES_USER }}
          POSTGRES_PASSWORD=${{ secrets.POSTGRES_PASSWORD }}
          POSTGRES_DB=${{ secrets.POSTGRES_DB }}
          HF_API_TOKEN=${{ secrets.HF_API_TOKEN }}
          BETTER_AUTH_SECRET=${{ secrets.BETTER_AUTH_SECRET }}
          GOOGLE_CLIENT_ID=${{ secrets.GOOGLE_CLIENT_ID }}
          GOOGLE_CLIENT_SECRET=${{ secrets.GOOGLE_CLIENT_SECRET }}
          ENGRAM_API_CLIENT_SECRET=${{ secrets.ENGRAM_API_CLIENT_SECRET }}
          ENGRAM_SEARCH_CLIENT_SECRET=${{ secrets.ENGRAM_SEARCH_CLIENT_SECRET }}
          ENGRAM_TUNER_CLIENT_SECRET=${{ secrets.ENGRAM_TUNER_CLIENT_SECRET }}
          ENGRAM_INGESTION_CLIENT_SECRET=${{ secrets.ENGRAM_INGESTION_CLIENT_SECRET }}
          ENGRAM_MEMORY_CLIENT_SECRET=${{ secrets.ENGRAM_MEMORY_CLIENT_SECRET }}
          ENGRAM_CONSOLE_CLIENT_SECRET=${{ secrets.ENGRAM_CONSOLE_CLIENT_SECRET }}
          ENVEOF"
      - name: Fix postgres permissions and run migrations
        run: |
          ssh -o ServerAliveInterval=30 -o ServerAliveCountMax=20 engram@${{ env.DOMAIN }} << 'EOF'
          cd /opt/engram
          # Ensure postgres data directory has correct permissions (UID 70 for alpine)
          echo "Fixing postgres permissions..."
          sudo mkdir -p /var/lib/engram/postgres
          sudo chown -R 70:70 /var/lib/engram/postgres
          # Restart postgres if it's unhealthy due to permission issues
          if ! docker compose -f docker-compose.prod.yml exec -T postgres pg_isready -U ${POSTGRES_USER:-engram} 2>/dev/null; then
            echo "Postgres not ready, restarting..."
            docker compose -f docker-compose.prod.yml up -d postgres
            sleep 5
          fi
          # Wait for postgres to be ready
          for i in {1..30}; do
            if docker compose -f docker-compose.prod.yml exec -T postgres pg_isready -U ${POSTGRES_USER:-engram} 2>/dev/null; then
              echo "Postgres is ready"
              break
            fi
            echo "Waiting for postgres... ($i/30)"
            sleep 2
          done
          # Run auth migrations on the engram database
          cat /opt/engram/scripts/migrate-auth.sql | docker compose -f docker-compose.prod.yml exec -T postgres psql -U ${POSTGRES_USER:-engram} -d engram || echo "Migration may have already been applied"
          EOF

  # Deploy API service
  deploy-api:
    runs-on: ubuntu-latest
    needs: [changes, sync]
    if: |
      always() &&
      needs.sync.result == 'success' &&
      (needs.changes.outputs.api == 'true' || inputs.force_deploy_all == true)
    environment: production
    steps:
      - name: Setup SSH
        uses: webfactory/ssh-agent@v0.9.0
        with:
          ssh-private-key: ${{ secrets.HETZNER_SSH_PRIVATE_KEY }}
      - name: Add host to known_hosts
        run: |
          for i in {1..3}; do
            if ssh-keyscan -T 30 -H ${{ env.DOMAIN }} >> ~/.ssh/known_hosts 2>/dev/null; then
              echo "SSH key scan successful"
              exit 0
            fi
            echo "Attempt $i failed, retrying in 5s..."
            sleep 5
          done
          echo "::error::Failed to scan SSH host after 3 attempts"
          exit 1
      - name: Deploy API
        run: |
          ssh -o ServerAliveInterval=30 -o ServerAliveCountMax=20 engram@${{ env.DOMAIN }} << 'EOF'
          cd /opt/engram
          echo "Building and deploying API..."
          docker compose -f docker-compose.prod.yml build api
          docker compose -f docker-compose.prod.yml up -d --force-recreate api
          echo "Waiting for API health check..."
          sleep 10
          if ! docker compose -f docker-compose.prod.yml ps api | grep -q "(healthy)"; then
            echo "=== API Container Logs ==="
            docker compose -f docker-compose.prod.yml logs --tail=100 api
            echo "==========================="
          fi
          docker compose -f docker-compose.prod.yml ps api
          EOF

  # Deploy Search service
  deploy-search:
    runs-on: ubuntu-latest
    needs: [changes, sync]
    if: |
      always() &&
      needs.sync.result == 'success' &&
      (needs.changes.outputs.search == 'true' || inputs.force_deploy_all == true)
    environment: production
    steps:
      - name: Setup SSH
        uses: webfactory/ssh-agent@v0.9.0
        with:
          ssh-private-key: ${{ secrets.HETZNER_SSH_PRIVATE_KEY }}
      - name: Add host to known_hosts
        run: |
          for i in {1..3}; do
            if ssh-keyscan -T 30 -H ${{ env.DOMAIN }} >> ~/.ssh/known_hosts 2>/dev/null; then
              echo "SSH key scan successful"
              exit 0
            fi
            echo "Attempt $i failed, retrying in 5s..."
            sleep 5
          done
          echo "::error::Failed to scan SSH host after 3 attempts"
          exit 1
      - name: Deploy Search
        run: |
          ssh -o ServerAliveInterval=30 -o ServerAliveCountMax=20 engram@${{ env.DOMAIN }} << 'EOF'
          cd /opt/engram
          echo "Building and deploying Search..."
          docker compose -f docker-compose.prod.yml build --no-cache search
          docker compose -f docker-compose.prod.yml up -d --force-recreate search
          echo "Waiting for Search health check..."
          sleep 5
          docker compose -f docker-compose.prod.yml ps search
          EOF

  # Deploy Tuner service
  deploy-tuner:
    runs-on: ubuntu-latest
    needs: [changes, sync]
    if: |
      always() &&
      needs.sync.result == 'success' &&
      (needs.changes.outputs.tuner == 'true' || inputs.force_deploy_all == true)
    environment: production
    steps:
      - name: Setup SSH
        uses: webfactory/ssh-agent@v0.9.0
        with:
          ssh-private-key: ${{ secrets.HETZNER_SSH_PRIVATE_KEY }}
      - name: Add host to known_hosts
        run: |
          for i in {1..3}; do
            if ssh-keyscan -T 30 -H ${{ env.DOMAIN }} >> ~/.ssh/known_hosts 2>/dev/null; then
              echo "SSH key scan successful"
              exit 0
            fi
            echo "Attempt $i failed, retrying in 5s..."
            sleep 5
          done
          echo "::error::Failed to scan SSH host after 3 attempts"
          exit 1
      - name: Deploy Tuner
        run: |
          ssh -o ServerAliveInterval=30 -o ServerAliveCountMax=20 engram@${{ env.DOMAIN }} << 'EOF'
          cd /opt/engram
          echo "Building and deploying Tuner..."
          docker compose -f docker-compose.prod.yml build --no-cache tuner
          docker compose -f docker-compose.prod.yml up -d --force-recreate tuner
          echo "Waiting for Tuner health check..."
          sleep 10
          if ! docker compose -f docker-compose.prod.yml ps tuner | grep -q "(healthy)"; then
            echo "=== Tuner Container Logs ==="
            docker compose -f docker-compose.prod.yml logs --tail=100 tuner
            echo "==========================="
          fi
          docker compose -f docker-compose.prod.yml ps tuner
          EOF

  # Deploy Observatory service
  deploy-observatory:
    runs-on: ubuntu-latest
    needs: [changes, sync]
    if: |
      always() &&
      needs.sync.result == 'success' &&
      (needs.changes.outputs.observatory == 'true' || inputs.force_deploy_all == true)
    environment: production
    steps:
      - name: Setup SSH
        uses: webfactory/ssh-agent@v0.9.0
        with:
          ssh-private-key: ${{ secrets.HETZNER_SSH_PRIVATE_KEY }}
      - name: Add host to known_hosts
        run: |
          for i in {1..3}; do
            if ssh-keyscan -T 30 -H ${{ env.DOMAIN }} >> ~/.ssh/known_hosts 2>/dev/null; then
              echo "SSH key scan successful"
              exit 0
            fi
            echo "Attempt $i failed, retrying in 5s..."
            sleep 5
          done
          echo "::error::Failed to scan SSH host after 3 attempts"
          exit 1
      - name: Deploy Observatory
        run: |
          ssh -o ServerAliveInterval=30 -o ServerAliveCountMax=20 engram@${{ env.DOMAIN }} << 'EOF'
          cd /opt/engram
          echo "Building and deploying Observatory..."
          docker compose -f docker-compose.prod.yml build observatory
          docker compose -f docker-compose.prod.yml up -d --force-recreate observatory
          echo "Waiting for Observatory health check..."
          sleep 5
          docker compose -f docker-compose.prod.yml ps observatory
          EOF

  # Deploy Console service
  deploy-console:
    runs-on: ubuntu-latest
    needs: [changes, sync]
    if: |
      always() &&
      needs.sync.result == 'success' &&
      (needs.changes.outputs.console == 'true' || inputs.force_deploy_all == true)
    environment: production
    steps:
      - name: Setup SSH
        uses: webfactory/ssh-agent@v0.9.0
        with:
          ssh-private-key: ${{ secrets.HETZNER_SSH_PRIVATE_KEY }}
      - name: Add host to known_hosts
        run: |
          for i in {1..3}; do
            if ssh-keyscan -T 30 -H ${{ env.DOMAIN }} >> ~/.ssh/known_hosts 2>/dev/null; then
              echo "SSH key scan successful"
              exit 0
            fi
            echo "Attempt $i failed, retrying in 5s..."
            sleep 5
          done
          echo "::error::Failed to scan SSH host after 3 attempts"
          exit 1
      - name: Deploy Console
        run: |
          ssh -o ServerAliveInterval=30 -o ServerAliveCountMax=20 engram@${{ env.DOMAIN }} << 'EOF'
          cd /opt/engram
          echo "Building and deploying Console..."
          docker compose -f docker-compose.prod.yml build console
          docker compose -f docker-compose.prod.yml up -d --force-recreate console
          echo "Waiting for Console health check..."
          sleep 5
          docker compose -f docker-compose.prod.yml ps console
          EOF

  # Deploy Ingestion service
  deploy-ingestion:
    runs-on: ubuntu-latest
    needs: [changes, sync]
    if: |
      always() &&
      needs.sync.result == 'success' &&
      (needs.changes.outputs.ingestion == 'true' || inputs.force_deploy_all == true)
    environment: production
    steps:
      - name: Setup SSH
        uses: webfactory/ssh-agent@v0.9.0
        with:
          ssh-private-key: ${{ secrets.HETZNER_SSH_PRIVATE_KEY }}
      - name: Add host to known_hosts
        run: |
          for i in {1..3}; do
            if ssh-keyscan -T 30 -H ${{ env.DOMAIN }} >> ~/.ssh/known_hosts 2>/dev/null; then
              echo "SSH key scan successful"
              exit 0
            fi
            echo "Attempt $i failed, retrying in 5s..."
            sleep 5
          done
          echo "::error::Failed to scan SSH host after 3 attempts"
          exit 1
      - name: Deploy Ingestion
        run: |
          ssh -o ServerAliveInterval=30 -o ServerAliveCountMax=20 engram@${{ env.DOMAIN }} << 'EOF'
          cd /opt/engram
          echo "Building and deploying Ingestion..."
          docker compose -f docker-compose.prod.yml build ingestion
          docker compose -f docker-compose.prod.yml up -d --force-recreate ingestion
          echo "Waiting for Ingestion health check..."
          sleep 5
          docker compose -f docker-compose.prod.yml ps ingestion
          EOF

  # Deploy Memory service
  deploy-memory:
    runs-on: ubuntu-latest
    needs: [changes, sync]
    if: |
      always() &&
      needs.sync.result == 'success' &&
      (needs.changes.outputs.memory == 'true' || inputs.force_deploy_all == true)
    environment: production
    steps:
      - name: Setup SSH
        uses: webfactory/ssh-agent@v0.9.0
        with:
          ssh-private-key: ${{ secrets.HETZNER_SSH_PRIVATE_KEY }}
      - name: Add host to known_hosts
        run: |
          for i in {1..3}; do
            if ssh-keyscan -T 30 -H ${{ env.DOMAIN }} >> ~/.ssh/known_hosts 2>/dev/null; then
              echo "SSH key scan successful"
              exit 0
            fi
            echo "Attempt $i failed, retrying in 5s..."
            sleep 5
          done
          echo "::error::Failed to scan SSH host after 3 attempts"
          exit 1
      - name: Deploy Memory
        run: |
          ssh -o ServerAliveInterval=30 -o ServerAliveCountMax=20 engram@${{ env.DOMAIN }} << 'EOF'
          cd /opt/engram
          echo "Building and deploying Memory..."
          docker compose -f docker-compose.prod.yml build memory
          docker compose -f docker-compose.prod.yml up -d --force-recreate memory
          echo "Waiting for Memory startup..."
          sleep 5
          docker compose -f docker-compose.prod.yml ps memory
          EOF

  # Deploy databases/infrastructure services if docker-compose changed
  deploy-infrastructure-services:
    runs-on: ubuntu-latest
    needs: [changes, sync]
    if: |
      always() &&
      needs.sync.result == 'success' &&
      (needs.changes.outputs.docker_compose == 'true' || inputs.force_deploy_all == true)
    environment: production
    steps:
      - name: Setup SSH
        uses: webfactory/ssh-agent@v0.9.0
        with:
          ssh-private-key: ${{ secrets.HETZNER_SSH_PRIVATE_KEY }}
      - name: Add host to known_hosts
        run: |
          for i in {1..3}; do
            if ssh-keyscan -T 30 -H ${{ env.DOMAIN }} >> ~/.ssh/known_hosts 2>/dev/null; then
              echo "SSH key scan successful"
              exit 0
            fi
            echo "Attempt $i failed, retrying in 5s..."
            sleep 5
          done
          echo "::error::Failed to scan SSH host after 3 attempts"
          exit 1
      - name: Deploy Infrastructure Services
        run: |
          ssh -o ServerAliveInterval=30 -o ServerAliveCountMax=20 engram@${{ env.DOMAIN }} << 'EOF'
          cd /opt/engram
          # Create data directories at /var/lib/engram (outside Docker build context)
          echo "Creating data directories at /var/lib/engram..."
          sudo mkdir -p /var/lib/engram/{qdrant,falkordb,postgres,nats,caddy/data,caddy/config}
          # Set ownership - postgres:17-alpine uses UID 70, others can use engram
          sudo chown -R engram:engram /var/lib/engram/qdrant /var/lib/engram/falkordb /var/lib/engram/nats /var/lib/engram/caddy
          sudo chown -R 70:70 /var/lib/engram/postgres
          # Migrate existing data if present (one-time migration)
          if [ -d "/opt/engram/data" ] && [ "$(ls -A /opt/engram/data 2>/dev/null)" ]; then
            echo "Migrating existing data from /opt/engram/data..."
            for dir in qdrant falkordb nats; do
              if [ -d "/opt/engram/data/$dir" ] && [ "$(ls -A /opt/engram/data/$dir 2>/dev/null)" ]; then
                echo " Migrating $dir..."
                sudo rsync -a /opt/engram/data/$dir/ /var/lib/engram/$dir/ || true
              fi
            done
            # PostgreSQL requires special handling - migration would need DB stopped first
            # We'll let it re-initialize for now if migration is needed
            if [ -d "/opt/engram/data/caddy" ]; then
              echo " Migrating caddy..."
              sudo rsync -a /opt/engram/data/caddy/ /var/lib/engram/caddy/ || true
            fi
          fi
          echo "Pulling infrastructure images..."
          docker compose -f docker-compose.prod.yml pull nats qdrant falkordb postgres
          echo "Starting infrastructure services..."
          docker compose -f docker-compose.prod.yml up -d nats qdrant falkordb postgres
          echo "Waiting for services to be healthy..."
          sleep 15
          docker compose -f docker-compose.prod.yml ps
          EOF

  # Start/reload Caddy reverse proxy (only full restart if Caddyfile changed)
  deploy-caddy:
    runs-on: ubuntu-latest
    needs:
      - changes
      - sync
      - deploy-api
      - deploy-search
      - deploy-tuner
      - deploy-observatory
      - deploy-console
    if: |
      always() &&
      needs.sync.result == 'success'
    environment: production
    steps:
      - name: Setup SSH
        uses: webfactory/ssh-agent@v0.9.0
        with:
          ssh-private-key: ${{ secrets.HETZNER_SSH_PRIVATE_KEY }}
      - name: Add host to known_hosts
        run: |
          for i in {1..3}; do
            if ssh-keyscan -T 30 -H ${{ env.DOMAIN }} >> ~/.ssh/known_hosts 2>/dev/null; then
              echo "SSH key scan successful"
              exit 0
            fi
            echo "Attempt $i failed, retrying in 5s..."
            sleep 5
          done
          echo "::error::Failed to scan SSH host after 3 attempts"
          exit 1
      - name: Reload or Start Caddy
        env:
          CADDY_CHANGED: ${{ needs.changes.outputs.caddy }}
          DOCKER_COMPOSE_CHANGED: ${{ needs.changes.outputs.docker_compose }}
        run: |
          # Unquoted heredoc: $CADDY_CHANGED/$DOCKER_COMPOSE_CHANGED expand
          # locally; \$-escaped variables are evaluated on the remote host.
          ssh -o ServerAliveInterval=30 -o ServerAliveCountMax=20 engram@${{ env.DOMAIN }} << EOF
          cd /opt/engram
          # Check if Caddy is already running. NOTE: 'grep -c' prints 0 and
          # exits non-zero on no match, so the fallback must not emit a second
          # "0" (a "|| echo 0" here produced a two-line value that never
          # matched the == "0" test below, leaving a stopped Caddy unstarted).
          CADDY_RUNNING=\$(docker compose -f docker-compose.prod.yml ps caddy --format json 2>/dev/null | grep -c '"running"' || true)
          if [[ "$CADDY_CHANGED" == "true" ]] || [[ "$DOCKER_COMPOSE_CHANGED" == "true" ]]; then
            echo "Caddyfile or docker-compose changed - full Caddy restart..."
            # Only restart Caddy, not all services
            docker compose -f docker-compose.prod.yml up -d --force-recreate caddy
            echo "Waiting for Caddy to be ready..."
            sleep 10
          elif [[ "\${CADDY_RUNNING:-0}" == "0" ]]; then
            echo "Caddy not running - starting it..."
            docker compose -f docker-compose.prod.yml up -d caddy
            echo "Waiting for Caddy to be ready..."
            sleep 10
          else
            echo "Caddy already running and no config changes - skipping restart"
          fi
          docker compose -f docker-compose.prod.yml ps
          EOF

  # Cleanup old Docker images
  cleanup:
    runs-on: ubuntu-latest
    needs: [deploy-caddy, sync]
    if: |
      always() &&
      needs.deploy-caddy.result == 'success'
    environment: production
    steps:
      - name: Setup SSH
        uses: webfactory/ssh-agent@v0.9.0
        with:
          ssh-private-key: ${{ secrets.HETZNER_SSH_PRIVATE_KEY }}
      - name: Add host to known_hosts
        run: |
          for i in {1..3}; do
            if ssh-keyscan -T 30 -H ${{ env.DOMAIN }} >> ~/.ssh/known_hosts 2>/dev/null; then
              echo "SSH key scan successful"
              exit 0
            fi
            echo "Attempt $i failed, retrying in 5s..."
            sleep 5
          done
          echo "::error::Failed to scan SSH host after 3 attempts"
          exit 1
      - name: Prune old Docker images
        run: |
          ssh -o ServerAliveInterval=30 -o ServerAliveCountMax=20 engram@${{ env.DOMAIN }} << 'EOF'
          echo "Pruning unused Docker images..."
          docker image prune -af
          echo ""
          echo "Pruning Docker build cache..."
          docker builder prune -af
          echo ""
          echo "Pruning unused Docker volumes..."
          docker volume prune -f
          echo ""
          echo "Pruning unused Docker networks..."
          docker network prune -f
          echo ""
          echo "Disk usage after cleanup:"
          docker system df
          df -h /
          EOF

  # Summary job
  summary:
    runs-on: ubuntu-latest
    needs:
      - changes
      - infrastructure
      - sync
      - deploy-api
      - deploy-search
      - deploy-tuner
      - deploy-observatory
      - deploy-console
      - deploy-ingestion
      - deploy-memory
      - deploy-infrastructure-services
      - deploy-caddy
      - cleanup
    if: always()
    steps:
      - name: Deployment Summary
        run: |
          echo "## Deployment Summary" >> $GITHUB_STEP_SUMMARY
          echo "" >> $GITHUB_STEP_SUMMARY
          echo "| Component | Changed | Status |" >> $GITHUB_STEP_SUMMARY
          echo "|-----------|---------|--------|" >> $GITHUB_STEP_SUMMARY
          echo "| Infrastructure | ${{ needs.changes.outputs.infra }} | ${{ needs.infrastructure.result }} |" >> $GITHUB_STEP_SUMMARY
          echo "| API | ${{ needs.changes.outputs.api }} | ${{ needs.deploy-api.result }} |" >> $GITHUB_STEP_SUMMARY
          echo "| Search | ${{ needs.changes.outputs.search }} | ${{ needs.deploy-search.result }} |" >> $GITHUB_STEP_SUMMARY
          echo "| Tuner | ${{ needs.changes.outputs.tuner }} | ${{ needs.deploy-tuner.result }} |" >> $GITHUB_STEP_SUMMARY
          echo "| Observatory | ${{ needs.changes.outputs.observatory }} | ${{ needs.deploy-observatory.result }} |" >> $GITHUB_STEP_SUMMARY
          echo "| Console | ${{ needs.changes.outputs.console }} | ${{ needs.deploy-console.result }} |" >> $GITHUB_STEP_SUMMARY
          echo "| Ingestion | ${{ needs.changes.outputs.ingestion }} | ${{ needs.deploy-ingestion.result }} |" >> $GITHUB_STEP_SUMMARY
          echo "| Memory | ${{ needs.changes.outputs.memory }} | ${{ needs.deploy-memory.result }} |" >> $GITHUB_STEP_SUMMARY
          echo "| Docker Compose | ${{ needs.changes.outputs.docker_compose }} | ${{ needs.deploy-infrastructure-services.result }} |" >> $GITHUB_STEP_SUMMARY
          echo "| Caddy | ${{ needs.changes.outputs.caddy }} | ${{ needs.deploy-caddy.result }} |" >> $GITHUB_STEP_SUMMARY
          echo "" >> $GITHUB_STEP_SUMMARY
          echo "### Service URLs" >> $GITHUB_STEP_SUMMARY
          echo "- API: https://api.engram.rawcontext.com/v1" >> $GITHUB_STEP_SUMMARY
          echo "- Search: https://api.engram.rawcontext.com/v1/search" >> $GITHUB_STEP_SUMMARY
          echo "- Tuner: https://api.engram.rawcontext.com/v1/tuner" >> $GITHUB_STEP_SUMMARY
          echo "- Observatory: https://observatory.engram.rawcontext.com" >> $GITHUB_STEP_SUMMARY
          echo "- Console: https://console.engram.rawcontext.com" >> $GITHUB_STEP_SUMMARY