diff --git a/archive/2026-03-observability-teardown/ansible/play-k3s--clickhouse.yml b/.archive/2026-03-observability-teardown/ansible/play-k3s--clickhouse.yml similarity index 100% rename from archive/2026-03-observability-teardown/ansible/play-k3s--clickhouse.yml rename to .archive/2026-03-observability-teardown/ansible/play-k3s--clickhouse.yml diff --git a/archive/2026-03-observability-teardown/ansible/play-o11y--vector.yml b/.archive/2026-03-observability-teardown/ansible/play-o11y--vector.yml similarity index 100% rename from archive/2026-03-observability-teardown/ansible/play-o11y--vector.yml rename to .archive/2026-03-observability-teardown/ansible/play-o11y--vector.yml diff --git a/archive/2026-03-observability-teardown/ansible/vector-template/vector.yaml.j2 b/.archive/2026-03-observability-teardown/ansible/vector-template/vector.yaml.j2 similarity index 100% rename from archive/2026-03-observability-teardown/ansible/vector-template/vector.yaml.j2 rename to .archive/2026-03-observability-teardown/ansible/vector-template/vector.yaml.j2 diff --git a/archive/2026-03-observability-teardown/dashboards/clickhouse-monitoring.json b/.archive/2026-03-observability-teardown/dashboards/clickhouse-monitoring.json similarity index 100% rename from archive/2026-03-observability-teardown/dashboards/clickhouse-monitoring.json rename to .archive/2026-03-observability-teardown/dashboards/clickhouse-monitoring.json diff --git a/archive/2026-03-observability-teardown/dashboards/nginx-access-logs.json b/.archive/2026-03-observability-teardown/dashboards/nginx-access-logs.json similarity index 100% rename from archive/2026-03-observability-teardown/dashboards/nginx-access-logs.json rename to .archive/2026-03-observability-teardown/dashboards/nginx-access-logs.json diff --git a/archive/2026-03-observability-teardown/grafana-nginx-dashboard.md b/.archive/2026-03-observability-teardown/grafana-nginx-dashboard.md similarity index 100% rename from 
archive/2026-03-observability-teardown/grafana-nginx-dashboard.md rename to .archive/2026-03-observability-teardown/grafana-nginx-dashboard.md diff --git a/archive/2026-03-observability-teardown/nginx-logs-schema.md b/.archive/2026-03-observability-teardown/nginx-logs-schema.md similarity index 100% rename from archive/2026-03-observability-teardown/nginx-logs-schema.md rename to .archive/2026-03-observability-teardown/nginx-logs-schema.md diff --git a/archive/2026-03-observability-teardown/ops-backoffice-tools/grafana/charts/grafana/values.yaml b/.archive/2026-03-observability-teardown/ops-backoffice-tools/grafana/charts/grafana/values.yaml similarity index 100% rename from archive/2026-03-observability-teardown/ops-backoffice-tools/grafana/charts/grafana/values.yaml rename to .archive/2026-03-observability-teardown/ops-backoffice-tools/grafana/charts/grafana/values.yaml diff --git a/archive/2026-03-observability-teardown/ops-backoffice-tools/grafana/manifests/base/gateway.yaml b/.archive/2026-03-observability-teardown/ops-backoffice-tools/grafana/manifests/base/gateway.yaml similarity index 100% rename from archive/2026-03-observability-teardown/ops-backoffice-tools/grafana/manifests/base/gateway.yaml rename to .archive/2026-03-observability-teardown/ops-backoffice-tools/grafana/manifests/base/gateway.yaml diff --git a/archive/2026-03-observability-teardown/ops-backoffice-tools/grafana/manifests/base/httproutes.yaml b/.archive/2026-03-observability-teardown/ops-backoffice-tools/grafana/manifests/base/httproutes.yaml similarity index 100% rename from archive/2026-03-observability-teardown/ops-backoffice-tools/grafana/manifests/base/httproutes.yaml rename to .archive/2026-03-observability-teardown/ops-backoffice-tools/grafana/manifests/base/httproutes.yaml diff --git a/archive/2026-03-observability-teardown/ops-backoffice-tools/grafana/manifests/base/kustomization.yaml 
b/.archive/2026-03-observability-teardown/ops-backoffice-tools/grafana/manifests/base/kustomization.yaml similarity index 100% rename from archive/2026-03-observability-teardown/ops-backoffice-tools/grafana/manifests/base/kustomization.yaml rename to .archive/2026-03-observability-teardown/ops-backoffice-tools/grafana/manifests/base/kustomization.yaml diff --git a/archive/2026-03-observability-teardown/ops-backoffice-tools/grafana/manifests/base/namespace.yaml b/.archive/2026-03-observability-teardown/ops-backoffice-tools/grafana/manifests/base/namespace.yaml similarity index 100% rename from archive/2026-03-observability-teardown/ops-backoffice-tools/grafana/manifests/base/namespace.yaml rename to .archive/2026-03-observability-teardown/ops-backoffice-tools/grafana/manifests/base/namespace.yaml diff --git a/archive/2026-03-observability-teardown/ops-backoffice-tools/grafana/manifests/base/secrets/.gitignore b/.archive/2026-03-observability-teardown/ops-backoffice-tools/grafana/manifests/base/secrets/.gitignore similarity index 100% rename from archive/2026-03-observability-teardown/ops-backoffice-tools/grafana/manifests/base/secrets/.gitignore rename to .archive/2026-03-observability-teardown/ops-backoffice-tools/grafana/manifests/base/secrets/.gitignore diff --git a/archive/2026-03-observability-teardown/ops-backoffice-tools/grafana/manifests/base/secrets/.secrets.env.sample b/.archive/2026-03-observability-teardown/ops-backoffice-tools/grafana/manifests/base/secrets/.secrets.env.sample similarity index 100% rename from archive/2026-03-observability-teardown/ops-backoffice-tools/grafana/manifests/base/secrets/.secrets.env.sample rename to .archive/2026-03-observability-teardown/ops-backoffice-tools/grafana/manifests/base/secrets/.secrets.env.sample diff --git a/archive/2026-03-observability-teardown/ops-backoffice-tools/n8n/manifests/base/gateway.yaml b/.archive/2026-03-observability-teardown/ops-backoffice-tools/n8n/manifests/base/gateway.yaml similarity index 
100% rename from archive/2026-03-observability-teardown/ops-backoffice-tools/n8n/manifests/base/gateway.yaml rename to .archive/2026-03-observability-teardown/ops-backoffice-tools/n8n/manifests/base/gateway.yaml diff --git a/archive/2026-03-observability-teardown/ops-backoffice-tools/n8n/manifests/base/httproutes.yaml b/.archive/2026-03-observability-teardown/ops-backoffice-tools/n8n/manifests/base/httproutes.yaml similarity index 100% rename from archive/2026-03-observability-teardown/ops-backoffice-tools/n8n/manifests/base/httproutes.yaml rename to .archive/2026-03-observability-teardown/ops-backoffice-tools/n8n/manifests/base/httproutes.yaml diff --git a/archive/2026-03-observability-teardown/ops-backoffice-tools/n8n/manifests/base/kustomization.yaml b/.archive/2026-03-observability-teardown/ops-backoffice-tools/n8n/manifests/base/kustomization.yaml similarity index 100% rename from archive/2026-03-observability-teardown/ops-backoffice-tools/n8n/manifests/base/kustomization.yaml rename to .archive/2026-03-observability-teardown/ops-backoffice-tools/n8n/manifests/base/kustomization.yaml diff --git a/archive/2026-03-observability-teardown/ops-backoffice-tools/n8n/manifests/base/n8n-main-deployment.yaml b/.archive/2026-03-observability-teardown/ops-backoffice-tools/n8n/manifests/base/n8n-main-deployment.yaml similarity index 100% rename from archive/2026-03-observability-teardown/ops-backoffice-tools/n8n/manifests/base/n8n-main-deployment.yaml rename to .archive/2026-03-observability-teardown/ops-backoffice-tools/n8n/manifests/base/n8n-main-deployment.yaml diff --git a/archive/2026-03-observability-teardown/ops-backoffice-tools/n8n/manifests/base/n8n-main-service.yaml b/.archive/2026-03-observability-teardown/ops-backoffice-tools/n8n/manifests/base/n8n-main-service.yaml similarity index 100% rename from archive/2026-03-observability-teardown/ops-backoffice-tools/n8n/manifests/base/n8n-main-service.yaml rename to 
.archive/2026-03-observability-teardown/ops-backoffice-tools/n8n/manifests/base/n8n-main-service.yaml diff --git a/archive/2026-03-observability-teardown/ops-backoffice-tools/n8n/manifests/base/n8n-worker-deployment.yaml b/.archive/2026-03-observability-teardown/ops-backoffice-tools/n8n/manifests/base/n8n-worker-deployment.yaml similarity index 100% rename from archive/2026-03-observability-teardown/ops-backoffice-tools/n8n/manifests/base/n8n-worker-deployment.yaml rename to .archive/2026-03-observability-teardown/ops-backoffice-tools/n8n/manifests/base/n8n-worker-deployment.yaml diff --git a/archive/2026-03-observability-teardown/ops-backoffice-tools/n8n/manifests/base/namespace.yaml b/.archive/2026-03-observability-teardown/ops-backoffice-tools/n8n/manifests/base/namespace.yaml similarity index 100% rename from archive/2026-03-observability-teardown/ops-backoffice-tools/n8n/manifests/base/namespace.yaml rename to .archive/2026-03-observability-teardown/ops-backoffice-tools/n8n/manifests/base/namespace.yaml diff --git a/archive/2026-03-observability-teardown/ops-backoffice-tools/n8n/manifests/base/postgres-deployment.yaml b/.archive/2026-03-observability-teardown/ops-backoffice-tools/n8n/manifests/base/postgres-deployment.yaml similarity index 100% rename from archive/2026-03-observability-teardown/ops-backoffice-tools/n8n/manifests/base/postgres-deployment.yaml rename to .archive/2026-03-observability-teardown/ops-backoffice-tools/n8n/manifests/base/postgres-deployment.yaml diff --git a/archive/2026-03-observability-teardown/ops-backoffice-tools/n8n/manifests/base/postgres-service.yaml b/.archive/2026-03-observability-teardown/ops-backoffice-tools/n8n/manifests/base/postgres-service.yaml similarity index 100% rename from archive/2026-03-observability-teardown/ops-backoffice-tools/n8n/manifests/base/postgres-service.yaml rename to .archive/2026-03-observability-teardown/ops-backoffice-tools/n8n/manifests/base/postgres-service.yaml diff --git 
a/archive/2026-03-observability-teardown/ops-backoffice-tools/n8n/manifests/base/pvc.yaml b/.archive/2026-03-observability-teardown/ops-backoffice-tools/n8n/manifests/base/pvc.yaml similarity index 100% rename from archive/2026-03-observability-teardown/ops-backoffice-tools/n8n/manifests/base/pvc.yaml rename to .archive/2026-03-observability-teardown/ops-backoffice-tools/n8n/manifests/base/pvc.yaml diff --git a/archive/2026-03-observability-teardown/ops-backoffice-tools/n8n/manifests/base/redis-deployment.yaml b/.archive/2026-03-observability-teardown/ops-backoffice-tools/n8n/manifests/base/redis-deployment.yaml similarity index 100% rename from archive/2026-03-observability-teardown/ops-backoffice-tools/n8n/manifests/base/redis-deployment.yaml rename to .archive/2026-03-observability-teardown/ops-backoffice-tools/n8n/manifests/base/redis-deployment.yaml diff --git a/archive/2026-03-observability-teardown/ops-backoffice-tools/n8n/manifests/base/redis-service.yaml b/.archive/2026-03-observability-teardown/ops-backoffice-tools/n8n/manifests/base/redis-service.yaml similarity index 100% rename from archive/2026-03-observability-teardown/ops-backoffice-tools/n8n/manifests/base/redis-service.yaml rename to .archive/2026-03-observability-teardown/ops-backoffice-tools/n8n/manifests/base/redis-service.yaml diff --git a/archive/2026-03-observability-teardown/ops-backoffice-tools/n8n/manifests/base/secrets/.gitignore b/.archive/2026-03-observability-teardown/ops-backoffice-tools/n8n/manifests/base/secrets/.gitignore similarity index 100% rename from archive/2026-03-observability-teardown/ops-backoffice-tools/n8n/manifests/base/secrets/.gitignore rename to .archive/2026-03-observability-teardown/ops-backoffice-tools/n8n/manifests/base/secrets/.gitignore diff --git a/archive/2026-03-observability-teardown/ops-backoffice-tools/prometheus/charts/kube-prometheus-stack/values.yaml 
b/.archive/2026-03-observability-teardown/ops-backoffice-tools/prometheus/charts/kube-prometheus-stack/values.yaml similarity index 100% rename from archive/2026-03-observability-teardown/ops-backoffice-tools/prometheus/charts/kube-prometheus-stack/values.yaml rename to .archive/2026-03-observability-teardown/ops-backoffice-tools/prometheus/charts/kube-prometheus-stack/values.yaml diff --git a/archive/2026-03-observability-teardown/ops-backoffice-tools/prometheus/manifests/base/kustomization.yaml b/.archive/2026-03-observability-teardown/ops-backoffice-tools/prometheus/manifests/base/kustomization.yaml similarity index 100% rename from archive/2026-03-observability-teardown/ops-backoffice-tools/prometheus/manifests/base/kustomization.yaml rename to .archive/2026-03-observability-teardown/ops-backoffice-tools/prometheus/manifests/base/kustomization.yaml diff --git a/archive/2026-03-observability-teardown/ops-backoffice-tools/prometheus/manifests/base/longhorn-servicemonitor.yaml b/.archive/2026-03-observability-teardown/ops-backoffice-tools/prometheus/manifests/base/longhorn-servicemonitor.yaml similarity index 100% rename from archive/2026-03-observability-teardown/ops-backoffice-tools/prometheus/manifests/base/longhorn-servicemonitor.yaml rename to .archive/2026-03-observability-teardown/ops-backoffice-tools/prometheus/manifests/base/longhorn-servicemonitor.yaml diff --git a/archive/2026-03-observability-teardown/ops-backoffice-tools/prometheus/manifests/base/namespace.yaml b/.archive/2026-03-observability-teardown/ops-backoffice-tools/prometheus/manifests/base/namespace.yaml similarity index 100% rename from archive/2026-03-observability-teardown/ops-backoffice-tools/prometheus/manifests/base/namespace.yaml rename to .archive/2026-03-observability-teardown/ops-backoffice-tools/prometheus/manifests/base/namespace.yaml diff --git a/archive/2026-03-observability-teardown/ops-backoffice-tools/prometheus/manifests/base/secrets/.gitignore 
b/.archive/2026-03-observability-teardown/ops-backoffice-tools/prometheus/manifests/base/secrets/.gitignore similarity index 100% rename from archive/2026-03-observability-teardown/ops-backoffice-tools/prometheus/manifests/base/secrets/.gitignore rename to .archive/2026-03-observability-teardown/ops-backoffice-tools/prometheus/manifests/base/secrets/.gitignore diff --git a/archive/2026-03-observability-teardown/ops-backoffice-tools/prometheus/manifests/base/tailscale-ingress.yaml b/.archive/2026-03-observability-teardown/ops-backoffice-tools/prometheus/manifests/base/tailscale-ingress.yaml similarity index 100% rename from archive/2026-03-observability-teardown/ops-backoffice-tools/prometheus/manifests/base/tailscale-ingress.yaml rename to .archive/2026-03-observability-teardown/ops-backoffice-tools/prometheus/manifests/base/tailscale-ingress.yaml diff --git a/archive/2026-03-observability-teardown/ops-logs-clickhouse/.gitignore b/.archive/2026-03-observability-teardown/ops-logs-clickhouse/.gitignore similarity index 100% rename from archive/2026-03-observability-teardown/ops-logs-clickhouse/.gitignore rename to .archive/2026-03-observability-teardown/ops-logs-clickhouse/.gitignore diff --git a/archive/2026-03-observability-teardown/ops-logs-clickhouse/apps/clickhouse/manifests/base/clickhouse-installation.yaml b/.archive/2026-03-observability-teardown/ops-logs-clickhouse/apps/clickhouse/manifests/base/clickhouse-installation.yaml similarity index 100% rename from archive/2026-03-observability-teardown/ops-logs-clickhouse/apps/clickhouse/manifests/base/clickhouse-installation.yaml rename to .archive/2026-03-observability-teardown/ops-logs-clickhouse/apps/clickhouse/manifests/base/clickhouse-installation.yaml diff --git a/archive/2026-03-observability-teardown/ops-logs-clickhouse/apps/clickhouse/manifests/base/clickhouse-keeper.yaml b/.archive/2026-03-observability-teardown/ops-logs-clickhouse/apps/clickhouse/manifests/base/clickhouse-keeper.yaml similarity index 
100% rename from archive/2026-03-observability-teardown/ops-logs-clickhouse/apps/clickhouse/manifests/base/clickhouse-keeper.yaml rename to .archive/2026-03-observability-teardown/ops-logs-clickhouse/apps/clickhouse/manifests/base/clickhouse-keeper.yaml diff --git a/archive/2026-03-observability-teardown/ops-logs-clickhouse/apps/clickhouse/manifests/base/kustomization.yaml b/.archive/2026-03-observability-teardown/ops-logs-clickhouse/apps/clickhouse/manifests/base/kustomization.yaml similarity index 100% rename from archive/2026-03-observability-teardown/ops-logs-clickhouse/apps/clickhouse/manifests/base/kustomization.yaml rename to .archive/2026-03-observability-teardown/ops-logs-clickhouse/apps/clickhouse/manifests/base/kustomization.yaml diff --git a/archive/2026-03-observability-teardown/ops-logs-clickhouse/apps/clickhouse/manifests/base/namespace.yaml b/.archive/2026-03-observability-teardown/ops-logs-clickhouse/apps/clickhouse/manifests/base/namespace.yaml similarity index 100% rename from archive/2026-03-observability-teardown/ops-logs-clickhouse/apps/clickhouse/manifests/base/namespace.yaml rename to .archive/2026-03-observability-teardown/ops-logs-clickhouse/apps/clickhouse/manifests/base/namespace.yaml diff --git a/archive/2026-03-observability-teardown/ops-logs-clickhouse/apps/clickhouse/manifests/base/secrets/.gitignore b/.archive/2026-03-observability-teardown/ops-logs-clickhouse/apps/clickhouse/manifests/base/secrets/.gitignore similarity index 100% rename from archive/2026-03-observability-teardown/ops-logs-clickhouse/apps/clickhouse/manifests/base/secrets/.gitignore rename to .archive/2026-03-observability-teardown/ops-logs-clickhouse/apps/clickhouse/manifests/base/secrets/.gitignore diff --git a/archive/2026-03-observability-teardown/ops-logs-clickhouse/apps/clickhouse/manifests/base/secrets/users-secret.yaml.sample b/.archive/2026-03-observability-teardown/ops-logs-clickhouse/apps/clickhouse/manifests/base/secrets/users-secret.yaml.sample 
similarity index 100% rename from archive/2026-03-observability-teardown/ops-logs-clickhouse/apps/clickhouse/manifests/base/secrets/users-secret.yaml.sample rename to .archive/2026-03-observability-teardown/ops-logs-clickhouse/apps/clickhouse/manifests/base/secrets/users-secret.yaml.sample diff --git a/archive/2026-03-observability-teardown/ops-logs-clickhouse/apps/clickhouse/manifests/base/service-tailscale.yaml b/.archive/2026-03-observability-teardown/ops-logs-clickhouse/apps/clickhouse/manifests/base/service-tailscale.yaml similarity index 100% rename from archive/2026-03-observability-teardown/ops-logs-clickhouse/apps/clickhouse/manifests/base/service-tailscale.yaml rename to .archive/2026-03-observability-teardown/ops-logs-clickhouse/apps/clickhouse/manifests/base/service-tailscale.yaml diff --git a/archive/2026-03-observability-teardown/ops-logs-clickhouse/apps/clickhouse/schemas/002-logs-nginx-stg.sql b/.archive/2026-03-observability-teardown/ops-logs-clickhouse/apps/clickhouse/schemas/002-logs-nginx-stg.sql similarity index 100% rename from archive/2026-03-observability-teardown/ops-logs-clickhouse/apps/clickhouse/schemas/002-logs-nginx-stg.sql rename to .archive/2026-03-observability-teardown/ops-logs-clickhouse/apps/clickhouse/schemas/002-logs-nginx-stg.sql diff --git a/archive/2026-03-observability-teardown/ops-logs-clickhouse/apps/clickhouse/schemas/003-logs-nginx-prd.sql b/.archive/2026-03-observability-teardown/ops-logs-clickhouse/apps/clickhouse/schemas/003-logs-nginx-prd.sql similarity index 100% rename from archive/2026-03-observability-teardown/ops-logs-clickhouse/apps/clickhouse/schemas/003-logs-nginx-prd.sql rename to .archive/2026-03-observability-teardown/ops-logs-clickhouse/apps/clickhouse/schemas/003-logs-nginx-prd.sql diff --git a/archive/2026-03-observability-teardown/ops-logs-clickhouse/cluster/charts/tailscale-operator/values.yaml b/.archive/2026-03-observability-teardown/ops-logs-clickhouse/cluster/charts/tailscale-operator/values.yaml 
similarity index 100% rename from archive/2026-03-observability-teardown/ops-logs-clickhouse/cluster/charts/tailscale-operator/values.yaml rename to .archive/2026-03-observability-teardown/ops-logs-clickhouse/cluster/charts/tailscale-operator/values.yaml diff --git a/archive/2026-03-observability-teardown/ops-logs-clickhouse/cluster/charts/traefik/values.yaml b/.archive/2026-03-observability-teardown/ops-logs-clickhouse/cluster/charts/traefik/values.yaml similarity index 100% rename from archive/2026-03-observability-teardown/ops-logs-clickhouse/cluster/charts/traefik/values.yaml rename to .archive/2026-03-observability-teardown/ops-logs-clickhouse/cluster/charts/traefik/values.yaml diff --git a/archive/2026-03-observability-teardown/ops-logs-clickhouse/cluster/tailscale/README.md b/.archive/2026-03-observability-teardown/ops-logs-clickhouse/cluster/tailscale/README.md similarity index 100% rename from archive/2026-03-observability-teardown/ops-logs-clickhouse/cluster/tailscale/README.md rename to .archive/2026-03-observability-teardown/ops-logs-clickhouse/cluster/tailscale/README.md diff --git a/archive/2026-03-observability-teardown/ops-logs-clickhouse/cluster/tailscale/operator-values.yaml b/.archive/2026-03-observability-teardown/ops-logs-clickhouse/cluster/tailscale/operator-values.yaml similarity index 100% rename from archive/2026-03-observability-teardown/ops-logs-clickhouse/cluster/tailscale/operator-values.yaml rename to .archive/2026-03-observability-teardown/ops-logs-clickhouse/cluster/tailscale/operator-values.yaml diff --git a/archive/2026-03-observability-teardown/teardown-runbook.md b/.archive/2026-03-observability-teardown/teardown-runbook.md similarity index 100% rename from archive/2026-03-observability-teardown/teardown-runbook.md rename to .archive/2026-03-observability-teardown/teardown-runbook.md diff --git a/.envrc b/.envrc new file mode 100644 index 000000000..072f6caea --- /dev/null +++ b/.envrc @@ -0,0 +1,28 @@ 
+SECRETS_DIR="${SECRETS_DIR:-$(expand_path ../infra-secrets)}" + +use_sops() { + local path="$1" + local type="${2:-dotenv}" + if [ -f "$path" ]; then + local decrypted + decrypted=$(sops -d --input-type "$type" --output-type "$type" "$path" 2>&1) || { + log_error "sops decrypt failed for $path" + return + } + eval "$(echo "$decrypted" | direnv dotenv bash /dev/stdin)" + watch_file "$path" + fi +} + +if [ -d "$SECRETS_DIR" ]; then + use_sops "$SECRETS_DIR/global/.env.enc" +else + log_error "infra-secrets repo not found at $SECRETS_DIR" + log_error "Clone it: git clone git@github.com:freeCodeCamp/infra-secrets.git ../infra-secrets" +fi + +dotenv_if_exists .env + +if [ -d ansible/.venv ]; then + PATH_add ansible/.venv/bin +fi diff --git a/.github/workflows/k8s--validate.yml b/.github/workflows/k8s--validate.yml new file mode 100644 index 000000000..d18849cdb --- /dev/null +++ b/.github/workflows/k8s--validate.yml @@ -0,0 +1,35 @@ +name: K8s -- Manifest Validation + +on: + push: + branches: + - main + pull_request: + branches: + - main + workflow_dispatch: + +permissions: + contents: read + +jobs: + validate: + name: K8s -- Manifest Validation + runs-on: ubuntu-latest + + steps: + - name: Checkout + uses: actions/checkout@34e114876b0b11c390a56381ad16ebd13914f8d5 # v4 + + - name: Install just + run: | + curl --proto '=https' --tlsv1.2 -sSf https://just.systems/install.sh | bash -s -- --to /usr/local/bin + just --version + + - name: Install kubeconform + run: | + curl -sL https://github.com/yannh/kubeconform/releases/download/v0.7.0/kubeconform-linux-amd64.tar.gz | tar xz -C /usr/local/bin + kubeconform -v + + - name: Validate K8s manifests + run: just k8s-validate 1.32.0 diff --git a/.gitignore b/.gitignore index 73b3f2a3a..35f623d08 100644 --- a/.gitignore +++ b/.gitignore @@ -36,7 +36,7 @@ terraform.rc .vscode/ # Ignore User-specific temporary files -__scratchpad__/ +.scratchpad # Ignore generated files manifest.json @@ -48,7 +48,6 @@ ansible/inventory/hosts # 
Secrets *.env *.env.* -.envrc .kubeconfig.yaml *.crt @@ -61,3 +60,7 @@ secrets.overrides.yaml o11y/defaults/ .beads-credential-key + +# Beads / Dolt files (added by bd init) +.dolt/ +*.db diff --git a/__scratchpad__/.gitkeep b/.scratchpad/.gitkeep similarity index 100% rename from __scratchpad__/.gitkeep rename to .scratchpad/.gitkeep diff --git a/__scratchpad__/README.md b/.scratchpad/README.md similarity index 100% rename from __scratchpad__/README.md rename to .scratchpad/README.md diff --git a/ansible/ansible.cfg b/ansible/ansible.cfg index 682945304..d28e0756a 100644 --- a/ansible/ansible.cfg +++ b/ansible/ansible.cfg @@ -8,6 +8,8 @@ inventory = ./inventory home = ./.ansible collections_path = ./.ansible/collections:./roles roles_path = ./.ansible/roles:./roles +# Secrets managed via sops+age in the infra-secrets private repo +# Env vars loaded via direnv; vault vars via community.sops collection when needed [inventory] enable_plugins = yaml, ini, toml, community.general.linode, community.digitalocean.digitalocean diff --git a/ansible/inventory/digitalocean.yml b/ansible/inventory/digitalocean.yml index c9f0eeb36..464576de6 100644 --- a/ansible/inventory/digitalocean.yml +++ b/ansible/inventory/digitalocean.yml @@ -2,6 +2,7 @@ plugin: community.digitalocean.digitalocean api_token: "{{ lookup('ansible.builtin.env', 'DO_API_TOKEN') }}" attributes: + - id - name - tags - networks diff --git a/ansible/inventory/group_vars/gxy_mgmt_k3s.yml b/ansible/inventory/group_vars/gxy_mgmt_k3s.yml new file mode 100644 index 000000000..88a567b8c --- /dev/null +++ b/ansible/inventory/group_vars/gxy_mgmt_k3s.yml @@ -0,0 +1,42 @@ +--- +# gxy-management galaxy configuration +# Applied automatically when targeting the gxy_mgmt_k3s inventory group +# +# To add a new galaxy: create a new file matching the DO inventory tag. 
+ +galaxy_name: gxy-management +k3s_version: v1.34.5+k3s1 +cilium_cluster_id: 1 + +# k3s config.yaml — written to /etc/rancher/k3s/config.yaml by the role +# Keys are hyphenated, matching CLI flags. Docs: +# https://docs.k3s.io/installation/configuration +# https://docs.k3s.io/security/hardening-guide +# +# Do NOT add tls-san, token, cluster-init, or server here — +# those are managed by the k3s-ansible role via extra_server_args. +server_config_yaml: | + flannel-backend: "none" + disable-network-policy: true + # kube-proxy replacement disabled — breaks etcd on k3s HA (see field-notes Failure 7) + # Cilium still provides CNI + network policies + Hubble without it + # Revisit on bare metal where performance matters + disable-kube-proxy: false + cluster-cidr: "10.1.0.0/16" + service-cidr: "10.11.0.0/16" + protect-kernel-defaults: true + secrets-encryption: true + kube-apiserver-arg: + - "admission-control-config-file=/etc/rancher/k3s/pss-admission.yaml" + - "audit-log-path=/var/log/k3s/audit.log" + - "audit-policy-file=/etc/rancher/k3s/audit-policy.yaml" + - "audit-log-maxage=30" + - "audit-log-maxbackup=10" + - "audit-log-maxsize=100" + etcd-s3: true + etcd-s3-endpoint: "fra1.digitaloceanspaces.com" + etcd-s3-bucket: "net.freecodecamp.universe-backups" + etcd-s3-folder: "etcd/gxy-management" + etcd-s3-region: "fra1" + etcd-snapshot-schedule-cron: "0 */6 * * *" + etcd-snapshot-retention: 20 diff --git a/ansible/justfile b/ansible/justfile deleted file mode 100644 index dc61cc942..000000000 --- a/ansible/justfile +++ /dev/null @@ -1,63 +0,0 @@ -set shell := ["bash", "-cu"] - -venv := ".venv" -INVENTORY := env("INVENTORY", "linode.yml") - -# Show available recipes -default: - @just --list - @echo "" - @echo "UV-BASED WORKFLOW:" - @echo " 1. just install # Install ansible + deps with uv" - @echo " 2. direnv allow # Auto-activate venv (one-time)" - @echo " 3. just test # Test connection" - @echo " 4. ansible-playbook ... 
# Run playbooks" - -# Install ansible and dependencies using uv -install: - #!/usr/bin/env bash - set -eu - if ! command -v uv >/dev/null 2>&1; then - echo "ERROR: uv not found. Please install uv first:" - echo " curl -LsSf https://astral.sh/uv/install.sh | sh" - exit 1 - fi - uv sync - source {{venv}}/bin/activate && ansible-galaxy install -r requirements.yml - -# Remove virtual environment and ansible directories -[confirm("This will delete the virtual environment and .ansible directory. Continue?")] -clean: - rm -rf {{venv}} .ansible - -# Test connection to random VM (set INVENTORY env var, default: linode.yml) -test: - #!/usr/bin/env bash - set -eu - echo "Counting VMs in inventory..." - if ! command -v ansible >/dev/null 2>&1; then - echo "ERROR: ansible not found - did you source the venv?" - echo "Run: source {{venv}}/bin/activate" - exit 1 - fi - if ! command -v jq >/dev/null 2>&1; then - echo "ERROR: jq not found - please install jq" - exit 1 - fi - VM_COUNT=$(ansible-inventory -i inventory/{{INVENTORY}} --list 2>/dev/null | jq -r '._meta.hostvars | keys | length') - if [ $? -ne 0 ]; then - echo "ERROR: Failed to parse inventory" - exit 1 - fi - echo "Found $VM_COUNT VMs in inventory" - if [ "$VM_COUNT" -eq 0 ]; then - echo "ERROR: No VMs found in inventory" - exit 1 - fi - RANDOM_INDEX=$(( RANDOM % VM_COUNT )) - echo "Testing connection to VM at index $RANDOM_INDEX..." - if ! ansible -i inventory/{{INVENTORY}} "all[$RANDOM_INDEX]" -m ping --one-line -v; then - echo "ERROR: Connection test failed" - exit 1 - fi - echo "SUCCESS: Connection test passed" diff --git a/ansible/play-k3s--bootstrap.yml b/ansible/play-k3s--bootstrap.yml new file mode 100644 index 000000000..811a5f4be --- /dev/null +++ b/ansible/play-k3s--bootstrap.yml @@ -0,0 +1,245 @@ +--- +# Deploy k3s HA galaxy cluster with Cilium CNI +# +# Generic playbook for any Universe galaxy. All galaxy-specific config +# lives in inventory/group_vars/.yml — not in this file. 
+# +# Plays: validate → prereqs → k3s server → cilium → verify + kubeconfig +# +# Prerequisites: +# - VMs provisioned with VPC (eth1) and Tailscale connected +# - DO cloud firewall allows: 2379-2380, 4240, 5001, 6443, 8472, 10250 between VPC nodes +# - Env vars loaded via direnv (cd into cluster dir first) +# - Group vars populated for the target inventory group +# +# Usage: +# cd k3s/ +# just play k3s--galaxy + +# Play 1: Validate prerequisites and build dynamic groups +- name: "K3s {{ variable_host }} - Validate" + hosts: "{{ variable_host }}" + gather_facts: true + become: true + vars: + do_spaces_access_key: "{{ lookup('env', 'DO_SPACES_ACCESS_KEY') }}" + do_spaces_secret_key: "{{ lookup('env', 'DO_SPACES_SECRET_KEY') }}" + + tasks: + - name: Validate required group_vars + assert: + that: + - galaxy_name is defined and galaxy_name | length > 0 + - k3s_version is defined and k3s_version | length > 0 + - server_config_yaml is defined and server_config_yaml | length > 0 + - cilium_cluster_id is defined and cilium_cluster_id | string | length > 0 + fail_msg: "Missing group_vars. Populate inventory/group_vars/{{ variable_host }}.yml" + + - name: Validate env vars loaded + assert: + that: + - do_spaces_access_key | length > 0 + - do_spaces_secret_key | length > 0 + fail_msg: "DO_SPACES_ACCESS_KEY/SECRET_KEY not set. cd into cluster dir first." + + - name: Validate VPC interface (eth1) + assert: + that: + - ansible_eth1 is defined + - ansible_eth1.ipv4 is defined + - ansible_eth1.ipv4.address | regex_search('^10\.') + fail_msg: "VPC interface eth1 not found or IP not in 10.x.x.x range." + + - name: Validate Tailscale connected + assert: + that: + - ansible_tailscale0 is defined + - ansible_tailscale0.ipv4 is defined + fail_msg: "Tailscale not connected." 
+ + - name: Set network facts + set_fact: + vpc_ip: "{{ ansible_eth1.ipv4.address }}" + tailscale_ip: "{{ ansible_tailscale0.ipv4.address }}" + + - name: Display configuration + debug: + msg: "{{ inventory_hostname }}: VPC={{ vpc_ip }}, TS={{ tailscale_ip }}, galaxy={{ galaxy_name }}" + + - name: Build dynamic groups + group_by: + key: "{{ item }}" + loop: [k3s_cluster, server] + +# Play 2: System prerequisites (before k3s starts) +- name: "K3s {{ variable_host }} - Prerequisites" + hosts: k3s_cluster + gather_facts: true + become: true + vars: + cluster_config_dir: "{{ playbook_dir }}/../k3s/{{ galaxy_name }}" + + pre_tasks: + - name: Set CIS kubelet kernel parameters + ansible.posix.sysctl: + name: "{{ item.key }}" + value: "{{ item.value }}" + sysctl_file: /etc/sysctl.d/90-kubelet.conf + reload: true + loop: + - { key: vm.overcommit_memory, value: "1" } + - { key: vm.panic_on_oom, value: "0" } + - { key: kernel.panic, value: "10" } + - { key: kernel.panic_on_oops, value: "1" } + + - name: Ensure k3s directories exist + file: + path: "{{ item }}" + state: directory + mode: "0755" + loop: + - /etc/rancher/k3s + - /var/lib/rancher/k3s/server/manifests + - /var/log/k3s + + - name: Copy PSS admission config + copy: + src: "{{ cluster_config_dir }}/cluster/security/pss-admission.yaml" + dest: /etc/rancher/k3s/pss-admission.yaml + mode: "0600" + + - name: Copy audit policy + copy: + src: "{{ cluster_config_dir }}/cluster/security/audit-policy.yaml" + dest: /etc/rancher/k3s/audit-policy.yaml + mode: "0600" + + - name: Copy Traefik HelmChartConfig + copy: + src: "{{ cluster_config_dir }}/cluster/traefik-config.yaml" + dest: /var/lib/rancher/k3s/server/manifests/traefik-config.yaml + mode: "0600" + + roles: + - role: k3s.orchestration.prereq + +# Play 3: Deploy k3s server +- name: "K3s {{ variable_host }} - Deploy" + hosts: server + gather_facts: true + become: true + vars: + api_endpoint: "{{ hostvars[groups['server'][0]]['vpc_ip'] }}" + extra_server_args: >- + 
--node-ip={{ hostvars[inventory_hostname]['vpc_ip'] }} + --advertise-address={{ hostvars[inventory_hostname]['vpc_ip'] }} + --tls-san={{ hostvars[inventory_hostname]['vpc_ip'] }} + --tls-san={{ hostvars[inventory_hostname]['tailscale_ip'] }} + extra_service_envs: + - "AWS_ACCESS_KEY_ID={{ lookup('env', 'DO_SPACES_ACCESS_KEY') }}" + - "AWS_SECRET_ACCESS_KEY={{ lookup('env', 'DO_SPACES_SECRET_KEY') }}" + server_group: server + user_kubectl: false + roles: + - role: k3s.orchestration.k3s_server + +# Play 4: Install Cilium CNI +- name: "K3s {{ variable_host }} - Cilium" + hosts: server[0] + gather_facts: false + become: true + vars: + cluster_config_dir: "{{ playbook_dir }}/../k3s/{{ galaxy_name }}" + roles: + - role: cilium + vars: + cilium_cluster_name: "{{ galaxy_name }}" + cilium_values_file: "{{ cluster_config_dir }}/cluster/cilium/values.yaml" + cilium_k8s_service_host: "{{ hostvars[groups['server'][0]]['vpc_ip'] }}" + +# Play 5: Verify cluster and fetch kubeconfig +- name: "K3s {{ variable_host }} - Verify" + hosts: server[0] + gather_facts: false + become: true + vars: + cluster_config_dir: "{{ playbook_dir }}/../k3s/{{ galaxy_name }}" + + tasks: + - name: Wait for all nodes Ready + command: k3s kubectl wait --for=condition=Ready nodes --all --timeout=300s + register: wait_result + retries: 3 + delay: 30 + until: wait_result.rc == 0 + changed_when: false + + - name: Display cluster status + command: k3s kubectl get nodes -o wide + register: cluster_status + changed_when: false + + - name: Cluster ready + debug: + msg: "{{ cluster_status.stdout_lines }}" + + - name: Read kubeconfig + slurp: + src: /etc/rancher/k3s/k3s.yaml + register: kubeconfig_raw + no_log: true + + - name: Write kubeconfig locally + copy: + content: "{{ kubeconfig_raw.content | b64decode }}" + dest: "{{ cluster_config_dir }}/.kubeconfig.yaml" + mode: "0600" + delegate_to: localhost + become: false + no_log: true + + - name: Fix kubeconfig server address (use Tailscale IP) + delegate_to: 
localhost + become: false + ansible.builtin.replace: + path: "{{ cluster_config_dir }}/.kubeconfig.yaml" + regexp: 'https://127\.0\.0\.1:6443' + replace: "https://{{ hostvars[inventory_hostname]['tailscale_ip'] }}:6443" + + - name: Fix kubeconfig context name + delegate_to: localhost + become: false + ansible.builtin.replace: + path: "{{ cluster_config_dir }}/.kubeconfig.yaml" + regexp: '(\s+)(name|cluster|user|current-context): default' + replace: '\1\2: {{ galaxy_name }}' + + - name: Verify kubectl connectivity + command: kubectl get nodes + environment: + KUBECONFIG: "{{ cluster_config_dir }}/.kubeconfig.yaml" + register: kubectl_result + changed_when: false + delegate_to: localhost + become: false + + - name: Patch metrics-server for hostNetwork (pod→nodeIP workaround) + command: >- + k3s kubectl -n kube-system patch deploy metrics-server + --type=json + -p='[{"op":"add","path":"/spec/template/spec/hostNetwork","value":true}, + {"op":"replace","path":"/spec/template/spec/containers/0/ports/0/containerPort","value":4443}, + {"op":"replace","path":"/spec/template/spec/containers/0/args","value":["--cert-dir=/tmp","--secure-port=4443","--kubelet-preferred-address-types=InternalIP,ExternalIP,Hostname","--kubelet-use-node-status-port","--metric-resolution=15s"]}]' + register: patch_result + changed_when: "'patched' in patch_result.stdout" + failed_when: false + retries: 5 + delay: 10 + until: patch_result.rc == 0 + + - name: Done + debug: + msg: + - "=== {{ galaxy_name }} cluster ready ===" + - "Kubeconfig: k3s/{{ galaxy_name }}/.kubeconfig.yaml" + - "{{ kubectl_result.stdout }}" diff --git a/ansible/play-k3s--reset.yml b/ansible/play-k3s--reset.yml new file mode 100644 index 000000000..365429de5 --- /dev/null +++ b/ansible/play-k3s--reset.yml @@ -0,0 +1,70 @@ +--- +# Reset k3s cluster — removes k3s, Cilium, etcd data, all k8s state. +# Preserves: Tailscale, cloud-init hardening, DO infrastructure, CIS sysctls. 
+# +# Usage: +# cd k3s/ +# just play k3s--reset + +- name: "K3s {{ variable_host }} - Build groups" + hosts: "{{ variable_host }}" + gather_facts: false + tasks: + - name: Build dynamic groups + group_by: + key: "{{ item }}" + loop: [k3s_cluster, server] + +- name: "K3s {{ variable_host }} - Reset" + hosts: k3s_cluster + gather_facts: true + become: true + tasks: + - name: Run k3s uninstall (server) + when: "'server' in group_names" + command: + cmd: k3s-uninstall.sh + removes: /var/lib/rancher/k3s/* + + - name: Run k3s uninstall (agent) + when: "'agent' in group_names" + command: + cmd: k3s-agent-uninstall.sh + removes: /var/lib/rancher/k3s/* + + - name: Clean up remaining files + file: + path: "{{ item }}" + state: absent + loop: + - "~{{ ansible_user_id }}/.kube/config" + - /usr/local/bin/k3s-install.sh + - /etc/rancher + - /var/lib/rancher + - /etc/systemd/system/k3s.service.env + - /usr/local/bin/helm + - /var/log/k3s + + - name: Clean up Cilium BPF state + command: + cmd: rm -rf /sys/fs/bpf/cilium + removes: /sys/fs/bpf/cilium + + - name: Flush Cilium iptables chains + shell: | + iptables-save | grep -iv cilium | iptables-restore + ip6tables-save | grep -iv cilium | ip6tables-restore + changed_when: true + ignore_errors: true + +- name: "K3s {{ variable_host }} - Clean local kubeconfig" + hosts: server[0] + gather_facts: false + become: false + tasks: + - name: Remove local kubeconfig + file: + path: "{{ playbook_dir }}/../k3s/{{ galaxy_name }}/.kubeconfig.yaml" + state: absent + delegate_to: localhost + when: galaxy_name is defined diff --git a/ansible/requirements.yml b/ansible/requirements.yml index 89788f790..c5970c72e 100644 --- a/ansible/requirements.yml +++ b/ansible/requirements.yml @@ -8,6 +8,8 @@ collections: version: ">=1.27.0,<2.0.0" - name: ansible.posix version: ">=2.0.0,<3.0.0" + - name: kubernetes.core + version: ">=5.0.0,<7.0.0" - name: grafana.grafana version: ">=5.7.0,<6.0.0" - name: https://github.com/k3s-io/k3s-ansible.git diff --git 
a/ansible/roles/cilium/defaults/main.yml b/ansible/roles/cilium/defaults/main.yml new file mode 100644 index 000000000..f63c3b65c --- /dev/null +++ b/ansible/roles/cilium/defaults/main.yml @@ -0,0 +1,8 @@ +--- +cilium_version: "1.19.2" +helm_version: "v3.17.3" +cilium_cluster_name: "" +cilium_cluster_id: "" +cilium_values_file: "" +cilium_k8s_service_host: "" +cilium_k8s_service_port: "6443" diff --git a/ansible/roles/cilium/tasks/main.yml b/ansible/roles/cilium/tasks/main.yml new file mode 100644 index 000000000..14ea98c67 --- /dev/null +++ b/ansible/roles/cilium/tasks/main.yml @@ -0,0 +1,91 @@ +--- +- name: Validate required Cilium variables + ansible.builtin.assert: + that: + - cilium_cluster_name | length > 0 + - cilium_cluster_id | string | length > 0 + - cilium_values_file | length > 0 + - cilium_k8s_service_host | length > 0 + fail_msg: >- + Required variables missing. Set cilium_cluster_name, cilium_cluster_id, + cilium_values_file, and cilium_k8s_service_host. + +- name: Copy Cilium values file to server + ansible.builtin.copy: + src: "{{ cilium_values_file }}" + dest: /etc/rancher/k3s/cilium-values.yaml + mode: "0600" + +- name: Install Helm + ansible.builtin.shell: + cmd: set -o pipefail && curl https://raw.githubusercontent.com/helm/helm/{{ helm_version }}/scripts/get-helm-3 | DESIRED_VERSION={{ helm_version }} bash + executable: /bin/bash + creates: /usr/local/bin/helm + +- name: Add Cilium Helm repo + kubernetes.core.helm_repository: + name: cilium + repo_url: https://helm.cilium.io/ + environment: + KUBECONFIG: /etc/rancher/k3s/k3s.yaml + +- name: Install Cilium + kubernetes.core.helm: + name: cilium + chart_ref: cilium/cilium + chart_version: "{{ cilium_version }}" + release_namespace: kube-system + update_repo_cache: true + values_files: + - /etc/rancher/k3s/cilium-values.yaml + set_values: + - value: "cluster.name={{ cilium_cluster_name }}" + - value: "cluster.id={{ cilium_cluster_id }}" + - value: "k8sServiceHost={{ cilium_k8s_service_host }}" + 
- value: "k8sServicePort={{ cilium_k8s_service_port }}" + wait: true + timeout: "10m0s" + environment: + KUBECONFIG: /etc/rancher/k3s/k3s.yaml + +- name: Clean up Cilium values file + ansible.builtin.file: + path: /etc/rancher/k3s/cilium-values.yaml + state: absent + +- name: Wait for Cilium agent DaemonSet rollout + ansible.builtin.command: + cmd: kubectl -n kube-system rollout status daemonset/cilium --timeout=180s + environment: + KUBECONFIG: /etc/rancher/k3s/k3s.yaml + changed_when: false + retries: 3 + delay: 15 + register: cilium_agent_rollout + until: cilium_agent_rollout.rc == 0 + +- name: Wait for Cilium operator Deployment rollout + ansible.builtin.command: + cmd: kubectl -n kube-system rollout status deployment/cilium-operator --timeout=180s + environment: + KUBECONFIG: /etc/rancher/k3s/k3s.yaml + changed_when: false + retries: 3 + delay: 15 + register: cilium_operator_rollout + until: cilium_operator_rollout.rc == 0 + +- name: Verify Cilium status + ansible.builtin.command: + cmd: kubectl -n kube-system exec ds/cilium -c cilium-agent -- cilium status --brief + environment: + KUBECONFIG: /etc/rancher/k3s/k3s.yaml + register: cilium_status + changed_when: false + retries: 5 + delay: 20 + until: cilium_status.rc == 0 + +- name: Display Cilium status + ansible.builtin.debug: + msg: "{{ cilium_status.stdout_lines }}" diff --git a/cloud-init/basic.yml b/cloud-init/basic.yml index e4d6dc37f..5d37707f6 100644 --- a/cloud-init/basic.yml +++ b/cloud-init/basic.yml @@ -1,26 +1,51 @@ #cloud-config +package_update: true +package_upgrade: true +package_reboot_if_required: true +packages: + - apt-transport-https + - ca-certificates + - curl + - gnupg-agent + - software-properties-common + - fail2ban users: - name: freecodecamp groups: - sudo - - docker shell: /bin/bash sudo: "ALL=(ALL) NOPASSWD:ALL" ssh_import_id: - gh:camperbot - - raisedadead + - gh:raisedadead +write_files: + - path: /etc/fail2ban/jail.local + content: | + [sshd] + enabled = true + port = ssh + 
filter = sshd + maxretry = 5 + bantime = 3600 + findtime = 600 + owner: root:root + permissions: "0644" + - path: /etc/ssh/sshd_config.d/99-hardening.conf + content: | + PermitRootLogin no + PasswordAuthentication no + PubkeyAuthentication yes + AllowUsers freecodecamp + owner: root:root + permissions: "0644" runcmd: - # Configure sshd - - | - sed -i -e '/^PermitRootLogin/s/^.*$/PermitRootLogin no/' /etc/ssh/sshd_config - sed -i -e '/^PasswordAuthentication/s/^.*$/PasswordAuthentication no/' /etc/ssh/sshd_config - sed -i -e '/^PubkeyAuthentication/s/^.*$/PubkeyAuthentication yes/' /etc/ssh/sshd_config - sed -i -e '$aAllowUsers freecodecamp' /etc/ssh/sshd_config + - systemctl enable fail2ban + - systemctl start fail2ban # :-----------------------: WARNING :-----------------------: # # This next line should be the last command in the list, # because it involves restarting the ssh service. # # :-----------------------: WARNING :-----------------------: - - systemctl restart sshd + - systemctl restart ssh || systemctl restart sshd || true final_message: "Setup complete" diff --git a/cloud-init/docker.yml b/cloud-init/docker.yml index ec3eeec04..0653cbedc 100644 --- a/cloud-init/docker.yml +++ b/cloud-init/docker.yml @@ -3,13 +3,13 @@ package_update: true package_upgrade: true package_reboot_if_required: true packages: - - unattended-upgrades - apt-transport-https - ca-certificates - curl - gnupg-agent - software-properties-common - lsb-release + - fail2ban users: - name: freecodecamp groups: @@ -19,9 +19,28 @@ users: sudo: "ALL=(ALL) NOPASSWD:ALL" ssh_import_id: - gh:camperbot - - raisedadead + - gh:raisedadead +write_files: + - path: /etc/fail2ban/jail.local + content: | + [sshd] + enabled = true + port = ssh + filter = sshd + maxretry = 5 + bantime = 3600 + findtime = 600 + owner: root:root + permissions: "0644" + - path: /etc/ssh/sshd_config.d/99-hardening.conf + content: | + PermitRootLogin no + PasswordAuthentication no + PubkeyAuthentication yes + AllowUsers 
freecodecamp + owner: root:root + permissions: "0644" runcmd: - # This will install docker on the virtual machine and add the freeCodeCamp user to docker usergroup - sudo mkdir -p /etc/apt/keyrings - curl -fsSL https://download.docker.com/linux/ubuntu/gpg | sudo gpg --dearmor -o /etc/apt/keyrings/docker.gpg @@ -35,4 +54,13 @@ runcmd: - sudo systemctl start docker - sudo systemctl enable docker - sudo usermod -aG docker freecodecamp + - systemctl enable fail2ban + - systemctl start fail2ban + # :-----------------------: WARNING :-----------------------: + # + # This next line should be the last command in the list, + # because it involves restarting the ssh service. + # + # :-----------------------: WARNING :-----------------------: + - systemctl restart ssh || systemctl restart sshd || true final_message: "Setup complete" diff --git a/justfile b/justfile new file mode 100644 index 000000000..389b53fce --- /dev/null +++ b/justfile @@ -0,0 +1,196 @@ +set shell := ["bash", "-cu"] + +secrets_dir := env("SECRETS_DIR", justfile_directory() + "/../infra-secrets") +crds_schema := 'https://raw.githubusercontent.com/datreeio/CRDs-catalog/main/{{.Group}}/{{.ResourceKind}}_{{.ResourceAPIVersion}}.json' + +# Show available recipes +default: + @just --list + +# --------------------------------------------------------------------------- +# Secrets (sops + age — stored in infra-secrets private repo) +# --------------------------------------------------------------------------- + +# View a decrypted secret (auto-detects format from extension) +[group('secrets')] +secret-view name: + #!/usr/bin/env bash + set -eu + FILE=$(find "{{secrets_dir}}/{{name}}" -name '*.enc' -type f | head -1) + [ -f "$FILE" ] || { echo "Error: no .enc file in {{secrets_dir}}/{{name}}/"; exit 1; } + case "$FILE" in + *.env.enc) sops -d --input-type dotenv --output-type dotenv "$FILE" ;; + *.yaml.enc|*.yml.enc) sops -d --input-type yaml --output-type yaml "$FILE" ;; + *) sops -d "$FILE" ;; + esac + +# Edit a 
secret in $EDITOR +[group('secrets')] +secret-edit name: + sops "{{secrets_dir}}/{{name}}/.env.enc" + +# Verify all encrypted secrets are readable +[group('secrets')] +secret-verify-all: + #!/usr/bin/env bash + set -eu + for f in $(find "{{secrets_dir}}" -name '*.enc' -type f | sort); do + echo -n "$f: " + case "$f" in + *.env.enc) sops -d --input-type dotenv --output-type dotenv "$f" > /dev/null 2>&1 ;; + *.yaml.enc|*.yml.enc) sops -d --input-type yaml --output-type yaml "$f" > /dev/null 2>&1 ;; + *) sops -d "$f" > /dev/null 2>&1 ;; + esac && echo "OK" || echo "FAILED" + done + +# --------------------------------------------------------------------------- +# K8s / K3s +# --------------------------------------------------------------------------- + +# Decrypt kubeconfig from infra-secrets (run once after clone) +[group('k3s')] +kubeconfig-sync cluster: + #!/usr/bin/env bash + set -eu + SRC="{{secrets_dir}}/k3s/{{cluster}}/kubeconfig.yaml.enc" + DST="k3s/{{cluster}}/.kubeconfig.yaml" + [ -f "$SRC" ] || { echo "Error: $SRC not found (cluster not yet bootstrapped?)"; exit 1; } + umask 077 + sops -d --input-type yaml --output-type yaml "$SRC" > "$DST" + chmod 600 "$DST" + echo "Synced kubeconfig → $DST" + +# Deploy app (decrypt secrets + TLS → kustomize apply → cleanup) +[group('k3s')] +deploy cluster app: + #!/usr/bin/env bash + set -eu + ENC_DIR="{{secrets_dir}}/k3s/{{cluster}}" + APP_SECRETS="k3s/{{cluster}}/apps/{{app}}/manifests/base/secrets" + CLEANUP="" + + if [ -f "$ENC_DIR/{{app}}.secrets.env.enc" ]; then + sops -d --input-type dotenv --output-type dotenv "$ENC_DIR/{{app}}.secrets.env.enc" > "$APP_SECRETS/.secrets.env" + CLEANUP="$APP_SECRETS/.secrets.env" + trap "rm -f $CLEANUP" EXIT + fi + if [ -f "$ENC_DIR/{{app}}.tls.crt.enc" ]; then + sops -d "$ENC_DIR/{{app}}.tls.crt.enc" > "$APP_SECRETS/tls.crt" + CLEANUP="$CLEANUP $APP_SECRETS/tls.crt" + trap "rm -f $CLEANUP" EXIT + fi + if [ -f "$ENC_DIR/{{app}}.tls.key.enc" ]; then + sops -d 
"$ENC_DIR/{{app}}.tls.key.enc" > "$APP_SECRETS/tls.key" + CLEANUP="$CLEANUP $APP_SECRETS/tls.key" + trap "rm -f $CLEANUP" EXIT + fi + + [ -n "$CLEANUP" ] || { echo "Error: no secrets found for {{app}} in $ENC_DIR"; exit 1; } + + cd k3s/{{cluster}} + export KUBECONFIG="$(pwd)/.kubeconfig.yaml" + kubectl apply -k apps/{{app}}/manifests/base/ + echo "Deployed {{app}} to {{cluster}}" + +# Install or upgrade a Helm chart (overlays secret values from infra-secrets if present) +[group('k3s')] +helm-upgrade cluster app: + #!/usr/bin/env bash + set -eu + cd k3s/{{cluster}} + export KUBECONFIG="$(pwd)/.kubeconfig.yaml" + CHART_DIR=$(find "apps/{{app}}/charts" -maxdepth 1 -mindepth 1 -type d | head -1) + [ -d "$CHART_DIR" ] || { echo "Error: no chart dir in apps/{{app}}/charts/"; exit 1; } + CHART_NAME=$(basename "$CHART_DIR") + VALUES="$CHART_DIR/values.yaml" + [ -f "$VALUES" ] || { echo "Error: $VALUES not found"; exit 1; } + REPO_FILE="$CHART_DIR/repo" + [ -f "$REPO_FILE" ] || { echo "Error: $REPO_FILE not found (one line: chart repo URL)"; exit 1; } + REPO_URL=$(cat "$REPO_FILE") + + HELM_ARGS="-f $VALUES" + CLEANUP="" + SECRET_VALUES="{{secrets_dir}}/k3s/{{cluster}}/{{app}}.values.yaml.enc" + if [ -f "$SECRET_VALUES" ]; then + TMPVALS=$(mktemp) + sops -d --input-type yaml --output-type yaml "$SECRET_VALUES" > "$TMPVALS" + HELM_ARGS="$HELM_ARGS -f $TMPVALS" + CLEANUP="$TMPVALS" + trap "rm -f $CLEANUP" EXIT + fi + + echo "Installing {{app}} (chart: $CHART_NAME) from $REPO_URL" + helm upgrade --install {{app}} "$CHART_NAME" \ + --repo "$REPO_URL" \ + -n {{app}} --create-namespace \ + $HELM_ARGS + +# Validate K8s manifests with kubeconform +[group('k3s')] +k8s-validate version="1.32.0": + kubeconform \ + -summary \ + -output text \ + -strict \ + -ignore-missing-schemas \ + -kubernetes-version {{version}} \ + -schema-location default \ + -schema-location '{{crds_schema}}' \ + -ignore-filename-pattern 'kustomization\.yaml' \ + -ignore-filename-pattern '\.kubeconfig\.yaml' \ + 
-ignore-filename-pattern 'values\.yaml' \ + -ignore-filename-pattern 'operator-values\.yaml' \ + -ignore-filename-pattern 'pnpm-lock\.yaml' \ + -ignore-filename-pattern 'pss-admission\.yaml' \ + -ignore-filename-pattern 'audit-policy\.yaml' \ + -ignore-filename-pattern '\.sample' \ + -ignore-filename-pattern 'node_modules' \ + -ignore-filename-pattern '\.json' \ + -ignore-filename-pattern 'dashboards/' \ + k3s/ k8s/ + +# --------------------------------------------------------------------------- +# Ansible +# --------------------------------------------------------------------------- + +# Run any ansible playbook (logs to ansible/.ansible/logs/) +[group('ansible')] +[positional-arguments] +play playbook host *args: + #!/usr/bin/env bash + set -eu + mkdir -p ansible/.ansible/logs + LOGFILE="$(pwd)/ansible/.ansible/logs/$(date +%Y%m%d-%H%M%S)-{{playbook}}.log" + cd ansible && uv run ansible-playbook -i inventory/digitalocean.yml play-{{playbook}}.yml \ + -e variable_host={{host}} {{args}} 2>&1 | tee "$LOGFILE" + echo "Log: $LOGFILE" + +# Install ansible dependencies +[group('ansible')] +ansible-install: + cd ansible && uv sync && uv run ansible-galaxy install -r requirements.yml + +# --------------------------------------------------------------------------- +# Terraform +# --------------------------------------------------------------------------- + +# Run terraform on one or all workspaces +[group('terraform')] +tf cmd workspace="all": + #!/usr/bin/env bash + set -eu + if [ "{{workspace}}" = "all" ]; then + for ws in $(find terraform -name ".terraform.lock.hcl" -exec dirname {} \; | sort); do + echo "==> $ws: terraform {{cmd}}" + terraform -chdir=$ws {{cmd}} + done + else + ws="terraform/{{workspace}}" + [ -d "$ws" ] || { echo "Error: $ws not found"; exit 1; } + terraform -chdir=$ws {{cmd}} + fi + +# List terraform workspaces +[group('terraform')] +tf-list: + @find terraform -name ".terraform.lock.hcl" -exec dirname {} \; | sort diff --git a/k3s/README.md 
b/k3s/README.md index 1eefa6a18..292e57b9e 100644 --- a/k3s/README.md +++ b/k3s/README.md @@ -4,15 +4,19 @@ Self-hosted k3s clusters on DigitalOcean. ## Clusters -| Cluster | Purpose | Apps | -| -------------------- | -------------- | ----------------- | -| ops-backoffice-tools | Internal tools | Appsmith, Outline | +| Cluster | Purpose | Apps | +| -------------------- | ----------------- | --------------------- | +| ops-backoffice-tools | Internal tools | Appsmith, Outline | +| gxy-management | Universe platform | Windmill, ArgoCD, Zot | ## Quick Access ```bash # Tools cluster cd k3s/ops-backoffice-tools && export KUBECONFIG=$(pwd)/.kubeconfig.yaml + +# Galaxy management cluster +cd k3s/gxy-management && export KUBECONFIG=$(pwd)/.kubeconfig.yaml ``` ## Structure @@ -20,6 +24,14 @@ cd k3s/ops-backoffice-tools && export KUBECONFIG=$(pwd)/.kubeconfig.yaml ``` k3s/ ├── archive/ # Archived configs (historical reference) +├── gxy-management/ +│ ├── apps/ +│ │ ├── argocd/ +│ │ ├── windmill/ +│ │ └── zot/ +│ └── cluster/ +│ ├── cilium/ +│ └── security/ ├── ops-backoffice-tools/ │ ├── apps/ │ │ ├── appsmith/ @@ -45,9 +57,10 @@ k3s/ ### Droplets -| Cluster | Name Pattern | Count | Specs | Tags | -| ------- | ------------------------ | ----- | ------------------ | -------------- | -| tools | ops-vm-tools-k3s-nyc3-0X | 3 | 4 vCPU, 8GB, 160GB | k3s, tools_k3s | +| Cluster | Name Pattern | Count | Specs | Tags | +| -------------- | --------------------------- | ----- | ------------------- | ------------------- | +| tools | ops-vm-tools-k3s-nyc3-0X | 3 | 4 vCPU, 8GB, 160GB | k3s, tools_k3s | +| gxy-management | ops-vm-gxy-mgmt-k3s-fra1-0X | 3 | 8 vCPU, 16GB, 320GB | k3s, \_gxy-mgmt-k3s | ### Load Balancer @@ -60,15 +73,14 @@ k3s/ ## Ansible Deployment ```bash -cd ansible +# Deploy tools cluster +just play k3s--cluster tools_k3s -# Deploy cluster -uv run ansible-playbook -i inventory/digitalocean.yml play-k3s--cluster.yml \ - -e variable_host=tools_k3s +# Longhorn storage (tools) 
+just play k3s--longhorn tools_k3s -# Longhorn storage -uv run ansible-playbook -i inventory/digitalocean.yml play-k3s--longhorn.yml \ - -e variable_host=tools_k3s +# Deploy gxy-management galaxy (decrypts vault vars automatically) +just play k3s--bootstrap gxy_mgmt_k3s ``` --- @@ -97,10 +109,13 @@ See `tailscale/README.md` (repo root) for device inventory. ## DNS (Cloudflare) -| Record | Type | Value | -| ------------------------- | ---- | -------- | -| appsmith.freecodecamp.net | A | tools LB | -| outline.freecodecamp.net | A | tools LB | +| Record | Type | Value | +| ------------------------- | ---- | ----------------------- | +| appsmith.freecodecamp.net | A | tools LB | +| outline.freecodecamp.net | A | tools LB | +| windmill.freecodecamp.net | A | gxy-management node IPs | +| argocd.freecodecamp.net | A | gxy-management node IPs | +| registry.freecodecamp.net | A | gxy-management node IPs | --- @@ -115,7 +130,7 @@ kubectl port-forward -n longhorn-system svc/longhorn-frontend 8080:80 ### Update Apps ```bash -kubectl apply -k apps/<app>/manifests/base/ +just deploy <cluster> <app> ``` --- @@ -142,11 +157,30 @@ Internet → Cloudflare → DO LB → Traefik (NodePort) → Gateway API → App | Appsmith | 1 | 10Gi | Embedded | | Outline | 1 | 10Gi + 10Gi | PostgreSQL sidecar | +### gxy-management + +``` +Internet → Cloudflare → Node Public IPs → Traefik (ServiceLB) → Gateway API → Apps + (Access) │ + ┌─────────────┼─────────────┐ + ↓ ↓ ↓ + Windmill ArgoCD Zot + +CNI: Cilium Storage: local-path SSH/kubectl: Tailscale +``` + +| App | Replicas | Access | Notes | +| -------- | ------------------- | ----------------- | ---------- | +| Windmill | 1 server, 2 workers | Cloudflare Access | | +| ArgoCD | 1 (single replica) | Cloudflare Access | | +| Zot | 1 (single replica) | Cloudflare Access | S3 backend | + --- ## Playbooks Reference -| Playbook | Purpose | -| ---------------------- | ------------------------ | -| play-k3s--cluster.yml | Deploy k3s HA cluster | -| play-k3s--longhorn.yml | Install 
Longhorn storage | +| Playbook | Purpose | +| ----------------------- | ----------------------------------------------------- | +| play-k3s--cluster.yml | Deploy k3s HA cluster | +| play-k3s--longhorn.yml | Install Longhorn storage | +| play-k3s--bootstrap.yml | Deploy any Universe galaxy (K3s + Cilium + Tailscale) | diff --git a/k3s/gxy-management/.envrc b/k3s/gxy-management/.envrc new file mode 100644 index 000000000..da3847029 --- /dev/null +++ b/k3s/gxy-management/.envrc @@ -0,0 +1,9 @@ +source_env ../../.envrc + +if [ -d "$SECRETS_DIR" ]; then + use_sops "$SECRETS_DIR/do-universe/.env.enc" +fi + +export KUBECONFIG="$(expand_path .kubeconfig.yaml)" + +dotenv_if_exists .env diff --git a/k3s/gxy-management/.gitignore b/k3s/gxy-management/.gitignore new file mode 100644 index 000000000..7220cce0a --- /dev/null +++ b/k3s/gxy-management/.gitignore @@ -0,0 +1,2 @@ +# Decrypted secrets (temporary, generated by just deploy) +apps/*/manifests/base/secrets/tls.yaml diff --git a/k3s/gxy-management/README.md b/k3s/gxy-management/README.md new file mode 100644 index 000000000..c419a3b39 --- /dev/null +++ b/k3s/gxy-management/README.md @@ -0,0 +1,84 @@ +# gxy-management + +First Universe galaxy. Control plane brain — manages all galaxies. 
+ +## Specifications + +- **Nodes**: 3× DigitalOcean s-8vcpu-16gb (FRA1) +- **CNI**: Cilium (eBPF, Hubble observability) +- **Pod CIDR**: 10.1.0.0/16 +- **Service CIDR**: 10.11.0.0/16 +- **Storage**: local-path (K3s default) +- **Ingress**: Traefik (Day 0), Cilium Gateway API (target) + +## Applications + +| App | Purpose | Access | +| -------- | --------------------- | ----------------------------------------- | +| Windmill | Workflow engine | windmill.freecodecamp.net (all staff) | +| ArgoCD | GitOps (all galaxies) | argocd.freecodecamp.net (platform team) | +| Zot | Container registry | registry.freecodecamp.net (platform team) | + +## Quick Access + +```bash +cd k3s/gxy-management # direnv loads KUBECONFIG + DO_API_TOKEN +kubectl get nodes +``` + +## Deploy + +```bash +just play k3s--bootstrap gxy_mgmt_k3s +``` + +## Deployment Runbook + +### Pre-deployment (ClickOps) + +1. Create 3x DO droplets (s-8vcpu-16gb) in FRA1 -- attach to VPC, configure firewall (80, 443, 6443 from VPC, 22 from Tailscale) +2. Create DO Spaces bucket `net.freecodecamp.universe-backups` in FRA1 (etcd snapshots) +3. Create DO Spaces bucket `net.freecodecamp.universe-registry` in FRA1 (Zot images) +4. Install Tailscale: `just play tailscale--0-install gxy_mgmt_k3s` then `just play tailscale--1b-up-with-ssh gxy_mgmt_k3s` +5. Create Cloudflare origin certificate for `*.freecodecamp.net` (15-year, RSA) +6. Populate app secrets in infra-secrets repo (see samples in each app directory) + +### K3s Bootstrap + +```bash +just play k3s--bootstrap gxy_mgmt_k3s +``` + +Deploys k3s HA cluster with Cilium CNI, Traefik ingress, etcd S3 backups, and fetches kubeconfig. + +### Helm Installations + +After playbook completes: + +```bash +just helm-upgrade gxy-management argocd +just helm-upgrade gxy-management windmill +just helm-upgrade gxy-management zot +``` + +Release names match the app directory names. The recipe reads the chart repo URL from `charts/<chart>/repo` and the values from `charts/<chart>/values.yaml`. 
+ +### App Secrets and Manifests + +```bash +just deploy gxy-management argocd +just deploy gxy-management windmill +just deploy gxy-management zot +``` + +### Post-deployment (ClickOps) + +1. Create DNS A records (proxied) for windmill/argocd/registry.freecodecamp.net pointing to all 3 node public IPs +2. Create Cloudflare Access policies for each service + +### Smoke Tests + +1. `kubectl get nodes` -- all 3 Ready +2. `cilium status` -- all green +3. `curl -H "Host: windmill.freecodecamp.net" https://<node-public-ip> -k` +4. Verify Cloudflare Access gate diff --git a/k3s/gxy-management/apps/argocd/charts/argo-cd/repo b/k3s/gxy-management/apps/argocd/charts/argo-cd/repo new file mode 100644 index 000000000..fba634c54 --- /dev/null +++ b/k3s/gxy-management/apps/argocd/charts/argo-cd/repo @@ -0,0 +1 @@ +https://argoproj.github.io/argo-helm diff --git a/k3s/gxy-management/apps/argocd/charts/argo-cd/values.yaml b/k3s/gxy-management/apps/argocd/charts/argo-cd/values.yaml new file mode 100644 index 000000000..d6b16ee27 --- /dev/null +++ b/k3s/gxy-management/apps/argocd/charts/argo-cd/values.yaml @@ -0,0 +1,60 @@ +# Argo CD Helm values for gxy-management cluster +# Chart: argo-cd (https://argoproj.github.io/argo-helm) +# Non-HA, Cloudflare Access-gated (platform team only) + +# -- Single-replica (non-HA) deployment for 16GB nodes +controller: + replicas: 1 + resources: + requests: + cpu: 250m + memory: 256Mi + limits: + cpu: 500m + memory: 512Mi + +server: + replicas: 1 + resources: + requests: + cpu: 50m + memory: 64Mi + limits: + cpu: 200m + memory: 256Mi + +repoServer: + replicas: 1 + resources: + requests: + cpu: 50m + memory: 64Mi + limits: + cpu: 200m + memory: 256Mi + +redis: + resources: + requests: + cpu: 50m + memory: 64Mi + limits: + cpu: 200m + memory: 128Mi + +# -- Disable DEX (external SSO not needed) +dex: + enabled: false + +# -- Disable notifications controller (not needed) +notifications: + enabled: false + +# -- Disable ApplicationSet controller (not needed initially) 
+applicationSet: + enabled: false + +configs: + params: + # TLS terminated at Traefik (Cloudflare origin cert) + "server.insecure": true diff --git a/k3s/gxy-management/apps/argocd/manifests/base/gateway.yaml b/k3s/gxy-management/apps/argocd/manifests/base/gateway.yaml new file mode 100644 index 000000000..5c6257384 --- /dev/null +++ b/k3s/gxy-management/apps/argocd/manifests/base/gateway.yaml @@ -0,0 +1,26 @@ +apiVersion: gateway.networking.k8s.io/v1 +kind: Gateway +metadata: + name: argocd-gateway + namespace: argocd +spec: + gatewayClassName: traefik + listeners: + - name: websecure + protocol: HTTPS + port: 8443 + hostname: argocd.freecodecamp.net + tls: + mode: Terminate + certificateRefs: + - name: argocd-tls-cloudflare + allowedRoutes: + namespaces: + from: Same + - name: web + protocol: HTTP + port: 8000 + hostname: argocd.freecodecamp.net + allowedRoutes: + namespaces: + from: Same diff --git a/k3s/gxy-management/apps/argocd/manifests/base/httproutes.yaml b/k3s/gxy-management/apps/argocd/manifests/base/httproutes.yaml new file mode 100644 index 000000000..6759e2500 --- /dev/null +++ b/k3s/gxy-management/apps/argocd/manifests/base/httproutes.yaml @@ -0,0 +1,75 @@ +--- +apiVersion: traefik.io/v1alpha1 +kind: Middleware +metadata: + name: secure-headers + namespace: argocd +spec: + headers: + customRequestHeaders: + X-Forwarded-Proto: "https" + +--- +apiVersion: traefik.io/v1alpha1 +kind: Middleware +metadata: + name: redirect-https + namespace: argocd +spec: + redirectScheme: + scheme: https + permanent: true + +--- +# HTTP to HTTPS redirect +apiVersion: gateway.networking.k8s.io/v1 +kind: HTTPRoute +metadata: + name: http-redirect + namespace: argocd +spec: + parentRefs: + - name: argocd-gateway + namespace: argocd + sectionName: web + hostnames: + - argocd.freecodecamp.net + rules: + - filters: + - type: ExtensionRef + extensionRef: + group: traefik.io + kind: Middleware + name: redirect-https + backendRefs: + - name: argocd-server + port: 80 + +--- +# 
Main ArgoCD route +apiVersion: gateway.networking.k8s.io/v1 +kind: HTTPRoute +metadata: + name: argocd-route + namespace: argocd +spec: + parentRefs: + - name: argocd-gateway + namespace: argocd + sectionName: websecure + hostnames: + - argocd.freecodecamp.net + rules: + - matches: + - path: + type: PathPrefix + value: / + filters: + - type: ExtensionRef + extensionRef: + group: traefik.io + kind: Middleware + name: secure-headers + backendRefs: + - name: argocd-server + port: 80 diff --git a/k3s/gxy-management/apps/argocd/manifests/base/kustomization.yaml b/k3s/gxy-management/apps/argocd/manifests/base/kustomization.yaml new file mode 100644 index 000000000..e8cab94e3 --- /dev/null +++ b/k3s/gxy-management/apps/argocd/manifests/base/kustomization.yaml @@ -0,0 +1,23 @@ +apiVersion: kustomize.config.k8s.io/v1beta1 +kind: Kustomization +namespace: argocd + +resources: + - namespace.yaml + - gateway.yaml + - httproutes.yaml + +secretGenerator: + - name: argocd-secrets + type: Opaque + envs: + - secrets/.secrets.env + options: + disableNameSuffixHash: true + - name: argocd-tls-cloudflare + type: kubernetes.io/tls + files: + - tls.crt=secrets/tls.crt + - tls.key=secrets/tls.key + options: + disableNameSuffixHash: true diff --git a/k3s/gxy-management/apps/argocd/manifests/base/namespace.yaml b/k3s/gxy-management/apps/argocd/manifests/base/namespace.yaml new file mode 100644 index 000000000..a040f2ba5 --- /dev/null +++ b/k3s/gxy-management/apps/argocd/manifests/base/namespace.yaml @@ -0,0 +1,4 @@ +apiVersion: v1 +kind: Namespace +metadata: + name: argocd diff --git a/k3s/gxy-management/apps/argocd/manifests/base/secrets/.gitignore b/k3s/gxy-management/apps/argocd/manifests/base/secrets/.gitignore new file mode 100644 index 000000000..d05f4f5da --- /dev/null +++ b/k3s/gxy-management/apps/argocd/manifests/base/secrets/.gitignore @@ -0,0 +1,3 @@ +.secrets.env +tls.crt +tls.key diff --git a/k3s/gxy-management/apps/argocd/manifests/base/secrets/tls.yaml.sample 
b/k3s/gxy-management/apps/argocd/manifests/base/secrets/tls.yaml.sample new file mode 100644 index 000000000..9674a94b9 --- /dev/null +++ b/k3s/gxy-management/apps/argocd/manifests/base/secrets/tls.yaml.sample @@ -0,0 +1,11 @@ +apiVersion: v1 +kind: Secret +metadata: + name: argocd-tls-cloudflare + namespace: argocd +type: kubernetes.io/tls +data: + # Base64-encoded Cloudflare origin certificate (*.freecodecamp.net) + tls.crt: + # Base64-encoded private key + tls.key: diff --git a/k3s/gxy-management/apps/windmill/charts/windmill/repo b/k3s/gxy-management/apps/windmill/charts/windmill/repo new file mode 100644 index 000000000..9ff3dfef4 --- /dev/null +++ b/k3s/gxy-management/apps/windmill/charts/windmill/repo @@ -0,0 +1 @@ +https://windmill-labs.github.io/windmill-helm-charts/ diff --git a/k3s/gxy-management/apps/windmill/charts/windmill/values.yaml b/k3s/gxy-management/apps/windmill/charts/windmill/values.yaml new file mode 100644 index 000000000..5d9e5c034 --- /dev/null +++ b/k3s/gxy-management/apps/windmill/charts/windmill/values.yaml @@ -0,0 +1,73 @@ +# Windmill Helm chart values +# Chart: windmill/windmill +# Repo: https://windmill-labs.github.io/windmill-helm-charts/ +# Source: https://github.com/windmill-labs/windmill-helm-charts + +windmill: + baseDomain: windmill.freecodecamp.net + baseProtocol: https + appReplicas: 1 + extraReplicas: 1 + # databaseUrl set via secret values overlay (infra-secrets) + + app: + resources: + requests: + memory: "512Mi" + limits: + memory: "2Gi" + + workerGroups: + - name: "default" + controller: "Deployment" + replicas: 2 + privileged: true + podSecurityContext: + runAsUser: 0 + runAsNonRoot: false + resources: + requests: + memory: "512Mi" + limits: + memory: "2Gi" + + - name: "native" + controller: "Deployment" + replicas: 1 + privileged: false + podSecurityContext: + runAsUser: 0 + runAsNonRoot: false + resources: + requests: + memory: "256Mi" + limits: + memory: "1Gi" + extraEnv: + - name: "NATIVE_MODE" + value: "true" + - 
name: "SLEEP_QUEUE" + value: "200" + + indexer: + enabled: true + resources: + requests: + memory: "256Mi" + limits: + memory: "2Gi" + +postgresql: + enabled: true + auth: + # Credentials set via secret values overlay (infra-secrets) + database: windmill + persistence: + enabled: true + size: 10Gi + +ingress: + enabled: false + +enterprise: + enabled: false diff --git a/k3s/gxy-management/apps/windmill/manifests/base/gateway.yaml b/k3s/gxy-management/apps/windmill/manifests/base/gateway.yaml new file mode 100644 index 000000000..cb80f00f0 --- /dev/null +++ b/k3s/gxy-management/apps/windmill/manifests/base/gateway.yaml @@ -0,0 +1,26 @@ +apiVersion: gateway.networking.k8s.io/v1 +kind: Gateway +metadata: + name: windmill-gateway + namespace: windmill +spec: + gatewayClassName: traefik + listeners: + - name: websecure + protocol: HTTPS + port: 8443 + hostname: windmill.freecodecamp.net + tls: + mode: Terminate + certificateRefs: + - name: windmill-tls-cloudflare + allowedRoutes: + namespaces: + from: Same + - name: web + protocol: HTTP + port: 8000 + hostname: windmill.freecodecamp.net + allowedRoutes: + namespaces: + from: Same diff --git a/k3s/gxy-management/apps/windmill/manifests/base/httproutes.yaml b/k3s/gxy-management/apps/windmill/manifests/base/httproutes.yaml new file mode 100644 index 000000000..30891ff4e --- /dev/null +++ b/k3s/gxy-management/apps/windmill/manifests/base/httproutes.yaml @@ -0,0 +1,75 @@ +--- +apiVersion: traefik.io/v1alpha1 +kind: Middleware +metadata: + name: secure-headers + namespace: windmill +spec: + headers: + customRequestHeaders: + X-Forwarded-Proto: "https" + +--- +apiVersion: traefik.io/v1alpha1 +kind: Middleware +metadata: + name: redirect-https + namespace: windmill +spec: + redirectScheme: + scheme: https + permanent: true + +--- +# HTTP to HTTPS redirect +apiVersion: gateway.networking.k8s.io/v1 +kind: HTTPRoute +metadata: + name: http-redirect + namespace: windmill +spec: + parentRefs: + - name: windmill-gateway + namespace: 
windmill + sectionName: web + hostnames: + - windmill.freecodecamp.net + rules: + - filters: + - type: ExtensionRef + extensionRef: + group: traefik.io + kind: Middleware + name: redirect-https + backendRefs: + - name: windmill-app + port: 8000 + +--- +# Main Windmill route +apiVersion: gateway.networking.k8s.io/v1 +kind: HTTPRoute +metadata: + name: windmill-route + namespace: windmill +spec: + parentRefs: + - name: windmill-gateway + namespace: windmill + sectionName: websecure + hostnames: + - windmill.freecodecamp.net + rules: + - matches: + - path: + type: PathPrefix + value: / + filters: + - type: ExtensionRef + extensionRef: + group: traefik.io + kind: Middleware + name: secure-headers + backendRefs: + - name: windmill-app + port: 8000 diff --git a/k3s/gxy-management/apps/windmill/manifests/base/kustomization.yaml b/k3s/gxy-management/apps/windmill/manifests/base/kustomization.yaml new file mode 100644 index 000000000..93ac095f6 --- /dev/null +++ b/k3s/gxy-management/apps/windmill/manifests/base/kustomization.yaml @@ -0,0 +1,17 @@ +apiVersion: kustomize.config.k8s.io/v1beta1 +kind: Kustomization +namespace: windmill + +resources: + - namespace.yaml + - gateway.yaml + - httproutes.yaml + +secretGenerator: + - name: windmill-tls-cloudflare + type: kubernetes.io/tls + files: + - tls.crt=secrets/tls.crt + - tls.key=secrets/tls.key + options: + disableNameSuffixHash: true diff --git a/k3s/gxy-management/apps/windmill/manifests/base/namespace.yaml b/k3s/gxy-management/apps/windmill/manifests/base/namespace.yaml new file mode 100644 index 000000000..5e1d9ca40 --- /dev/null +++ b/k3s/gxy-management/apps/windmill/manifests/base/namespace.yaml @@ -0,0 +1,4 @@ +apiVersion: v1 +kind: Namespace +metadata: + name: windmill diff --git a/k3s/gxy-management/apps/windmill/manifests/base/secrets/.gitignore b/k3s/gxy-management/apps/windmill/manifests/base/secrets/.gitignore new file mode 100644 index 000000000..d05f4f5da --- /dev/null +++ 
b/k3s/gxy-management/apps/windmill/manifests/base/secrets/.gitignore @@ -0,0 +1,3 @@ +.secrets.env +tls.crt +tls.key diff --git a/k3s/gxy-management/apps/windmill/manifests/base/secrets/tls.yaml.sample b/k3s/gxy-management/apps/windmill/manifests/base/secrets/tls.yaml.sample new file mode 100644 index 000000000..bfa5bc863 --- /dev/null +++ b/k3s/gxy-management/apps/windmill/manifests/base/secrets/tls.yaml.sample @@ -0,0 +1,11 @@ +apiVersion: v1 +kind: Secret +metadata: + name: windmill-tls-cloudflare + namespace: windmill +type: kubernetes.io/tls +data: + # Base64-encoded Cloudflare origin certificate (*.freecodecamp.net) + tls.crt: + # Base64-encoded private key + tls.key: diff --git a/k3s/gxy-management/apps/zot/charts/zot/repo b/k3s/gxy-management/apps/zot/charts/zot/repo new file mode 100644 index 000000000..6329dc84a --- /dev/null +++ b/k3s/gxy-management/apps/zot/charts/zot/repo @@ -0,0 +1 @@ +https://zotregistry.dev/helm-charts/ diff --git a/k3s/gxy-management/apps/zot/charts/zot/values.yaml b/k3s/gxy-management/apps/zot/charts/zot/values.yaml new file mode 100644 index 000000000..3deda55b8 --- /dev/null +++ b/k3s/gxy-management/apps/zot/charts/zot/values.yaml @@ -0,0 +1,90 @@ +# Zot OCI Registry Helm values for gxy-management cluster +# Chart: zot (https://github.com/project-zot/helm-charts) +# Chart version: 0.1.104, Image: v2.1.15 +# Cloudflare Access-gated (platform team + Woodpecker CI) + +replicaCount: 1 + +image: + repository: ghcr.io/project-zot/zot + pullPolicy: IfNotPresent + tag: "v2.1.15" + +service: + type: ClusterIP + port: 5000 + +# Routing managed via Gateway API (see manifests/base/gateway.yaml) +ingress: + enabled: false + +# Mount config.json from ConfigMap +mountConfig: true +configFiles: + config.json: |- + { + "storage": { + "rootDirectory": "/var/lib/registry", + "storageDriver": { + "name": "s3", + "regionendpoint": "https://fra1.digitaloceanspaces.com", + "region": "fra1", + "bucket": "net.freecodecamp.universe-registry", + "secure": 
true, + "skipverify": false, + "forcepathstyle": true + } + }, + "http": { + "address": "0.0.0.0", + "port": "5000", + "auth": { + "htpasswd": { + "path": "/secret/htpasswd" + } + } + }, + "log": { + "level": "info" + } + } + +# Mount htpasswd from Secret (managed via kustomize secretGenerator) +mountSecret: true +secretFiles: + # Placeholder — actual htpasswd managed via kustomize secrets/.secrets.env + # Generate entries with: htpasswd -nbBC 10 <user> <password> + htpasswd: "" + +# S3 credentials injected as environment variables +# The S3 driver picks up AWS_ACCESS_KEY_ID and AWS_SECRET_ACCESS_KEY +# automatically when accesskey/secretkey are omitted from config.json +env: + - name: AWS_ACCESS_KEY_ID + valueFrom: + secretKeyRef: + name: zot-secrets + key: S3_ACCESS_KEY + - name: AWS_SECRET_ACCESS_KEY + valueFrom: + secretKeyRef: + name: zot-secrets + key: S3_SECRET_KEY + +# Local PVC for registry cache +persistence: true +pvc: + create: true + accessModes: + - ReadWriteOnce + storage: 8Gi + storageClassName: local-path + +# Probes +httpGet: + scheme: HTTP + port: 5000 +startupProbe: + initialDelaySeconds: 5 + periodSeconds: 10 + failureThreshold: 3 diff --git a/k3s/gxy-management/apps/zot/manifests/base/gateway.yaml b/k3s/gxy-management/apps/zot/manifests/base/gateway.yaml new file mode 100644 index 000000000..1ddce28ac --- /dev/null +++ b/k3s/gxy-management/apps/zot/manifests/base/gateway.yaml @@ -0,0 +1,26 @@ +apiVersion: gateway.networking.k8s.io/v1 +kind: Gateway +metadata: + name: zot-gateway + namespace: zot +spec: + gatewayClassName: traefik + listeners: + - name: websecure + protocol: HTTPS + port: 8443 + hostname: registry.freecodecamp.net + tls: + mode: Terminate + certificateRefs: + - name: zot-tls-cloudflare + allowedRoutes: + namespaces: + from: Same + - name: web + protocol: HTTP + port: 8000 + hostname: registry.freecodecamp.net + allowedRoutes: + namespaces: + from: Same diff --git a/k3s/gxy-management/apps/zot/manifests/base/httproutes.yaml
b/k3s/gxy-management/apps/zot/manifests/base/httproutes.yaml new file mode 100644 index 000000000..e8f0b32e2 --- /dev/null +++ b/k3s/gxy-management/apps/zot/manifests/base/httproutes.yaml @@ -0,0 +1,75 @@ +--- +apiVersion: traefik.io/v1alpha1 +kind: Middleware +metadata: + name: secure-headers + namespace: zot +spec: + headers: + customRequestHeaders: + X-Forwarded-Proto: "https" + +--- +apiVersion: traefik.io/v1alpha1 +kind: Middleware +metadata: + name: redirect-https + namespace: zot +spec: + redirectScheme: + scheme: https + permanent: true + +--- +# HTTP to HTTPS redirect +apiVersion: gateway.networking.k8s.io/v1 +kind: HTTPRoute +metadata: + name: http-redirect + namespace: zot +spec: + parentRefs: + - name: zot-gateway + namespace: zot + sectionName: web + hostnames: + - registry.freecodecamp.net + rules: + - filters: + - type: ExtensionRef + extensionRef: + group: traefik.io + kind: Middleware + name: redirect-https + backendRefs: + - name: zot + port: 5000 + +--- +# Main Zot route +apiVersion: gateway.networking.k8s.io/v1 +kind: HTTPRoute +metadata: + name: zot-route + namespace: zot +spec: + parentRefs: + - name: zot-gateway + namespace: zot + sectionName: websecure + hostnames: + - registry.freecodecamp.net + rules: + - matches: + - path: + type: PathPrefix + value: / + filters: + - type: ExtensionRef + extensionRef: + group: traefik.io + kind: Middleware + name: secure-headers + backendRefs: + - name: zot + port: 5000 diff --git a/k3s/gxy-management/apps/zot/manifests/base/kustomization.yaml b/k3s/gxy-management/apps/zot/manifests/base/kustomization.yaml new file mode 100644 index 000000000..69a92e59a --- /dev/null +++ b/k3s/gxy-management/apps/zot/manifests/base/kustomization.yaml @@ -0,0 +1,23 @@ +apiVersion: kustomize.config.k8s.io/v1beta1 +kind: Kustomization +namespace: zot + +resources: + - namespace.yaml + - gateway.yaml + - httproutes.yaml + +secretGenerator: + - name: zot-secrets + type: Opaque + envs: + - secrets/.secrets.env + options: + 
disableNameSuffixHash: true + - name: zot-tls-cloudflare + type: kubernetes.io/tls + files: + - tls.crt=secrets/tls.crt + - tls.key=secrets/tls.key + options: + disableNameSuffixHash: true diff --git a/k3s/gxy-management/apps/zot/manifests/base/namespace.yaml b/k3s/gxy-management/apps/zot/manifests/base/namespace.yaml new file mode 100644 index 000000000..017091887 --- /dev/null +++ b/k3s/gxy-management/apps/zot/manifests/base/namespace.yaml @@ -0,0 +1,4 @@ +apiVersion: v1 +kind: Namespace +metadata: + name: zot diff --git a/k3s/gxy-management/apps/zot/manifests/base/secrets/.gitignore b/k3s/gxy-management/apps/zot/manifests/base/secrets/.gitignore new file mode 100644 index 000000000..d05f4f5da --- /dev/null +++ b/k3s/gxy-management/apps/zot/manifests/base/secrets/.gitignore @@ -0,0 +1,3 @@ +.secrets.env +tls.crt +tls.key diff --git a/k3s/gxy-management/apps/zot/manifests/base/secrets/tls.yaml.sample b/k3s/gxy-management/apps/zot/manifests/base/secrets/tls.yaml.sample new file mode 100644 index 000000000..e017111c4 --- /dev/null +++ b/k3s/gxy-management/apps/zot/manifests/base/secrets/tls.yaml.sample @@ -0,0 +1,11 @@ +apiVersion: v1 +kind: Secret +metadata: + name: zot-tls-cloudflare + namespace: zot +type: kubernetes.io/tls +data: + # Base64-encoded Cloudflare origin certificate (*.freecodecamp.net) + tls.crt: + # Base64-encoded private key + tls.key: diff --git a/k3s/gxy-management/cluster/cilium/values.yaml b/k3s/gxy-management/cluster/cilium/values.yaml new file mode 100644 index 000000000..7b5242a24 --- /dev/null +++ b/k3s/gxy-management/cluster/cilium/values.yaml @@ -0,0 +1,34 @@ +# cluster.name, cluster.id, k8sServiceHost set at deploy time via Ansible --set flags +k8sServicePort: "6443" + +kubeProxyReplacement: false +devices: [eth0, eth1] +mtu: 1500 + +ipam: + operator: + clusterPoolIPv4PodCIDRList: + - "10.1.0.0/16" + +operator: + replicas: 1 + +hubble: + enabled: true + tls: + auto: + method: cronJob + relay: + enabled: true + ui: + enabled: false + 
+gatewayAPI: + enabled: false + +resources: + requests: + cpu: 100m + memory: 256Mi + limits: + memory: 512Mi diff --git a/k3s/gxy-management/cluster/security/audit-policy.yaml b/k3s/gxy-management/cluster/security/audit-policy.yaml new file mode 100644 index 000000000..7243d0578 --- /dev/null +++ b/k3s/gxy-management/cluster/security/audit-policy.yaml @@ -0,0 +1,38 @@ +# Kubernetes API audit policy +# Copied to /etc/rancher/k3s/audit-policy.yaml by Ansible +# +# Phase 1: minimal — log secret access and anonymous requests only +# Phase 2: expand to full request/response logging for sensitive resources +apiVersion: audit.k8s.io/v1 +kind: Policy +rules: + # Log secret read/write at Metadata level + - level: Metadata + resources: + - group: "" + resources: ["secrets"] + + # Log anonymous/unauthenticated requests + - level: Metadata + users: ["system:anonymous"] + + # Log RBAC changes + - level: Metadata + resources: + - group: "rbac.authorization.k8s.io" + resources: + ["clusterroles", "clusterrolebindings", "roles", "rolebindings"] + + # Skip noisy read-only system requests + - level: None + users: ["system:kube-proxy"] + - level: None + resources: + - group: "" + resources: ["endpoints", "services", "services/status"] + verbs: ["get", "watch", "list"] + + # Default: log everything else at Metadata + - level: Metadata + omitStages: + - "RequestReceived" diff --git a/k3s/gxy-management/cluster/security/pss-admission.yaml b/k3s/gxy-management/cluster/security/pss-admission.yaml new file mode 100644 index 000000000..4f02cc90a --- /dev/null +++ b/k3s/gxy-management/cluster/security/pss-admission.yaml @@ -0,0 +1,26 @@ +# Pod Security Standards admission configuration +# Copied to /etc/rancher/k3s/pss-admission.yaml by Ansible +# +# - baseline: enforced (blocks privileged containers, host networking, hostPath) +# - restricted: audit + warn only (logs violations, does not block) +# - System namespaces exempted (Windmill workers and Tailscale need elevated privileges; +# 
Cilium installs into kube-system which is already exempted) +apiVersion: apiserver.config.k8s.io/v1 +kind: AdmissionConfiguration +plugins: + - name: PodSecurity + configuration: + apiVersion: pod-security.admission.config.k8s.io/v1 + kind: PodSecurityConfiguration + defaults: + enforce: "baseline" + enforce-version: "latest" + audit: "restricted" + audit-version: "latest" + warn: "restricted" + warn-version: "latest" + exemptions: + namespaces: + - kube-system + - windmill + - tailscale diff --git a/k3s/gxy-management/cluster/traefik-config.yaml b/k3s/gxy-management/cluster/traefik-config.yaml new file mode 100644 index 000000000..aceab7f77 --- /dev/null +++ b/k3s/gxy-management/cluster/traefik-config.yaml @@ -0,0 +1,55 @@ +# Traefik HelmChartConfig for gxy-management galaxy +# Reference: https://docs.k3s.io/networking/networking-services#customizing-the-traefik-helm-chart +# +# This file is copied to /var/lib/rancher/k3s/server/manifests/traefik-config.yaml +# by Ansible play-k3s--galaxy.yml + +apiVersion: helm.cattle.io/v1 +kind: HelmChartConfig +metadata: + name: traefik + namespace: kube-system +spec: + valuesContent: |- + # Enable Gateway API provider + providers: + kubernetesGateway: + enabled: true + + # Enable Gateway provisioning + gateway: + enabled: true + + # Use LoadBalancer with ServiceLB (Klipper) to bind host ports 80/443 + service: + type: LoadBalancer + + ports: + web: {} + websecure: + tls: + enabled: true + transport: + respondingTimeouts: + readTimeout: "0s" + writeTimeout: "0s" + idleTimeout: "180s" + + # Logging + logs: + general: + level: INFO + access: + enabled: true + + # Security context + securityContext: + capabilities: + drop: [ALL] + add: [NET_BIND_SERVICE] + readOnlyRootFilesystem: true + runAsNonRoot: true + runAsUser: 65532 + + podSecurityContext: + fsGroup: 65532 diff --git a/k3s/ops-backoffice-tools/.envrc b/k3s/ops-backoffice-tools/.envrc new file mode 100644 index 000000000..bafa883b9 --- /dev/null +++ 
b/k3s/ops-backoffice-tools/.envrc @@ -0,0 +1,7 @@ +source_env ../../.envrc + +if [ -d "$SECRETS_DIR" ]; then + use_sops "$SECRETS_DIR/do-primary/.env.enc" +fi + +dotenv_if_exists .env diff --git a/k3s/ops-backoffice-tools/README.md b/k3s/ops-backoffice-tools/README.md index 4b9c27ed5..e0fb10204 100644 --- a/k3s/ops-backoffice-tools/README.md +++ b/k3s/ops-backoffice-tools/README.md @@ -45,8 +45,8 @@ helm upgrade tailscale-operator tailscale/tailscale-operator \ All apps use Kustomize: ```bash -# Deploy -kubectl apply -k apps/<app>/manifests/base/ +# Deploy (decrypts secrets + TLS, applies, cleans up) +just deploy ops-backoffice-tools # Check kubectl get all -n <namespace> diff --git a/k3s/ops-mgmt/.gitignore b/k3s/ops-mgmt/.gitignore new file mode 100644 index 000000000..e5b010291 --- /dev/null +++ b/k3s/ops-mgmt/.gitignore @@ -0,0 +1,5 @@ +.kubeconfig.yaml +**/secrets/.secrets.env +**/secrets/tls.crt +**/secrets/tls.key +!**/secrets/.secrets.enc.env diff --git a/k3s/ops-mgmt/README.md b/k3s/ops-mgmt/README.md new file mode 100644 index 000000000..a85f90810 --- /dev/null +++ b/k3s/ops-mgmt/README.md @@ -0,0 +1,46 @@ +# ops-mgmt k3s Cluster + +Rancher management cluster for provisioning and managing downstream Kubernetes clusters. + +## Specifications + +- **Node**: 1x s-4vcpu-8gb (DigitalOcean, nyc3) +- **Pod CIDR**: 10.40.0.0/16 +- **Service CIDR**: 10.41.0.0/16 +- **Tailscale hostname**: ops-k3s-mgmt-subnet + +## Quick Access + +```bash +cd k3s/ops-mgmt && export $(cat .env | xargs) +kubectl get nodes +``` + +## Deployment + +Everything is managed by a single Ansible playbook (8 plays): + +```bash +just play k3s--ops-mgmt mgmt_k3s +``` + +The playbook handles: k3s install, security hardening (secrets-encryption, PSS, audit logging), +cert-manager, Rancher, rancher-backup + schedule, Tailscale operator + Connector, +kubeconfig fetch, and DO firewall lockdown. + +Prerequisites: VM provisioned with Tailscale installed, secrets populated in infra-secrets repo.
+ +## Re-runs + +After first run, the DO firewall restricts SSH to Tailscale only. Re-run via Tailscale IP: + +```bash +just play k3s--ops-mgmt mgmt_k3s -e ansible_host=<tailscale-ip> +``` + +## Disaster Recovery + +- **rancher-backup operator** takes snapshots every 6 hours to DO Spaces (`net.freecodecamp.ops-k3s-backups/rancher-backup`) +- **etcd snapshots** every 6 hours to DO Spaces (`net.freecodecamp.ops-k3s-backups/etcd/ops-mgmt`) +- Downstream clusters continue operating independently if ops-mgmt is lost +- Restore: deploy fresh k3s + Rancher, then `kubectl apply -f` a Restore CR pointing to the backup diff --git a/k3s/ops-mgmt/apps/rancher/backup-schedule.yaml b/k3s/ops-mgmt/apps/rancher/backup-schedule.yaml new file mode 100644 index 000000000..1e9955d6e --- /dev/null +++ b/k3s/ops-mgmt/apps/rancher/backup-schedule.yaml @@ -0,0 +1,18 @@ +# Recurring Rancher backup to DO Spaces +# Apply: kubectl apply -f backup-schedule.yaml +apiVersion: resources.cattle.io/v1 +kind: Backup +metadata: + name: rancher-scheduled-backup +spec: + resourceSetName: rancher-resource-set + retentionCount: 20 + schedule: "0 */6 * * *" + storageLocation: + s3: + bucketName: net.freecodecamp.ops-k3s-backups + folder: rancher-backup + region: nyc3 + endpoint: nyc3.digitaloceanspaces.com + credentialSecretName: rancher-backup-s3-creds + credentialSecretNamespace: cattle-resources-system diff --git a/k3s/ops-mgmt/cluster/security/audit-policy.yaml b/k3s/ops-mgmt/cluster/security/audit-policy.yaml new file mode 100644 index 000000000..7243d0578 --- /dev/null +++ b/k3s/ops-mgmt/cluster/security/audit-policy.yaml @@ -0,0 +1,38 @@ +# Kubernetes API audit policy +# Copied to /etc/rancher/k3s/audit-policy.yaml by Ansible +# +# Phase 1: minimal — log secret access and anonymous requests only +# Phase 2: expand to full request/response logging for sensitive resources +apiVersion: audit.k8s.io/v1 +kind: Policy +rules: + # Log secret read/write at Metadata level + - level: Metadata + resources: + - group: "" +
resources: ["secrets"] + + # Log anonymous/unauthenticated requests + - level: Metadata + users: ["system:anonymous"] + + # Log RBAC changes + - level: Metadata + resources: + - group: "rbac.authorization.k8s.io" + resources: + ["clusterroles", "clusterrolebindings", "roles", "rolebindings"] + + # Skip noisy read-only system requests + - level: None + users: ["system:kube-proxy"] + - level: None + resources: + - group: "" + resources: ["endpoints", "services", "services/status"] + verbs: ["get", "watch", "list"] + + # Default: log everything else at Metadata + - level: Metadata + omitStages: + - "RequestReceived" diff --git a/k3s/ops-mgmt/cluster/security/pss-admission.yaml b/k3s/ops-mgmt/cluster/security/pss-admission.yaml new file mode 100644 index 000000000..52e5f679b --- /dev/null +++ b/k3s/ops-mgmt/cluster/security/pss-admission.yaml @@ -0,0 +1,31 @@ +# Pod Security Standards admission configuration +# Copied to /etc/rancher/k3s/pss-admission.yaml by Ansible +# +# - baseline: enforced (blocks privileged containers, host networking, hostPath) +# - restricted: audit + warn only (logs violations, does not block) +# - System namespaces exempted (Rancher, cert-manager, Longhorn, Tailscale need elevated privileges) +apiVersion: apiserver.config.k8s.io/v1 +kind: AdmissionConfiguration +plugins: + - name: PodSecurity + configuration: + apiVersion: pod-security.admission.config.k8s.io/v1 + kind: PodSecurityConfiguration + defaults: + enforce: "baseline" + enforce-version: "latest" + audit: "restricted" + audit-version: "latest" + warn: "restricted" + warn-version: "latest" + exemptions: + namespaces: + - kube-system + - cattle-system + - cattle-fleet-system + - cattle-fleet-local-system + - cattle-resources-system + - cattle-provisioning-capi-system + - cert-manager + - longhorn-system + - tailscale diff --git a/terraform/justfile b/terraform/justfile deleted file mode 100644 index 0c0a90e80..000000000 --- a/terraform/justfile +++ /dev/null @@ -1,68 +0,0 @@ -# Find all 
directories containing .terraform.lock.hcl files -workspaces := `find . -name ".terraform.lock.hcl" -exec dirname {} \; | tr '\n' ' '` - -# Show available recipes -default: - @just --list - -# Format Terraform files in all workspaces -format: - @echo "Formatting Terraform files in all workspaces..." - @for workspace in {{workspaces}}; do \ - echo "Formatting $workspace"; \ - terraform -chdir=$workspace fmt; \ - done - @echo "Formatting complete." - -# List all detected Terraform workspaces -list-workspaces: - @echo "Detected Terraform workspaces:" - @for workspace in {{workspaces}}; do \ - echo " $workspace"; \ - done - -# Validate Terraform configurations in all workspaces -validate: - @echo "Validating Terraform configurations in all workspaces..." - @for workspace in {{workspaces}}; do \ - echo "Validating $workspace"; \ - terraform -chdir=$workspace validate; \ - done - @echo "Validation complete." - -# Initialize Terraform in all workspaces -init: - @echo "Initializing Terraform in all workspaces..." - @for workspace in {{workspaces}}; do \ - echo "Initializing $workspace"; \ - terraform -chdir=$workspace init; \ - done - @echo "Initialization complete." - -# Initialize and upgrade Terraform in all workspaces -init-upgrade: - @echo "Initializing and upgrading Terraform in all workspaces..." - @for workspace in {{workspaces}}; do \ - echo "Initializing and upgrading $workspace"; \ - terraform -chdir=$workspace init -upgrade; \ - done - @echo "Initialization and upgrade complete." - -# Run Terraform plan in all workspaces -plan: - @echo "Running Terraform plan in all workspaces..." - @for workspace in {{workspaces}}; do \ - echo "Planning $workspace"; \ - terraform -chdir=$workspace plan; \ - done - @echo "Planning complete." - -# Remove Terraform cache files from all workspaces -[confirm("This will delete .terraform, tfstate, and lock files. Continue?")] -clean: - @echo "Cleaning Terraform cache files from all workspaces..." 
- @for workspace in {{workspaces}}; do \ - echo "Cleaning $workspace"; \ - rm -rf $workspace/.terraform $workspace/*.tfstate* $workspace/.terraform.lock.hcl; \ - done - @echo "Cleaning complete."