From 3b64e6e2fe944af5fdb68e2252d2c340f057e887 Mon Sep 17 00:00:00 2001
From: pjb157 <peter.bhabra@gmail.com>
Date: Tue, 19 May 2026 13:32:53 +0100
Subject: [PATCH 1/5] feat(chart): egress NetworkPolicy and Workload Identity
 wiring

Adds an opt-in egress NetworkPolicy template that restricts the
control-layer pod's outbound traffic, and documents the existing
ServiceAccount annotation hook for GKE Workload Identity.

The NetworkPolicy allows broad public-internet egress (provider APIs,
managed Postgres, GCS) but denies private / loopback / link-local /
CGNAT / IPv6 ULA + LL ranges and operator-configured cluster CIDRs.
It's the network-layer backstop for the in-process image fetcher's
IP filtering: even if a bug let a request through the check
inside the process, the CNI refuses the packet.

`networkPolicy.allowKubeDns: true` (the default) emits an explicit
allow rule for kube-dns, because the cluster-CIDR deny would
otherwise break in-pod name resolution.

`networkPolicy.enabled` defaults to `false` so the chart change is a
no-op until an operator opts in. Operators MUST set
`networkPolicy.clusterCidrs` to match their cluster's Pod and
Service CIDRs.

ServiceAccount annotation documentation now spells out the GKE
Workload Identity binding shape for operators wiring the image
normaliser to GCS. No template change for that; the existing
`serviceAccount.annotations` passthrough already covers it.
---
 templates/networkpolicy.yaml | 77 ++++++++++++++++++++++++++++++++++++
 values.yaml                  | 39 +++++++++++++++++-
 2 files changed, 115 insertions(+), 1 deletion(-)
 create mode 100644 templates/networkpolicy.yaml

diff --git a/templates/networkpolicy.yaml b/templates/networkpolicy.yaml
new file mode 100644
index 0000000..6f5aca2
--- /dev/null
+++ b/templates/networkpolicy.yaml
@@ -0,0 +1,77 @@
+{{- if .Values.networkPolicy.enabled -}}
+apiVersion: networking.k8s.io/v1
+kind: NetworkPolicy
+metadata:
+  name: {{ include "control-layer.fullname" . }}-egress
+  labels:
+    {{- include "control-layer.labels" . | nindent 4 }}
+spec:
+  podSelector:
+    matchLabels:
+      {{- include "control-layer.selectorLabels" . | nindent 6 }}
+      app.kubernetes.io/component: control-layer
+  policyTypes:
+    - Egress
+  egress:
+    # Egress to the public internet, with private / loopback / link-local
+    # / CGNAT / IPv6 ULA + LL ranges excluded. This is the network-layer
+    # backstop for the in-process image fetcher's IP filtering: even if a
+    # bug let a request through the check inside the process, the CNI
+    # refuses the packet.
+    #
+    # Operator-supplied clusterCidrs are split by address family because an
+    # ipBlock's except entries must lie within its cidr: an IPv6 range under
+    # 0.0.0.0/0 (or an IPv4 range under ::/0) fails API validation.
+    - to:
+        - ipBlock:
+            cidr: 0.0.0.0/0
+            except:
+              # RFC1918 private ranges
+              - 10.0.0.0/8
+              - 172.16.0.0/12
+              - 192.168.0.0/16
+              # Loopback
+              - 127.0.0.0/8
+              # Link-local — covers GCE / AWS metadata at 169.254.169.254
+              - 169.254.0.0/16
+              # Carrier-grade NAT (RFC 6598)
+              - 100.64.0.0/10
+              # In-cluster IPv4 Pod / Service CIDRs (operator-configured)
+              {{- range .Values.networkPolicy.clusterCidrs }}
+              {{- if not (contains ":" (toString .)) }}
+              - {{ . | quote }}
+              {{- end }}
+              {{- end }}
+        - ipBlock:
+            cidr: "::/0"
+            except:
+              # IPv6 loopback
+              - "::1/128"
+              # IPv6 unique-local (RFC 4193)
+              - "fc00::/7"
+              # IPv6 link-local
+              - "fe80::/10"
+              # In-cluster IPv6 Pod / Service CIDRs (operator-configured)
+              {{- range .Values.networkPolicy.clusterCidrs }}
+              {{- if contains ":" (toString .) }}
+              - {{ . | quote }}
+              {{- end }}
+              {{- end }}
+  {{- if .Values.networkPolicy.allowKubeDns }}
+    # DNS is required for any outbound resolution. Without this rule, the
+    # cluster-CIDR deny above blocks queries to kube-dns and breaks all
+    # name lookups inside the pod.
+    - to:
+        - namespaceSelector:
+            matchLabels:
+              kubernetes.io/metadata.name: kube-system
+          podSelector:
+            matchLabels:
+              k8s-app: kube-dns
+      ports:
+        - protocol: UDP
+          port: 53
+        - protocol: TCP
+          port: 53
+  {{- end }}
+{{- end }}
diff --git a/values.yaml b/values.yaml
index bfd8e3a..8721c1d 100644
--- a/values.yaml
+++ b/values.yaml
@@ -15,7 +15,17 @@ serviceAccount:
   create: true
   # Automatically mount a ServiceAccount's API credentials?
   automount: true
-  # Annotations to add to the service account
+  # Annotations to add to the service account.
+  #
+  # For GKE Workload Identity (used when the image normaliser writes
+  # objects to a GCS bucket), bind a GCP service account here:
+  #
+  #   annotations:
+  #     iam.gke.io/gcp-service-account: <name>@<project>.iam.gserviceaccount.com
+  #
+  # The bound GCP service account needs `roles/storage.objectAdmin` on the
+  # image-normaliser bucket and `roles/iam.serviceAccountTokenCreator` on
+  # itself (to call signBlob for V4 signed URLs).
   annotations: {}
   # The name of the service account to use.
   # If not set and create is true, a name is generated using the fullname template
@@ -328,3 +338,30 @@ postgresql:
     runAsGroup: 999
     # Security best practice - prevent running as root
     runAsNonRoot: true
+
+# ---------------------------------------------------------------------------
+# Egress NetworkPolicy for the control-layer pod.
+#
+# Disabled by default. When enabled, restricts the control-layer pod's
+# outbound traffic so it cannot reach private / loopback / link-local
+# ranges or in-cluster service CIDRs. Public-internet egress (provider
+# APIs, external Postgres, object storage) is preserved.
+#
+# This is the network-layer backstop for the hardened image fetcher:
+# even if a bug in the in-process IP allow-list let a request through,
+# the kernel-level network policy refuses the packet.
+#
+# Operators MUST set `clusterCidrs` for their cluster — typically the
+# Pod CIDR and the Service CIDR. Misconfiguring this can break in-pod
+# DNS resolution if `kube-dns` isn't allowed; the template emits an
+# explicit `kube-dns` allow rule for that reason.
+#
+# Requires a NetworkPolicy-aware CNI (Cilium, Calico, GKE Dataplane V2).
+networkPolicy:
+  enabled: false
+  # Cluster pod/service CIDRs to deny egress to. Override per cluster.
+  # Typical GKE defaults are 10.0.0.0/8 covering both.
+  clusterCidrs:
+    - 10.0.0.0/8
+  # Allow egress to kube-dns. Disable only if you know what you're doing.
+  allowKubeDns: true

From 36956231182bd7b40d57bc71ef0b1f7b24f24738 Mon Sep 17 00:00:00 2001
From: pjb157 <peter.bhabra@gmail.com>
Date: Wed, 20 May 2026 12:00:02 +0100
Subject: [PATCH 2/5] docs(values): GCS bucket lifecycle guidance for image
 normaliser
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

Adds a brief note alongside the `serviceAccount.annotations` block
explaining that dwctl does not garbage-collect normaliser objects —
operators configure a GCS Object Lifecycle Management rule on the
bucket itself (Terraform / gcloud) to delete objects after enough
days to outlive the longest possible batch dispatch window.

Suggested default: 7 days (24h batch completion window + 6d
investigation buffer). Operators tighten or extend per their own
retention policies.

No behavioural change.
---
 values.yaml | 6 ++++++
 1 file changed, 6 insertions(+)

diff --git a/values.yaml b/values.yaml
index 8721c1d..2c45b1c 100644
--- a/values.yaml
+++ b/values.yaml
@@ -26,6 +26,12 @@ serviceAccount:
   # The bound GCP service account needs `roles/storage.objectAdmin` on the
   # image-normaliser bucket and `roles/iam.serviceAccountTokenCreator` on
   # itself (to call signBlob for V4 signed URLs).
+  #
+  # Bucket lifetime: dwctl does NOT garbage-collect normaliser objects.
+  # Configure an Object Lifecycle Management rule on the bucket itself
+  # (Terraform / gcloud) to delete objects after enough days to outlive
+  # the longest possible batch dispatch window. Example: 7d (covers a
+  # 24h batch completion window + a 6d investigation buffer).
   annotations: {}
   # The name of the service account to use.
   # If not set and create is true, a name is generated using the fullname template

From 20ab66495fe86da8e7d1967b894c6e9ff014aeba Mon Sep 17 00:00:00 2001
From: pjb157 <peter.bhabra@gmail.com>
Date: Wed, 20 May 2026 14:01:25 +0100
Subject: [PATCH 3/5] fix(chart): default clusterCidrs to empty and clarify
 when to set it
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

The previous default `clusterCidrs: [10.0.0.0/8]` was redundant with
the hardcoded RFC1918 exception already in the NetworkPolicy template,
producing a duplicate `- 10.0.0.0/8` line in the rendered manifest
and (more importantly) suggesting to operators that they MUST set
this even on standard GKE clusters. They do not — the hardcoded
exception covers RFC1918 ranges (Pod CIDR + Service CIDR for any
typical GKE cluster).

`clusterCidrs` is only meaningful for clusters whose Pod or Service
CIDRs fall OUTSIDE RFC1918 / CGNAT / IPv6 ULA+LL (e.g. dual-stack
with public IPv6, custom CIDR plans). Defaulted to an empty list and
the comment now explains exactly when to set it.

`helm lint .` clean; the rendered policy contains no duplicate CIDR
entries with the default.
---
 values.yaml | 25 +++++++++++++++++--------
 1 file changed, 17 insertions(+), 8 deletions(-)

diff --git a/values.yaml b/values.yaml
index 2c45b1c..a9e3439 100644
--- a/values.yaml
+++ b/values.yaml
@@ -357,17 +357,26 @@ postgresql:
 # even if a bug in the in-process IP allow-list let a request through,
 # the kernel-level network policy refuses the packet.
 #
-# Operators MUST set `clusterCidrs` for their cluster — typically the
-# Pod CIDR and the Service CIDR. Misconfiguring this can break in-pod
-# DNS resolution if `kube-dns` isn't allowed; the template emits an
-# explicit `kube-dns` allow rule for that reason.
+# The template ALREADY denies RFC1918 (10.0.0.0/8, 172.16/12, 192.168/16),
+# loopback, link-local, CGNAT, and IPv6 ULA/LL ranges. `clusterCidrs`
+# is only needed if your cluster uses Pod or Service CIDRs OUTSIDE
+# those ranges (e.g. dual-stack with public IPv6, custom CIDR plans).
+#
+# For clusters whose Pod and Service CIDRs both sit inside RFC1918
+# (10.x on many GKE clusters — verify yours), the hardcoded RFC1918
+# block already covers them — leave the default empty list.
+#
+# Denying cluster CIDRs would also block in-pod DNS resolution, so the
+# template emits an explicit `kube-dns` allow rule whenever
+# `allowKubeDns` is true (the default), whichever CIDRs you set.
 #
 # Requires a NetworkPolicy-aware CNI (Cilium, Calico, GKE Dataplane V2).
 networkPolicy:
   enabled: false
-  # Cluster pod/service CIDRs to deny egress to. Override per cluster.
-  # Typical GKE defaults are 10.0.0.0/8 covering both.
-  clusterCidrs:
-    - 10.0.0.0/8
+  # Additional CIDRs to deny egress to, ON TOP OF the hardcoded RFC1918
+  # / loopback / link-local / CGNAT / IPv6-ULA-LL exceptions. Empty by
+  # default — standard clusters using RFC1918 Pod/Service ranges need
+  # nothing here. Set explicitly only for non-RFC1918 cluster CIDRs.
+  clusterCidrs: []
   # Allow egress to kube-dns. Disable only if you know what you're doing.
   allowKubeDns: true

From 241289e1a6a71c6353a6fb53902aaef9a8612a09 Mon Sep 17 00:00:00 2001
From: pjb157 <peter.bhabra@gmail.com>
Date: Thu, 21 May 2026 09:19:01 +0100
Subject: [PATCH 4/5] docs(values): note on non-standard DNS pod labels for
 NetworkPolicy

The egress NetworkPolicy's kube-dns allow rule selects on
`kube-system / k8s-app=kube-dns`, which matches vanilla Kubernetes
and CoreDNS on most distributions but NOT OpenShift's
`openshift-dns / dns.operator.openshift.io/daemonset-dns=default` or
some managed offerings.

Adds an inline note so operators enabling NetworkPolicy on a
non-standard distribution don't quietly break in-pod DNS resolution.
Suggests two ways out: disable the rule and add an equivalent
NetworkPolicy out-of-band, or fork with the right selector.

No behavioural change; documentation only.
---
 values.yaml | 11 +++++++++++
 1 file changed, 11 insertions(+)

diff --git a/values.yaml b/values.yaml
index a9e3439..ceaa555 100644
--- a/values.yaml
+++ b/values.yaml
@@ -379,4 +379,15 @@ networkPolicy:
   # nothing here. Set explicitly only for non-RFC1918 cluster CIDRs.
   clusterCidrs: []
   # Allow egress to kube-dns. Disable only if you know what you're doing.
+  #
+  # NOTE on non-standard DNS deployments: the template selects DNS pods
+  # labelled `k8s-app=kube-dns` in the `kube-system` namespace, which
+  # matches vanilla Kubernetes (and CoreDNS on most distros). If your
+  # cluster runs DNS under different labels — e.g. OpenShift uses
+  # `dns.operator.openshift.io/daemonset-dns=default` in the
+  # `openshift-dns` namespace, and some managed offerings use other
+  # combinations — the rule will not match and in-pod DNS resolution
+  # will break when networkPolicy is enabled. In that case either
+  # disable this rule (and add an equivalent NetworkPolicy out-of-band)
+  # or fork the chart with the right selector for your distribution.
   allowKubeDns: true

From 9f8d7fd60461d2a9822bcba0accdb83f2aeaca29 Mon Sep 17 00:00:00 2001
From: pjb157 <peter.bhabra@gmail.com>
Date: Fri, 22 May 2026 16:27:24 +0100
Subject: [PATCH 5/5] docs(values): warn that egress NetworkPolicy is
 incompatible with in-cluster Postgres

Enabling networkPolicy while `postgresql.enabled: true` blocks the
control-layer pod from reaching the in-cluster Postgres StatefulSet
(its Pod IP / ClusterIP are RFC1918, which the policy denies),
causing immediate startup connection failures.

The feature is opt-in and aimed at deployments using external managed
Postgres (Neon) + public-internet provider APIs, but the conflict
wasn't documented. Added a prominent warning so operators don't
enable it on an internal-Postgres deployment and get a confusing
startup failure. No behavioural change.
---
 values.yaml | 9 +++++++++
 1 file changed, 9 insertions(+)

diff --git a/values.yaml b/values.yaml
index ceaa555..482f84f 100644
--- a/values.yaml
+++ b/values.yaml
@@ -357,6 +357,15 @@ postgresql:
 # even if a bug in the in-process IP allow-list let a request through,
 # the kernel-level network policy refuses the packet.
 #
+# ⚠️ INCOMPATIBLE WITH IN-CLUSTER POSTGRES. Because the policy denies
+# RFC1918 egress, enabling it while `postgresql.enabled: true` will
+# block the control-layer pod from reaching the in-cluster Postgres
+# StatefulSet (its Pod IP and ClusterIP are RFC1918) — startup will
+# fail immediately with connection errors. This NetworkPolicy is
+# intended for deployments using EXTERNAL managed Postgres (e.g. Neon)
+# and public-internet provider APIs. If you must run both, add a separate
+# NetworkPolicy allowing egress to Postgres (policies are additive).
+#
 # The template ALREADY denies RFC1918 (10.0.0.0/8, 172.16/12, 192.168/16),
 # loopback, link-local, CGNAT, and IPv6 ULA/LL ranges. `clusterCidrs`
 # is only needed if your cluster uses Pod or Service CIDRs OUTSIDE