# openova/products/catalyst/chart/templates/clusterrole-cutover-driver.yaml

# ClusterRole granting catalyst-api the verbs needed to drive the
# self-sovereignty cutover endpoint (issue #792, #830 P0 Bug 1).
#
# CRITICAL — feedback_rbac_create_no_resourcenames.md (auto-memory anchor):
# Kubernetes RBAC cannot restrict `create` by `resourceNames`. A POST is
# authorized before the new object has a name, so a rule that pairs
# `create` with `resourceNames` never matches and every create returns
# 403. This caused the bp-openbao 6+ provisioning loop. We split
# `create` into its own Rule with NO `resourceNames` and keep
# `update/patch/get/delete` in a separate Rule.
#
# What catalyst-api needs to do across namespaces (cutover sequence):
#   - read configmaps in the cutover namespace (default `catalyst`):
#     cutover-step PodSpec ConfigMaps + the self-sovereign-cutover-status
#     ConfigMap
#   - patch the status ConfigMap on every step transition
#   - create batchv1.Job from each PodSpec ConfigMap
#   - watch jobs to completion / Failed
#   - delete completed jobs after status capture (housekeeping)
#   - read daemonsets/deployments status for daemonset-wait + deployment-
#     targeted steps (step 04 registry-pivot DaemonSet readiness, step 07
#     catalyst-api Deployment env patch)
#   - emit Events as steps complete (operator-visible kube events)
#
# ClusterRole (not Role) because the cutover handler today is namespace-
# configurable via env CATALYST_CUTOVER_NAMESPACE — defaulting to
# `catalyst` but operators may relocate. A namespaced Role would couple
# the chart to a single namespace forever.
#
# Per the dual-mode contract (api-deployment.yaml comment), this file is
# consumed by BOTH Helm and Kustomize — NO Helm directives anywhere.
# ClusterRole is cluster-scoped so namespace is omitted by design.
# Manifest header for the cutover-driver ClusterRole. The object is
# cluster-scoped, so metadata deliberately carries no namespace.
apiVersion: rbac.authorization.k8s.io/v1
kind: ClusterRole
metadata:
  # NOTE(review): a ClusterRoleBinding outside this file presumably
  # references this name — confirm before renaming.
  name: catalyst-api-cutover-driver
  labels:
    app.kubernetes.io/name: catalyst-api
    app.kubernetes.io/component: cutover-driver-rbac
# Every entry below is additive: RBAC unions all rules of a role.
rules:
  # ───────────────────────────────────────────────────────────────
  # CREATE verbs — MUST be in their own Rule WITHOUT resourceNames.
  # Per feedback_rbac_create_no_resourcenames.md this split is
  # load-bearing: combining `create` with resourceNames produces a 403
  # at every POST because the apiserver has no resource name to match
  # the rule against on the create path.
  # ───────────────────────────────────────────────────────────────
  # Core group: ConfigMap creation (no resourceNames, any name).
  - apiGroups:
      - ""
    resources:
      - configmaps
    verbs:
      - create
  # Core group: Event creation.
  - apiGroups:
      - ""
    resources:
      - events
    verbs:
      - create
  # batch group: Job creation.
  - apiGroups:
      - batch
    resources:
      - jobs
    verbs:
      - create

  # ───────────────────────────────────────────────────────────────
  # READ verbs — get/list/watch on the resources the handler reads.
  # ───────────────────────────────────────────────────────────────
  # Core group: read ConfigMaps.
  - apiGroups:
      - ""
    resources:
      - configmaps
    verbs:
      - get
      - list
      - watch
  # Core group: read Pods and their logs (pods/log subresource).
  - apiGroups:
      - ""
    resources:
      - pods
      - pods/log
    verbs:
      - get
      - list
      - watch
  # batch group: read Jobs.
  - apiGroups:
      - batch
    resources:
      - jobs
    verbs:
      - get
      - list
      - watch
  # apps group: read Deployments and DaemonSets.
  - apiGroups:
      - apps
    resources:
      - deployments
      - daemonsets
    verbs:
      - get
      - list
      - watch

  # ───────────────────────────────────────────────────────────────
  # UPDATE / PATCH / DELETE — separate from create per RBAC rule.
  # ───────────────────────────────────────────────────────────────
  # Core group: mutate and remove ConfigMaps.
  - apiGroups:
      - ""
    resources:
      - configmaps
    verbs:
      - update
      - patch
      - delete
  # batch group: mutate and remove Jobs.
  - apiGroups:
      - batch
    resources:
      - jobs
    verbs:
      - delete
      - patch
      - update
  # qa-loop iter-7 Fix #34 follow-up — Sovereign Console mutation
  # surface (PR #1229) added PUT/POST/DELETE semantics on every
  # workload kind. The cutover-driver SA needs apiserver verbs to
  # match: scale (patch /spec/replicas), restart (patch
  # /spec/template/metadata/annotations), apply (update via Update),
  # and delete. Per ADR-0001 §5 the in-cluster fallback runs as
  # this SA, so without these verbs every mutation surfaces as 403
  # (TC-215, TC-218, TC-243, TC-247 in iter-7 matrix).
  # apps workloads: update / patch / delete on the objects themselves.
  - apiGroups: ["apps"]
    resources: ["deployments", "statefulsets", "daemonsets", "replicasets"]
    verbs: ["update", "patch", "delete"]
  # apps scale subresources: `get` is included alongside the write
  # verbs so the current replica count can be read back.
  - apiGroups: ["apps"]
    resources: ["deployments/scale", "statefulsets/scale", "replicasets/scale"]
    verbs: ["update", "patch", "get"]
  # Core group objects: update / patch / delete.
  # NOTE(review): no rule visible in this file grants get/list/watch
  # on `endpoints` — confirm the write-only grant is intentional.
  - apiGroups: [""]
    resources: ["pods", "services", "endpoints", "persistentvolumeclaims"]
    verbs: ["update", "patch", "delete"]
  # networking.k8s.io objects: update / patch / delete.
  # NOTE(review): `networkpolicies` has no get/list/watch rule visible
  # in this file — confirm the write-only grant is intentional.
  - apiGroups: ["networking.k8s.io"]
    resources: ["ingresses", "networkpolicies"]
    verbs: ["update", "patch", "delete"]
  # CronJobs: `create` is kept in its own rule, matching the convention
  # used for every other create grant in this ClusterRole. The granted
  # permissions are unchanged (RBAC unions the two rules).
  # NOTE(review): `cronjobs` has no get/list/watch rule visible in this
  # file — confirm the mutation handlers do not need to read first.
  - apiGroups: ["batch"]
    resources: ["cronjobs"]
    verbs: ["create"]
  - apiGroups: ["batch"]
    resources: ["cronjobs"]
    verbs: ["update", "patch", "delete"]

  # ───────────────────────────────────────────────────────────────
  # TokenReview — required by HandleCutoverInternalTrigger to
  # validate the in-cluster auto-trigger Job's projected SA token
  # against the apiserver's authentication chain (issue #957
  # follow-up; PR #947 wired the endpoint but not its RBAC).
  #
  # Without this rule POST /api/v1/internal/cutover/trigger fails
  # at TokenReviews().Create with HTTP 403 "tokenreviews is
  # forbidden", which the handler maps to a generic 502
  # "token-review-failed". Caught live on otech113 2026-05-05
  # immediately after chart 0.1.18 fixed the readiness-probe loop:
  # the trigger reached catalyst-api but every call returned 502
  # in 8.87ms (well under any I/O latency, indicating a synchronous
  # apiserver permission rejection).
  #
  # Per feedback_rbac_create_no_resourcenames.md (auto-memory
  # anchor) `create` MUST be in its own Rule WITHOUT
  # `resourceNames`. TokenReview is a create-only, cluster-scoped
  # resource in authentication.k8s.io that is never persisted —
  # there is no name to scope to.
  # ───────────────────────────────────────────────────────────────
  # authentication.k8s.io: submit TokenReviews (create is the only verb).
  - apiGroups:
      - authentication.k8s.io
    resources:
      - tokenreviews
    verbs:
      - create

  # ───────────────────────────────────────────────────────────────
  # SOVEREIGN ENDPOINT READS — backing the chroot Sovereign Console
  # at console.<sov-fqdn>. Handlers (`HandleSovereignCloud`,
  # `HandleSovereignApps`) live in catalyst-api and use the
  # in-cluster client to enumerate cluster-wide resources for the
  # /cloud and /apps panes. Caught live on otech130 2026-05-06:
  # /api/v1/sovereign/cloud returned `{nodes:[], namespaces:[], …}`
  # because the SA was silently rejected by the apiserver on the
  # Nodes().List call (handler's err branch falls through to an
  # empty response).
  # ───────────────────────────────────────────────────────────────
  # Core group: cluster inventory reads.
  - apiGroups:
      - ""
    resources:
      - nodes
      - namespaces
      - services
      - persistentvolumes
      - persistentvolumeclaims
    verbs:
      - get
      - list
      - watch
  # networking.k8s.io: Ingress reads.
  - apiGroups:
      - networking.k8s.io
    resources:
      - ingresses
    verbs:
      - get
      - list
      - watch
  # Gateway API: HTTPRoute and Gateway reads.
  - apiGroups:
      - gateway.networking.k8s.io
    resources:
      - httproutes
      - gateways
    verbs:
      - get
      - list
      - watch
  # storage.k8s.io: StorageClass reads.
  - apiGroups:
      - storage.k8s.io
    resources:
      - storageclasses
    verbs:
      - get
      - list
      - watch
  # Flux helm-controller: HelmRelease reads.
  - apiGroups:
      - helm.toolkit.fluxcd.io
    resources:
      - helmreleases
    verbs:
      - get
      - list
      - watch

  # ───────────────────────────────────────────────────────────────
  # USER ACCESS CRD — backs /users on the chroot Sovereign Console.
  # The mother-side handlers (ListUserAccess, CreateUserAccess,
  # UpdateUserAccess, DeleteUserAccess) read/write the
  # access.openova.io/v1alpha1 UserAccess CR via the in-cluster
  # dynamic client (issue #322). On the Sovereign chroot the
  # catalyst-api SA needs full CRUD on this CRD to render and edit
  # the User Access list. Caught live on omantel.biz 2026-05-06:
  # GET /api/v1/deployments/{depId}/admin/user-access returned 500
  # "useraccesses.access.openova.io is forbidden" because the SA
  # had no rule covering this GVR.
  #
  # `create` MUST be in its own rule WITHOUT resourceNames per
  # feedback_rbac_create_no_resourcenames.md.
  # ───────────────────────────────────────────────────────────────
  # UserAccess: create, in its own rule with no resourceNames.
  - apiGroups:
      - access.openova.io
    resources:
      - useraccesses
    verbs:
      - create
  # UserAccess: read plus the remaining write verbs.
  - apiGroups:
      - access.openova.io
    resources:
      - useraccesses
    verbs:
      - get
      - list
      - watch
      - update
      - patch
      - delete

  # ───────────────────────────────────────────────────────────────
  # K8SCACHE DATA PLANE — chroot Sovereign-side k8scache.Factory
  # registers the local cluster via rest.InClusterConfig() and
  # spawns one informer per kind in the registry. Those informers
  # need cluster-wide watch on every kind to feed the
  # /api/v1/sovereigns/{depId}/k8s/stream SSE consumer.
  #
  # Kinds list from internal/k8scache/kinds.go:
  #   - core/v1: pods, configmaps (already readable via the READ
  #     verbs rules; re-listed here for clarity — RBAC merging is
  #     idempotent) and secrets (sensitive — values are stripped by
  #     k8scache.redact). services, persistentvolumeclaims, nodes,
  #     namespaces and persistentvolumes are covered by the
  #     SOVEREIGN ENDPOINT READS rule and are not repeated.
  #   - discovery.k8s.io/v1: endpointslices. EndpointSlice is NOT a
  #     core/v1 resource, so it must not be listed under
  #     apiGroups [""] — such an entry matches nothing.
  #   - apps/v1: deployments, statefulsets, daemonsets, replicasets
  #   - networking.k8s.io/v1: ingresses
  #   - hcloud.crossplane.io/v1alpha1: servers, loadbalancers,
  #     networks, volumes
  #   - vcluster.com/v1alpha1: vclusters
  # ───────────────────────────────────────────────────────────────
  # Core group reads. `secrets` is the only resource first granted
  # here; handle with care — this is a cluster-wide secret read.
  - apiGroups: [""]
    resources:
      - configmaps
      - pods
      - secrets
    verbs: ["get", "list", "watch"]
  - apiGroups: ["apps"]
    resources: ["deployments", "statefulsets", "daemonsets", "replicasets"]
    verbs: ["get", "list", "watch"]
  # EndpointSlices live in discovery.k8s.io; this is the effective grant.
  - apiGroups: ["discovery.k8s.io"]
    resources: ["endpointslices"]
    verbs: ["get", "list", "watch"]
  - apiGroups: ["hcloud.crossplane.io"]
    resources: ["servers", "loadbalancers", "networks", "volumes"]
    verbs: ["get", "list", "watch"]
  - apiGroups: ["vcluster.com"]
    resources: ["vclusters"]
    verbs: ["get", "list", "watch"]
  # SubjectAccessReview — k8scache's per-event SAR cache calls
  # AuthorizationV1().SubjectAccessReviews().Create() for every
  # SSE delivery to gate per-namespace visibility. Without this
  # the Subscribe path would 403 every event silently and the
  # stream would deliver nothing.
  # authorization.k8s.io: submit SubjectAccessReviews.
  - apiGroups:
      - authorization.k8s.io
    resources:
      - subjectaccessreviews
    verbs:
      - create
  # PodMetrics + NodeMetrics — required for the Sovereign Dashboard's
  # `color_by=utilization` overlay. Without this rule the dynamic
  # client returns 403 on every metrics.k8s.io list and the dashboard
  # falls back to null-percentage grey cells. metrics-server is
  # always installed by bp-metrics-server in the platform bundle, so
  # the only thing standing between the dashboard and a working
  # gradient was this RBAC rule. (#1084 follow-up — the chart-side
  # half of the treemap utilization fix.)
  # metrics.k8s.io: read pod and node metrics.
  - apiGroups:
      - metrics.k8s.io
    resources:
      - pods
      - nodes
    verbs:
      - get
      - list
      - watch

  # ───────────────────────────────────────────────────────────────
  # NEW KINDS added by EPIC-1 W (#1139) + EPIC-4 R (#1167):
  #   - wgpolicyk8s.io/v1alpha2 PolicyReports + ClusterPolicyReports
  #     (compliance score aggregator + UI score-by-resource view)
  #   - events.k8s.io/v1 Events
  #     (resource detail page Events tab + EventsPanel widget)
  #
  # Per feedback_chroot_in_cluster_fallback.md: every new GVR added
  # to catalyst-api's k8scache.DefaultKinds MUST get a matching rule
  # in this ClusterRole — the chroot SovereignClient uses this SA.
  # ───────────────────────────────────────────────────────────────
  # wgpolicyk8s.io: read namespaced and cluster-wide policy reports.
  - apiGroups:
      - wgpolicyk8s.io
    resources:
      - policyreports
      - clusterpolicyreports
    verbs:
      - get
      - list
      - watch
  # events.k8s.io: read Events.
  - apiGroups:
      - events.k8s.io
    resources:
      - events
    verbs:
      - get
      - list
      - watch

  # qa-loop iter-8 Fix #41 — ClusterPolicy CR read access so the
  # compliance aggregator's `clusterpolicy` SubscribeKind ingest path
  # populates per-policy Severity / Rule list metadata for the
  # PolicyDrilldownPage (TC-026 cluster-B). Read-only — the aggregator
  # never mutates ClusterPolicies; that's owned by Kyverno + the
  # qa-fixtures chart's policy bundle.
  # kyverno.io: read-only access to ClusterPolicies and Policies.
  - apiGroups:
      - kyverno.io
    resources:
      - clusterpolicies
      - policies
    verbs:
      - get
      - list
      - watch

  # ───────────────────────────────────────────────────────────────
  # APPLICATION CRD — backs slice I (#1152) install-flow handler +
  # slice T+O+P (#1160) AppDetail PUT/DELETE/topology-preview/upgrade-
  # preview handlers. Caught live on omantel iter-1: TC-040 returned
  # HTTP 500 with body
  #   "applications.apps.openova.io is forbidden: User
  #    system:serviceaccount:catalyst-system:catalyst-api-cutover-driver
  #    cannot list resource applications in API group apps.openova.io"
  # because the cutover-driver ClusterRole had no rule for the new CRD.
  # Per feedback_chroot_in_cluster_fallback.md every new GVR added to
  # catalyst-api dynamic-client paths MUST get a matching rule here.
  # `create` MUST be in its own rule WITHOUT resourceNames per
  # feedback_rbac_create_no_resourcenames.md.
  # ───────────────────────────────────────────────────────────────
  # Application: create, in its own rule with no resourceNames.
  - apiGroups:
      - apps.openova.io
    resources:
      - applications
    verbs:
      - create
  # Application: read plus the remaining write verbs.
  - apiGroups:
      - apps.openova.io
    resources:
      - applications
    verbs:
      - get
      - list
      - watch
      - update
      - patch
      - delete

  # ───────────────────────────────────────────────────────────────
  # CONTINUUM CRD — backs EPIC-6 slice U-DR-1 (#1162) Continuum DR UI
  # handlers (POST switchover, POST failback, POST failback/approve,
  # GET /continuums/{name}, GET/SSE /audit/continuum). Caught live on
  # omantel iter-1: TC-099 returned HTTP 500 with body
  #   "continuums.dr.openova.io is forbidden: ..."
  # ───────────────────────────────────────────────────────────────
  # Continuum: create, in its own rule with no resourceNames.
  - apiGroups:
      - dr.openova.io
    resources:
      - continuums
    verbs:
      - create
  # Continuum: read plus the remaining write verbs.
  - apiGroups:
      - dr.openova.io
    resources:
      - continuums
    verbs:
      - get
      - list
      - watch
      - update
      - patch
      - delete

  # ───────────────────────────────────────────────────────────────
  # QA-loop iter-2 Fix #17 — additional CRDs the generic /k8s/{kind}
  # surface needs to enumerate. These live alongside the per-CRD
  # handlers (HandleListUserAccesses, HandleBlueprintPublish, …) but
  # the k8scache.Factory's dynamic informer pool is the read path for
  # the SSE stream + dashboard treemap + components page.
  #
  # Per feedback_chroot_in_cluster_fallback.md every new GVR added to
  # internal/k8scache/kinds.go DefaultKinds MUST get a matching rule
  # in this ClusterRole — the chroot SovereignClient uses this SA.
  # ───────────────────────────────────────────────────────────────
  # Blueprint CRD — published blueprint records (already used by
  # HandleBlueprintListCuratable etc.; the k8scache surface needs
  # explicit list+watch verbs).
  - apiGroups:
      - catalyst.openova.io
    resources:
      - blueprints
      - environments
    # Read-only.
    verbs:
      - get
      - list
      - watch
  # EnvironmentPolicy CRD — backs slice X (#1147) PUT
  # /environments/{env}/policy, restored to working contract by Fix #19
  # (#1208) in qa-loop iter-3. Without this rule the handler 503s on
  # the apiserver Create/Update with body
  #   environmentpolicies.catalyst.openova.io is forbidden.
  # `create` MUST be split into its own rule WITHOUT resourceNames per
  # feedback_rbac_create_no_resourcenames.md.
  # EnvironmentPolicy: create, in its own rule with no resourceNames.
  - apiGroups:
      - catalyst.openova.io
    resources:
      - environmentpolicies
    verbs:
      - create
  # EnvironmentPolicy: read plus the remaining write verbs.
  - apiGroups:
      - catalyst.openova.io
    resources:
      - environmentpolicies
    verbs:
      - get
      - list
      - watch
      - update
      - patch
      - delete
  # Organization CRD — top-level tenancy resource surfaced on the
  # /organizations page.
  - apiGroups:
      - orgs.openova.io
    resources:
      - organizations
    # Read-only.
    verbs:
      - get
      - list
      - watch

  # ───────────────────────────────────────────────────────────────
  # QA-loop iter-3 Fix #18 — RBAC kinds surfaced through the
  # /k8s/{kind} generic surface. The Sovereign Console's RBAC pane
  # lists ClusterRoles + ClusterRoleBindings (TC-122/196/199/248).
  # Without this rule the in-cluster catalyst-api SA gets 403 from
  # the apiserver on every dynamic-client list call against
  # rbac.authorization.k8s.io.
  #
  # Per feedback_chroot_in_cluster_fallback.md every new GVR added
  # to internal/k8scache/kinds.go DefaultKinds MUST get a matching
  # rule in this ClusterRole — the chroot SovereignClient uses this
  # SA via in-cluster fallback. Read-only verbs only — the Sovereign
  # Console renders RBAC inventory; mutation goes through the
  # UserAccess CR reconciliation loop, not through direct ClusterRole/
  # ClusterRoleBinding writes.
  # ───────────────────────────────────────────────────────────────
  - apiGroups:
      - rbac.authorization.k8s.io
    resources:
      - clusterroles
      - clusterrolebindings
    # Inventory only — no write verbs on RBAC objects.
    verbs:
      - get
      - list
      - watch

  # ───────────────────────────────────────────────────────────────
  # QA-loop iter-4 Fix #24 — CustomResourceDefinitions surfaced
  # through the /k8s/{kind} generic surface. The Sovereign Console's
  # CRD inventory pane lists every installed CRD on the cluster
  # (TC-199). Without this rule the in-cluster catalyst-api SA gets
  # 403 from the apiserver on every dynamic-client list call
  # against apiextensions.k8s.io.
  #
  # Per feedback_chroot_in_cluster_fallback.md every new GVR added
  # to internal/k8scache/kinds.go DefaultKinds MUST get a matching
  # rule in this ClusterRole — the chroot SovereignClient uses this
  # SA via in-cluster fallback. Read-only verbs only — the Sovereign
  # Console renders CRD inventory; CRD install/uninstall happens
  # through Flux + the blueprint catalog (HelmRelease → CRD), not
  # through direct apiextensions writes.
  # ───────────────────────────────────────────────────────────────
  - apiGroups:
      - apiextensions.k8s.io
    resources:
      - customresourcedefinitions
    # Inventory only — no write verbs on CRDs.
    verbs:
      - get
      - list
      - watch

  # ───────────────────────────────────────────────────────────────
  # QA-loop iter-11 Fix #48 — Networking GVRs (Cilium NetworkPolicy
  # tier-3 micro-segmentation + Gateway API + ClusterMesh metadata)
  # surfaced through the generic /k8s/{kind} endpoint AND the new
  # networking aggregator handlers (HandleNetworkingPolicies /
  # ClusterMesh / NetBird / DMZ / Hubble).
  #
  # Per feedback_chroot_in_cluster_fallback.md every new GVR added
  # to internal/k8scache/kinds.go DefaultKinds MUST get a matching
  # rule here — the chroot SovereignClient uses this SA via
  # in-cluster fallback. Read-only verbs only: NetworkPolicies are
  # authored by the qa-fixtures chart + customer overlays, never
  # by catalyst-api at runtime.
  # ───────────────────────────────────────────────────────────────
  # cilium.io: read-only access to the Cilium CRDs listed below.
  - apiGroups:
      - cilium.io
    resources:
      - ciliumnetworkpolicies
      - ciliumclusterwidenetworkpolicies
      - ciliumendpointslices
      - ciliumnodes
      - ciliumendpoints
      - ciliumidentities
      - ciliumexternalworkloads
    verbs:
      - get
      - list
      - watch
  # Gateway API: read GatewayClasses.
  - apiGroups:
      - gateway.networking.k8s.io
    resources:
      - gatewayclasses
    verbs:
      - get
      - list
      - watch
  # vCluster CRD (vcluster.com) — already registered for `vcluster`
  # but the DMZ aggregator queries the legacy `cluster.loft.sh` group
  # too for older vCluster operator versions. Defensive duplication.
  - apiGroups:
      - cluster.loft.sh
    resources:
      - virtualclusters
    # Read-only.
    verbs:
      - get
      - list
      - watch