Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
1 change: 1 addition & 0 deletions charts/kserve-resources/templates/clusterrole.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -230,6 +230,7 @@ rules:
- apiGroups:
- rbac.authorization.k8s.io
resources:
- clusterrolebindings
- rolebindings
- roles
verbs:
Expand Down
1 change: 1 addition & 0 deletions config/rbac/role.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -217,6 +217,7 @@ rules:
- apiGroups:
- rbac.authorization.k8s.io
resources:
- clusterrolebindings
- rolebindings
- roles
verbs:
Expand Down
36 changes: 32 additions & 4 deletions pkg/controller/llmisvc/controller.go
Original file line number Diff line number Diff line change
Expand Up @@ -24,6 +24,7 @@ import (

"k8s.io/apimachinery/pkg/runtime/schema"
"k8s.io/client-go/kubernetes"
"sigs.k8s.io/controller-runtime/pkg/controller/controllerutil"
lwsapi "sigs.k8s.io/lws/api/leaderworkerset/v1"

"github.com/kserve/kserve/pkg/apis/serving/v1beta1"
Expand Down Expand Up @@ -126,7 +127,7 @@ type LLMInferenceServiceReconciler struct {
//+kubebuilder:rbac:groups=inference.networking.x-k8s.io,resources=inferencepools;inferencemodels;,verbs=get;list;watch;create;update;patch;delete
//+kubebuilder:rbac:groups=core,resources=pods,verbs=get;list;watch
//+kubebuilder:rbac:groups=core,resources=serviceaccounts,verbs=get;list;watch;create;update;patch;delete
//+kubebuilder:rbac:groups=rbac.authorization.k8s.io,resources=roles;rolebindings,verbs=get;list;watch;create;update;patch;delete
//+kubebuilder:rbac:groups=rbac.authorization.k8s.io,resources=roles;rolebindings;clusterrolebindings,verbs=get;list;watch;create;update;patch;delete
//+kubebuilder:rbac:groups=discovery.k8s.io,resources=endpointslices,verbs=get;list;watch
//+kubebuilder:rbac:groups=authentication.k8s.io,resources=tokenreviews;subjectaccessreviews,verbs=create
//+kubebuilder:rbac:groups=networking.istio.io,resources=destinationrules,verbs=get;list;watch;create;update;patch;delete
Expand All @@ -143,9 +144,28 @@ func (r *LLMInferenceServiceReconciler) Reconcile(ctx context.Context, req ctrl.
return ctrl.Result{}, client.IgnoreNotFound(err)
}

if original.DeletionTimestamp != nil {
// TODO(reconcile): Handle finalization logic if needed.
logger.Info("Mark for deletion, skipping reconciliation")
finalizerName := constants.KServeAPIGroupName + "/llmisvc-finalizer"
if original.DeletionTimestamp.IsZero() {
if controllerutil.AddFinalizer(original, finalizerName) {
if err := r.Update(ctx, original); err != nil {
return ctrl.Result{}, err
}
}
} else {
logger.Info("Marked for deletion, finalizing resources")
if controllerutil.ContainsFinalizer(original, finalizerName) {
if cleanupErr := r.finalize(ctx, original); cleanupErr != nil {
logger.Error(cleanupErr, "Finalization failed")
return ctrl.Result{}, cleanupErr
}

controllerutil.RemoveFinalizer(original, finalizerName)
if err := r.Update(ctx, original); err != nil {
return ctrl.Result{}, err
}
}

// Do not reconcile, because llmisvc is being deleted.
return ctrl.Result{}, nil
}

Expand Down Expand Up @@ -200,6 +220,14 @@ func (r *LLMInferenceServiceReconciler) reconcile(ctx context.Context, llmSvc *v
return nil
}

func (r *LLMInferenceServiceReconciler) finalize(ctx context.Context, llmSvc *v1alpha1.LLMInferenceService) error {
if err := r.reconcileSchedulerServiceAccount(ctx, llmSvc); err != nil {
return fmt.Errorf("failed to finalize scheduler service account: %w", err)
}

return nil
}

func (r *LLMInferenceServiceReconciler) updateStatus(ctx context.Context, desired *v1alpha1.LLMInferenceService) error {
return retry.RetryOnConflict(retry.DefaultRetry, func() error {
latest := &v1alpha1.LLMInferenceService{}
Expand Down
4 changes: 2 additions & 2 deletions pkg/controller/llmisvc/controller_int_test.go
Original file line number Diff line number Diff line change
Expand Up @@ -608,7 +608,7 @@ var _ = Describe("LLMInferenceService Controller", func() {

It("should use storage-initializer to download model when uri starts with hf://", func(ctx SpecContext) {
// given
svcName := "test-llm"
svcName := "test-llm-storage-hf"
nsName := kmeta.ChildName(svcName, "-test")
namespace := &corev1.Namespace{
ObjectMeta: metav1.ObjectMeta{
Expand Down Expand Up @@ -686,7 +686,7 @@ var _ = Describe("LLMInferenceService Controller", func() {

It("should use storage-initializer to download model when uri starts with s3://", func(ctx SpecContext) {
// given
svcName := "test-llm-s3"
svcName := "test-llm-storage-s3"
nsName := kmeta.ChildName(svcName, "-test")
namespace := &corev1.Namespace{
ObjectMeta: metav1.ObjectMeta{
Expand Down
46 changes: 32 additions & 14 deletions pkg/controller/llmisvc/lifecycle_crud.go
Original file line number Diff line number Diff line change
Expand Up @@ -27,6 +27,7 @@ import (
metav1 "k8s.io/apimachinery/pkg/apis/meta/v1"
"k8s.io/client-go/tools/record"
"sigs.k8s.io/controller-runtime/pkg/client"
"sigs.k8s.io/controller-runtime/pkg/client/apiutil"
"sigs.k8s.io/controller-runtime/pkg/log"
)

Expand All @@ -48,13 +49,26 @@ func Delete[O client.Object, T client.Object](ctx context.Context, c clientWithR
typeLogLine := logLineForObject(expected)
ownerLogLine := logLineForObject(owner)

if !metav1.IsControlledBy(expected, owner) {
return fmt.Errorf("failed to delete %s %s/%s: it is not controlled by %s %s/%s",
typeLogLine,
expected.GetNamespace(), expected.GetName(),
ownerLogLine,
owner.GetNamespace(), owner.GetName(),
)
if isNamespaced, err := apiutil.IsObjectNamespaced(expected, c.Scheme(), c.RESTMapper()); err != nil {
return fmt.Errorf("failed to resolve if resource is namespaced %s: %w", typeLogLine, err)
} else if isNamespaced {
if !metav1.IsControlledBy(expected, owner) {
return fmt.Errorf("failed to delete %s %s/%s: it is not controlled by %s %s/%s",
typeLogLine,
expected.GetNamespace(), expected.GetName(),
ownerLogLine,
owner.GetNamespace(), owner.GetName(),
)
} else if !owner.GetDeletionTimestamp().IsZero() {
// If the owner is being deleted, assume the owned resource is going
// to be automatically garbage colleted by the cluster
return nil
}
}

// Don't re-try deletion, if the owned object is already being deleted
if !expected.GetDeletionTimestamp().IsZero() {
return nil
}

if err := c.Delete(ctx, expected); err != nil {
Expand Down Expand Up @@ -85,13 +99,17 @@ func Update[O client.Object, T client.Object](ctx context.Context, c clientWithR
typeLogLine := logLineForObject(expected)
ownerLogLine := logLineForObject(owner)

if !metav1.IsControlledBy(curr, owner) {
return fmt.Errorf("failed to update %s %s/%s: it is not controlled by %s %s/%s",
typeLogLine,
curr.GetNamespace(), curr.GetName(),
ownerLogLine,
owner.GetNamespace(), owner.GetName(),
)
if isNamespaced, err := apiutil.IsObjectNamespaced(expected, c.Scheme(), c.RESTMapper()); err != nil {
return fmt.Errorf("failed to resolve if resource is namespaced %s: %w", typeLogLine, err)
} else if isNamespaced {
if !metav1.IsControlledBy(curr, owner) {
return fmt.Errorf("failed to update %s %s/%s: it is not controlled by %s %s/%s",
typeLogLine,
curr.GetNamespace(), curr.GetName(),
ownerLogLine,
owner.GetNamespace(), owner.GetName(),
)
}
}

expected.SetResourceVersion(curr.GetResourceVersion())
Expand Down
50 changes: 49 additions & 1 deletion pkg/controller/llmisvc/scheduler.go
Original file line number Diff line number Diff line change
Expand Up @@ -61,6 +61,19 @@ func (r *LLMInferenceServiceReconciler) reconcileScheduler(ctx context.Context,
return nil
}

func (r *LLMInferenceServiceReconciler) reconcileSchedulerAuthDelegatorBinding(ctx context.Context, llmSvc *v1alpha1.LLMInferenceService, sa *corev1.ServiceAccount) error {
authDelegatorBinding := r.expectedSchedulerAuthDelegatorBinding(llmSvc, sa)
if !llmSvc.DeletionTimestamp.IsZero() || llmSvc.Spec.Router == nil || llmSvc.Spec.Router.Scheduler == nil {
Copy link
Copy Markdown

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

!llmSvc.DeletionTimestamp.IsZero() - I noticed we don't do it in other places. How about moving this check to Delete func as a guard to all types of resources? That would simplify the code below too.

Copy link
Copy Markdown
Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

But notice that this condition is over llmSvc instead of authDelegatorBinding. I did this way just to not tamper with the garbage collector for the owned resources which should automatically do the deletion so that, to some extent, it preserves behavior.

In practice, I'm not sure how useful the guard in Delete will be. I see the pattern in the code is to delete the built expected resource; i.e. it doesn't query for the actual one, so I don't think we will have the timestamp. In any case, I agree that having the guard is good. I'll dd it. Yet, IMO this check over llmSvc should stay, unless we are OK with the controller trying the delete despite the ownership?

Copy link
Copy Markdown

@bartoszmajsak bartoszmajsak Jul 24, 2025

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

I did this way just to not tamper with the garbage collector for the owned resources which should automatically do the deletion

I was thinking to move this down to Delete and guard in the same fashion - if expected is owned by existing resource (owner) we would have a timestamp on owner.

But I see now this is about cluster-scoped resource that I missed going quickly through the code. Apologies for the initial noise.

I am not sure about using Delete for such a case. This method expects an owner and here it is misleading. Perhaps we should just delete it using available client? The same applies to Reconcile. Thinking about it more - maybe we need separate methods for cluster-scoped resources instead of patching the behaviour of the code path not intended for this use-case?

Thoughts @pierDipi?

Copy link
Copy Markdown
Member

@pierDipi pierDipi Jul 25, 2025

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Maybe what we want is to allow nil owner for Reconcile and Delete without having separate paths for cluster-scoped resources?

Copy link
Copy Markdown

@bartoszmajsak bartoszmajsak Jul 25, 2025

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

I am a bit alergic to nils as I've seen many methods with couple of those passed out in the wild. That is making the intent blurry, but we can refactor later.

Copy link
Copy Markdown
Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

I'm not seeing any actionable change. Let me know if I'm misunderstanding.

Copy link
Copy Markdown

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

but we can refactor later.

follow-up :)

return Delete(ctx, r, llmSvc, authDelegatorBinding)
}

if err := Reconcile(ctx, r, llmSvc, &rbacv1.ClusterRoleBinding{}, authDelegatorBinding, semanticClusterRoleBindingIsEqual); err != nil {
return fmt.Errorf("failed to reconcile scheduler clusterrolebinding %s: %w", authDelegatorBinding.GetName(), err)
}

return nil
}

func (r *LLMInferenceServiceReconciler) reconcileSchedulerRole(ctx context.Context, llmSvc *v1alpha1.LLMInferenceService) error {
role := r.expectedSchedulerRole(llmSvc)
if llmSvc.Spec.Router == nil || llmSvc.Spec.Router.Scheduler == nil {
Expand Down Expand Up @@ -88,6 +101,11 @@ func (r *LLMInferenceServiceReconciler) reconcileSchedulerRoleBinding(ctx contex

func (r *LLMInferenceServiceReconciler) reconcileSchedulerServiceAccount(ctx context.Context, llmSvc *v1alpha1.LLMInferenceService) error {
serviceAccount := r.expectedSchedulerServiceAccount(llmSvc)

if !llmSvc.DeletionTimestamp.IsZero() {
return r.reconcileSchedulerAuthDelegatorBinding(ctx, llmSvc, serviceAccount)
}

if llmSvc.Spec.Router == nil || llmSvc.Spec.Router.Scheduler == nil {
return Delete(ctx, r, llmSvc, serviceAccount)
}
Expand All @@ -96,6 +114,10 @@ func (r *LLMInferenceServiceReconciler) reconcileSchedulerServiceAccount(ctx con
return fmt.Errorf("failed to reconcile scheduler service account %s/%s: %w", serviceAccount.GetNamespace(), serviceAccount.GetName(), err)
}

if err := r.reconcileSchedulerAuthDelegatorBinding(ctx, llmSvc, serviceAccount); err != nil {
return err
}

if err := r.reconcileSchedulerRole(ctx, llmSvc); err != nil {
return err
}
Expand Down Expand Up @@ -390,6 +412,26 @@ func (r *LLMInferenceServiceReconciler) expectedSchedulerServiceAccount(llmSvc *
return sa
}

func (r *LLMInferenceServiceReconciler) expectedSchedulerAuthDelegatorBinding(llmSvc *v1alpha1.LLMInferenceService, sa *corev1.ServiceAccount) *rbacv1.ClusterRoleBinding {
crb := &rbacv1.ClusterRoleBinding{
ObjectMeta: metav1.ObjectMeta{
Name: kmeta.ChildName(llmSvc.GetNamespace(), "-"+llmSvc.GetName()+"-epp-auth-rb"),
Labels: r.schedulerLabels(llmSvc),
},
Subjects: []rbacv1.Subject{{
Kind: "ServiceAccount",
Name: sa.GetName(),
Namespace: sa.GetNamespace(),
}},
RoleRef: rbacv1.RoleRef{
APIGroup: "rbac.authorization.k8s.io",
Kind: "ClusterRole",
Name: "system:auth-delegator",
},
}
return crb
}

func (r *LLMInferenceServiceReconciler) expectedSchedulerRole(llmSvc *v1alpha1.LLMInferenceService) *rbacv1.Role {
role := &rbacv1.Role{
ObjectMeta: metav1.ObjectMeta{
Expand All @@ -404,7 +446,6 @@ func (r *LLMInferenceServiceReconciler) expectedSchedulerRole(llmSvc *v1alpha1.L
{APIGroups: []string{""}, Resources: []string{"pods"}, Verbs: []string{"get", "list", "watch"}},
{APIGroups: []string{"inference.networking.x-k8s.io"}, Resources: []string{"inferencepools", "inferencemodels"}, Verbs: []string{"get", "list", "watch"}},
{APIGroups: []string{"discovery.k8s.io"}, Resources: []string{"endpointslices"}, Verbs: []string{"get", "list", "watch"}},
{APIGroups: []string{"authentication.k8s.io"}, Resources: []string{"tokenreviews", "subjectaccessreviews"}, Verbs: []string{"create"}},
},
}
return role
Expand Down Expand Up @@ -465,6 +506,13 @@ func semanticRoleIsEqual(expected *rbacv1.Role, curr *rbacv1.Role) bool {
equality.Semantic.DeepDerivative(expected.Annotations, curr.Annotations)
}

func semanticClusterRoleBindingIsEqual(expected *rbacv1.ClusterRoleBinding, curr *rbacv1.ClusterRoleBinding) bool {
return equality.Semantic.DeepDerivative(expected.Subjects, curr.Subjects) &&
equality.Semantic.DeepDerivative(expected.RoleRef, curr.RoleRef) &&
equality.Semantic.DeepDerivative(expected.Labels, curr.Labels) &&
equality.Semantic.DeepDerivative(expected.Annotations, curr.Annotations)
}

func semanticRoleBindingIsEqual(expected *rbacv1.RoleBinding, curr *rbacv1.RoleBinding) bool {
return equality.Semantic.DeepDerivative(expected.Subjects, curr.Subjects) &&
equality.Semantic.DeepDerivative(expected.RoleRef, curr.RoleRef) &&
Expand Down
Loading