From cbd9d8544312dd1cdbbfcd484590943d17797753 Mon Sep 17 00:00:00 2001 From: David Holtz <56723830+dmholtz@users.noreply.github.com> Date: Wed, 8 Apr 2026 08:23:53 +0000 Subject: [PATCH 1/3] Improve docstring of dataclass. This PR highlights additional settings for multi-node training on K8s through a remark in the Elastic docstring, helping users avoid TCPStore timeouts if the zero worker starts slower than any other worker. Signed-off-by: David Holtz <56723830+dmholtz@users.noreply.github.com> --- .../flytekit-kf-pytorch/flytekitplugins/kfpytorch/task.py | 7 +++++-- 1 file changed, 5 insertions(+), 2 deletions(-) diff --git a/plugins/flytekit-kf-pytorch/flytekitplugins/kfpytorch/task.py b/plugins/flytekit-kf-pytorch/flytekitplugins/kfpytorch/task.py index 62e4e0ccda..7eb1121153 100644 --- a/plugins/flytekit-kf-pytorch/flytekitplugins/kfpytorch/task.py +++ b/plugins/flytekit-kf-pytorch/flytekitplugins/kfpytorch/task.py @@ -142,15 +142,18 @@ class Elastic(object): start_method (str): Multiprocessing start method to use when creating workers. monitor_interval (int): Interval, in seconds, to monitor the state of workers. max_restarts (int): Maximum number of worker group restarts before failing. - rdzv_configs (Dict[str, Any]): Additional rendezvous configs to pass to torch elastic, e.g. `{"timeout": 1200, "join_timeout": 900}`. + rdzv_configs (Dict[str, Any]): Additional rendezvous configs to pass to torch elastic, e.g., `{"timeout": 1200, "join_timeout": 900}`. See `torch.distributed.launcher.api.LaunchConfig` and `torch.distributed.elastic.rendezvous.dynamic_rendezvous.create_handler`. Default timeouts are set to 15 minutes to account for the fact that some workers might start faster than others: Some pods might be assigned to a running node which might have the image in its cache while other workers might require a node scale up and image pull. 
+ When using the default `torch.distributed.elastic.rendezvous.c10d_rendezvous_backend.C10dRendezvousBackend`, consider also increasing + the TCPStore `read_timeout`, e.g., {"timeout": 900, "join_timeout": 900, "read_timeout": 900}, as its default value of 60 seconds + might be too tight if the zero-worker starts slower than any other worker. increase_shared_mem (bool): [DEPRECATED] This argument is deprecated. Use `@task(shared_memory=...)` instead. PyTorch uses shared memory to share data between processes. If torch multiprocessing is used (e.g. for multi-processed data loaders) the default shared memory segment size that the container runs with might not be enough - and and one might have to increase the shared memory size. This option configures the task's pod template to mount + and one might have to increase the shared memory size. This option configures the task's pod template to mount an `emptyDir` volume with medium `Memory` to to `/dev/shm`. The shared memory size upper limit is the sum of the memory limits of the containers in the pod. run_policy: Configuration for the run policy. 
From 611c3f7bea3a78c056f2948d4bea63a1906c515c Mon Sep 17 00:00:00 2001 From: David Holtz <56723830+dmholtz@users.noreply.github.com> Date: Thu, 9 Apr 2026 19:09:07 +0000 Subject: [PATCH 2/3] Add remark about absence of gang-scheduling Signed-off-by: David Holtz <56723830+dmholtz@users.noreply.github.com> --- plugins/flytekit-kf-pytorch/flytekitplugins/kfpytorch/task.py | 2 ++ 1 file changed, 2 insertions(+) diff --git a/plugins/flytekit-kf-pytorch/flytekitplugins/kfpytorch/task.py b/plugins/flytekit-kf-pytorch/flytekitplugins/kfpytorch/task.py index 7eb1121153..e9a1c1a4da 100644 --- a/plugins/flytekit-kf-pytorch/flytekitplugins/kfpytorch/task.py +++ b/plugins/flytekit-kf-pytorch/flytekitplugins/kfpytorch/task.py @@ -149,6 +149,8 @@ class Elastic(object): When using the default `torch.distributed.elastic.rendezvous.c10d_rendezvous_backend.C10dRendezvousBackend`, consider also increasing the TCPStore `read_timeout`, e.g., {"timeout": 900, "join_timeout": 900, "read_timeout": 900}, as its default value of 60 seconds might be too tight if the zero-worker starts slower than any other worker. + Increasing the default timeouts is mostly relevant in the absence of true gang-scheduling on the cluster, as provided by + coscheduling or volcano. increase_shared_mem (bool): [DEPRECATED] This argument is deprecated. Use `@task(shared_memory=...)` instead. PyTorch uses shared memory to share data between processes. If torch multiprocessing is used From 6bb5ca115477fa3e70f9d0ce9a00432ef1dc6392 Mon Sep 17 00:00:00 2001 From: David Holtz <56723830+dmholtz@users.noreply.github.com> Date: Fri, 10 Apr 2026 14:56:30 +0200 Subject: [PATCH 3/3] Update plugins/flytekit-kf-pytorch/flytekitplugins/kfpytorch/task.py Co-authored-by: Fabio M. Graetz, Ph.D. 
Signed-off-by: David Holtz <56723830+dmholtz@users.noreply.github.com> --- plugins/flytekit-kf-pytorch/flytekitplugins/kfpytorch/task.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/plugins/flytekit-kf-pytorch/flytekitplugins/kfpytorch/task.py b/plugins/flytekit-kf-pytorch/flytekitplugins/kfpytorch/task.py index e9a1c1a4da..e73623913a 100644 --- a/plugins/flytekit-kf-pytorch/flytekitplugins/kfpytorch/task.py +++ b/plugins/flytekit-kf-pytorch/flytekitplugins/kfpytorch/task.py @@ -149,7 +149,7 @@ class Elastic(object): When using the default `torch.distributed.elastic.rendezvous.c10d_rendezvous_backend.C10dRendezvousBackend`, consider also increasing the TCPStore `read_timeout`, e.g., {"timeout": 900, "join_timeout": 900, "read_timeout": 900}, as its default value of 60 seconds might be too tight if the zero-worker starts slower than any other worker. - Increasing the default timeouts is mostly relevant in the absence of true gang-scheduling on the cluster, as provided by + Increasing the default timeouts is mostly relevant in the absence of true gang-scheduling on the cluster, as provided by e.g. coscheduling or volcano. increase_shared_mem (bool): [DEPRECATED] This argument is deprecated. Use `@task(shared_memory=...)` instead.