From cbd9d8544312dd1cdbbfcd484590943d17797753 Mon Sep 17 00:00:00 2001 From: David Holtz <56723830+dmholtz@users.noreply.github.com> Date: Wed, 8 Apr 2026 08:23:53 +0000 Subject: [PATCH 1/3] Improve docstring of dataclass. This PR highlights additional settings for multi-node training on K8s through a remark in the Elastic docstring, helping users avoid TCPStore timeouts if the zero worker starts slower than any other worker. Signed-off-by: David Holtz <56723830+dmholtz@users.noreply.github.com> --- .../flytekit-kf-pytorch/flytekitplugins/kfpytorch/task.py | 7 +++++-- 1 file changed, 5 insertions(+), 2 deletions(-) diff --git a/plugins/flytekit-kf-pytorch/flytekitplugins/kfpytorch/task.py b/plugins/flytekit-kf-pytorch/flytekitplugins/kfpytorch/task.py index 62e4e0ccda..7eb1121153 100644 --- a/plugins/flytekit-kf-pytorch/flytekitplugins/kfpytorch/task.py +++ b/plugins/flytekit-kf-pytorch/flytekitplugins/kfpytorch/task.py @@ -142,15 +142,18 @@ class Elastic(object): start_method (str): Multiprocessing start method to use when creating workers. monitor_interval (int): Interval, in seconds, to monitor the state of workers. max_restarts (int): Maximum number of worker group restarts before failing. - rdzv_configs (Dict[str, Any]): Additional rendezvous configs to pass to torch elastic, e.g. `{"timeout": 1200, "join_timeout": 900}`. + rdzv_configs (Dict[str, Any]): Additional rendezvous configs to pass to torch elastic, e.g., `{"timeout": 1200, "join_timeout": 900}`. See `torch.distributed.launcher.api.LaunchConfig` and `torch.distributed.elastic.rendezvous.dynamic_rendezvous.create_handler`. Default timeouts are set to 15 minutes to account for the fact that some workers might start faster than others: Some pods might be assigned to a running node which might have the image in its cache while other workers might require a node scale up and image pull. 
+ When using the default `torch.distributed.elastic.rendezvous.c10d_rendezvous_backend.C10dRendezvousBackend`, consider also increasing + the TCPStore `read_timeout`, e.g., {"timeout": 900, "join_timeout": 900, "read_timeout": 900}, as its default value of 60 seconds + might be too tight if the zero-worker starts slower than any other worker. increase_shared_mem (bool): [DEPRECATED] This argument is deprecated. Use `@task(shared_memory=...)` instead. PyTorch uses shared memory to share data between processes. If torch multiprocessing is used (e.g. for multi-processed data loaders) the default shared memory segment size that the container runs with might not be enough - and and one might have to increase the shared memory size. This option configures the task's pod template to mount + and one might have to increase the shared memory size. This option configures the task's pod template to mount an `emptyDir` volume with medium `Memory` to to `/dev/shm`. The shared memory size upper limit is the sum of the memory limits of the containers in the pod. run_policy: Configuration for the run policy. 
From 611c3f7bea3a78c056f2948d4bea63a1906c515c Mon Sep 17 00:00:00 2001 From: David Holtz <56723830+dmholtz@users.noreply.github.com> Date: Thu, 9 Apr 2026 19:09:07 +0000 Subject: [PATCH 2/3] Add remark about absence of gang-scheduling Signed-off-by: David Holtz <56723830+dmholtz@users.noreply.github.com> --- plugins/flytekit-kf-pytorch/flytekitplugins/kfpytorch/task.py | 2 ++ 1 file changed, 2 insertions(+) diff --git a/plugins/flytekit-kf-pytorch/flytekitplugins/kfpytorch/task.py b/plugins/flytekit-kf-pytorch/flytekitplugins/kfpytorch/task.py index 7eb1121153..e9a1c1a4da 100644 --- a/plugins/flytekit-kf-pytorch/flytekitplugins/kfpytorch/task.py +++ b/plugins/flytekit-kf-pytorch/flytekitplugins/kfpytorch/task.py @@ -149,6 +149,8 @@ class Elastic(object): When using the default `torch.distributed.elastic.rendezvous.c10d_rendezvous_backend.C10dRendezvousBackend`, consider also increasing the TCPStore `read_timeout`, e.g., {"timeout": 900, "join_timeout": 900, "read_timeout": 900}, as its default value of 60 seconds might be too tight if the zero-worker starts slower than any other worker. + Increasing the default timeouts is mostly relevant in the absence of true gang-scheduling on the cluster, as provided by + coscheduling or volcano. increase_shared_mem (bool): [DEPRECATED] This argument is deprecated. Use `@task(shared_memory=...)` instead. PyTorch uses shared memory to share data between processes. If torch multiprocessing is used From 6bb5ca115477fa3e70f9d0ce9a00432ef1dc6392 Mon Sep 17 00:00:00 2001 From: David Holtz <56723830+dmholtz@users.noreply.github.com> Date: Fri, 10 Apr 2026 14:56:30 +0200 Subject: [PATCH 3/3] Update plugins/flytekit-kf-pytorch/flytekitplugins/kfpytorch/task.py Co-authored-by: Fabio M. Graetz, Ph.D. 
Signed-off-by: David Holtz <56723830+dmholtz@users.noreply.github.com> --- plugins/flytekit-kf-pytorch/flytekitplugins/kfpytorch/task.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/plugins/flytekit-kf-pytorch/flytekitplugins/kfpytorch/task.py b/plugins/flytekit-kf-pytorch/flytekitplugins/kfpytorch/task.py index e9a1c1a4da..e73623913a 100644 --- a/plugins/flytekit-kf-pytorch/flytekitplugins/kfpytorch/task.py +++ b/plugins/flytekit-kf-pytorch/flytekitplugins/kfpytorch/task.py @@ -149,7 +149,7 @@ class Elastic(object): When using the default `torch.distributed.elastic.rendezvous.c10d_rendezvous_backend.C10dRendezvousBackend`, consider also increasing the TCPStore `read_timeout`, e.g., {"timeout": 900, "join_timeout": 900, "read_timeout": 900}, as its default value of 60 seconds might be too tight if the zero-worker starts slower than any other worker. - Increasing the default timeouts is mostly relevant in the absence of true gang-scheduling on the cluster, as provided by + Increasing the default timeouts is mostly relevant in the absence of true gang-scheduling on the cluster, as provided by e.g. coscheduling or volcano. increase_shared_mem (bool): [DEPRECATED] This argument is deprecated. Use `@task(shared_memory=...)` instead.