Skip to content

Commit

Permalink
small fix
Browse files Browse the repository at this point in the history
  • Loading branch information
blahBlahhhJ committed Oct 29, 2024
1 parent dc87e85 commit ad9ef30
Show file tree
Hide file tree
Showing 2 changed files with 10 additions and 6 deletions.
4 changes: 2 additions & 2 deletions infra/cluster/job-cluster.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -129,7 +129,7 @@ available_node_types:
tpu_slice_v5e_16:
min_workers: 0
max_workers: 1024
- resources: { "CPU": 120, "TPU": 8 }
+ resources: { "CPU": 120, "TPU": 4 }

node_config:
acceleratorType: v5litepod-16
Expand All @@ -142,7 +142,7 @@ available_node_types:
tpu_slice_v5e_256:
min_workers: 0
max_workers: 1024
- resources: { "CPU": 120, "TPU": 8 }
+ resources: { "CPU": 120, "TPU": 4 }

node_config:
acceleratorType: v5litepod-256
Expand Down
12 changes: 8 additions & 4 deletions src/levanter/infra/ray_tpu.py
Original file line number Diff line number Diff line change
Expand Up @@ -305,8 +305,10 @@ def run_on_pod_multislice_resumable(
outs = ray.get(futures)
except ray.exceptions.RayTaskError as e:
for f in futures:
- ray.cancel(f)
- logger.info(f"Cancelling {f}")
+ try:
+     ray.cancel(f)
+ except Exception:
+     logger.exception("Failed to kill job after primary failure")
problem = e
if "preempted" in str(e).lower():
num_preemptions += 1
Expand All @@ -317,8 +319,10 @@ def run_on_pod_multislice_resumable(
continue
except Exception as e:
for f in futures:
- ray.cancel(f)
- logger.info(f"Cancelling {f}")
+ try:
+     ray.cancel(f)
+ except Exception:
+     logger.exception("Failed to kill job after primary failure")
problem = e
num_failures += 1
if num_failures >= max_retries_failure:
Expand Down

0 comments on commit ad9ef30

Please sign in to comment.