From 386c36828e724f983dd4058ed5c22592f345f1f5 Mon Sep 17 00:00:00 2001
From: Xida Ren
Date: Mon, 9 Dec 2024 21:08:39 +0000
Subject: [PATCH 1/2] initial fix

---
 shortfin/python/shortfin_apps/llm/components/service.py | 7 ++++---
 1 file changed, 4 insertions(+), 3 deletions(-)

diff --git a/shortfin/python/shortfin_apps/llm/components/service.py b/shortfin/python/shortfin_apps/llm/components/service.py
index 5b43c1310..5fa273b1f 100644
--- a/shortfin/python/shortfin_apps/llm/components/service.py
+++ b/shortfin/python/shortfin_apps/llm/components/service.py
@@ -340,8 +340,9 @@ async def run(self):
         for r in self.exec_requests:
             assert r.start_position == 0
 
+        extra_token_slots = 1 if is_decode else 0
         bsl = max(
-            (r.start_position + len(r.input_token_ids)) for r in self.exec_requests
+            (extra_token_slots + len(r.input_token_ids)) for r in self.exec_requests
         )
         bsl = int(math.ceil(bsl / seq_stride) * seq_stride)
         block_count = bsl // seq_stride
@@ -389,13 +390,13 @@ async def run(self):
         if self.phase == InferencePhase.DECODE:
             start_positions_host = start_positions.for_transfer()
             with start_positions_host.map(discard=True) as m:
-                m.fill(0)
+                m.fill(1)  # Pad unused requests. Must pad with nonzero value because division by 0 floods clobber page (page 0) in cache with NaN values.
                 m.items = [req.start_position for req in self.exec_requests]
             start_positions_host.copy_to(start_positions)
 
             seq_lens_host = seq_lens.for_transfer()
             with seq_lens_host.map(discard=True) as m:
-                m.fill(0)
+                m.fill(1)  # Pad unused requests. Must pad with nonzero value because division by 0 floods clobber page (page 0) in cache with NaN values.
                 m.items = [
                     req.start_position + len(req.input_token_ids)
                     for req in self.exec_requests

From 176f0aa38b90c481d258fbd30f97ab4fd97f49c3 Mon Sep 17 00:00:00 2001
From: Xida Ren
Date: Mon, 9 Dec 2024 23:45:06 +0000
Subject: [PATCH 2/2] precommit

---
 shortfin/python/shortfin_apps/llm/components/service.py | 8 ++++++--
 1 file changed, 6 insertions(+), 2 deletions(-)

diff --git a/shortfin/python/shortfin_apps/llm/components/service.py b/shortfin/python/shortfin_apps/llm/components/service.py
index 5fa273b1f..71d54c234 100644
--- a/shortfin/python/shortfin_apps/llm/components/service.py
+++ b/shortfin/python/shortfin_apps/llm/components/service.py
@@ -390,13 +390,17 @@ async def run(self):
         if self.phase == InferencePhase.DECODE:
             start_positions_host = start_positions.for_transfer()
             with start_positions_host.map(discard=True) as m:
-                m.fill(1)  # Pad unused requests. Must pad with nonzero value because division by 0 floods clobber page (page 0) in cache with NaN values.
+                m.fill(
+                    1
+                )  # Pad unused requests. Must pad with nonzero value because division by 0 floods clobber page (page 0) in cache with NaN values.
                 m.items = [req.start_position for req in self.exec_requests]
             start_positions_host.copy_to(start_positions)
 
             seq_lens_host = seq_lens.for_transfer()
             with seq_lens_host.map(discard=True) as m:
-                m.fill(1)  # Pad unused requests. Must pad with nonzero value because division by 0 floods clobber page (page 0) in cache with NaN values.
+                m.fill(
+                    1
+                )  # Pad unused requests. Must pad with nonzero value because division by 0 floods clobber page (page 0) in cache with NaN values.
                 m.items = [
                     req.start_position + len(req.input_token_ids)
                     for req in self.exec_requests
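
For readers outside the patch context, the sketch below restates what the two changes do as plain, standalone Python. It is not shortfin's API and is not part of the patch: the function names, the seq_stride default, and the dict-shaped requests are illustrative stand-ins for the real exec_requests objects and device buffers.

import math

def padded_batch_seq_len(token_counts, is_decode, seq_stride=16):
    # Patch 1: during decode, reserve one extra slot for the token being
    # generated; prefill requests already carry all of their tokens.
    extra_token_slots = 1 if is_decode else 0
    bsl = max(extra_token_slots + n for n in token_counts)
    # Round the batch sequence length up to the KV-cache page granularity.
    return int(math.ceil(bsl / seq_stride) * seq_stride)

def padded_seq_lens(requests, batch_size):
    # Patch 1: unused batch slots are padded with 1, not 0. A zero
    # sequence length causes a division by zero downstream, which floods
    # cache page 0 with NaN values.
    seq_lens = [1] * batch_size
    for i, req in enumerate(requests):
        seq_lens[i] = req["start_position"] + len(req["input_token_ids"])
    return seq_lens

if __name__ == "__main__":
    reqs = [
        {"start_position": 5, "input_token_ids": [11]},
        {"start_position": 3, "input_token_ids": [7]},
    ]
    counts = [len(r["input_token_ids"]) for r in reqs]
    print(padded_batch_seq_len(counts, is_decode=True))  # -> 16
    print(padded_seq_lens(reqs, batch_size=4))           # -> [6, 4, 1, 1]

The second patch only reflows the fill(1) calls to satisfy the pre-commit formatter; the behavior is unchanged from the first patch.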