From 386c36828e724f983dd4058ed5c22592f345f1f5 Mon Sep 17 00:00:00 2001
From: Xida Ren
Date: Mon, 9 Dec 2024 21:08:39 +0000
Subject: [PATCH 1/2] initial fix

---
 shortfin/python/shortfin_apps/llm/components/service.py | 7 ++++---
 1 file changed, 4 insertions(+), 3 deletions(-)

diff --git a/shortfin/python/shortfin_apps/llm/components/service.py b/shortfin/python/shortfin_apps/llm/components/service.py
index 5b43c1310..5fa273b1f 100644
--- a/shortfin/python/shortfin_apps/llm/components/service.py
+++ b/shortfin/python/shortfin_apps/llm/components/service.py
@@ -340,8 +340,9 @@ async def run(self):
         for r in self.exec_requests:
             assert r.start_position == 0
 
+        extra_token_slots = 1 if is_decode else 0
         bsl = max(
-            (r.start_position + len(r.input_token_ids)) for r in self.exec_requests
+            (extra_token_slots + len(r.input_token_ids)) for r in self.exec_requests
         )
         bsl = int(math.ceil(bsl / seq_stride) * seq_stride)
         block_count = bsl // seq_stride
@@ -389,13 +390,13 @@ async def run(self):
         if self.phase == InferencePhase.DECODE:
             start_positions_host = start_positions.for_transfer()
             with start_positions_host.map(discard=True) as m:
-                m.fill(0)
+                m.fill(1)  # Pad unused requests. Must pad with nonzero value because division by 0 floods clobber page (page 0) in cache with NaN values.
                 m.items = [req.start_position for req in self.exec_requests]
             start_positions_host.copy_to(start_positions)
 
             seq_lens_host = seq_lens.for_transfer()
             with seq_lens_host.map(discard=True) as m:
-                m.fill(0)
+                m.fill(1)  # Pad unused requests. Must pad with nonzero value because division by 0 floods clobber page (page 0) in cache with NaN values.
                 m.items = [
                     req.start_position + len(req.input_token_ids)
                     for req in self.exec_requests

From 176f0aa38b90c481d258fbd30f97ab4fd97f49c3 Mon Sep 17 00:00:00 2001
From: Xida Ren
Date: Mon, 9 Dec 2024 23:45:06 +0000
Subject: [PATCH 2/2] precommit

---
 shortfin/python/shortfin_apps/llm/components/service.py | 8 ++++++--
 1 file changed, 6 insertions(+), 2 deletions(-)

diff --git a/shortfin/python/shortfin_apps/llm/components/service.py b/shortfin/python/shortfin_apps/llm/components/service.py
index 5fa273b1f..71d54c234 100644
--- a/shortfin/python/shortfin_apps/llm/components/service.py
+++ b/shortfin/python/shortfin_apps/llm/components/service.py
@@ -390,13 +390,17 @@ async def run(self):
         if self.phase == InferencePhase.DECODE:
             start_positions_host = start_positions.for_transfer()
             with start_positions_host.map(discard=True) as m:
-                m.fill(1)  # Pad unused requests. Must pad with nonzero value because division by 0 floods clobber page (page 0) in cache with NaN values.
+                m.fill(
+                    1
+                )  # Pad unused requests. Must pad with nonzero value because division by 0 floods clobber page (page 0) in cache with NaN values.
                 m.items = [req.start_position for req in self.exec_requests]
             start_positions_host.copy_to(start_positions)
 
             seq_lens_host = seq_lens.for_transfer()
             with seq_lens_host.map(discard=True) as m:
-                m.fill(1)  # Pad unused requests. Must pad with nonzero value because division by 0 floods clobber page (page 0) in cache with NaN values.
+                m.fill(
+                    1
+                )  # Pad unused requests. Must pad with nonzero value because division by 0 floods clobber page (page 0) in cache with NaN values.
                 m.items = [
                     req.start_position + len(req.input_token_ids)
                     for req in self.exec_requests
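
For readers outside the patch context, the sketch below restates what the two changes do as plain, standalone Python. It is not shortfin's API and is not part of the patch: the function names, the seq_stride default, and the dict-shaped requests are illustrative stand-ins for the real exec_requests objects and device buffers.

import math

def padded_batch_seq_len(token_counts, is_decode, seq_stride=16):
    # Patch 1: during decode, reserve one extra slot for the token being
    # generated; prefill requests already carry all of their tokens.
    extra_token_slots = 1 if is_decode else 0
    bsl = max(extra_token_slots + n for n in token_counts)
    # Round the batch sequence length up to the KV-cache page granularity.
    return int(math.ceil(bsl / seq_stride) * seq_stride)

def padded_seq_lens(requests, batch_size):
    # Patch 1: unused batch slots are padded with 1, not 0. A zero
    # sequence length causes a division by zero downstream, which floods
    # cache page 0 with NaN values.
    seq_lens = [1] * batch_size
    for i, req in enumerate(requests):
        seq_lens[i] = req["start_position"] + len(req["input_token_ids"])
    return seq_lens

if __name__ == "__main__":
    reqs = [
        {"start_position": 5, "input_token_ids": [11]},
        {"start_position": 3, "input_token_ids": [7]},
    ]
    counts = [len(r["input_token_ids"]) for r in reqs]
    print(padded_batch_seq_len(counts, is_decode=True))  # -> 16
    print(padded_seq_lens(reqs, batch_size=4))           # -> [6, 4, 1, 1]

The second patch only reflows the fill(1) calls to satisfy the pre-commit formatter; the behavior is unchanged from the first patch.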