From 4bb53e2dde809ea5727b8cac95a080893733a1ef Mon Sep 17 00:00:00 2001
From: leiwen83 <leiwen83@users.noreply.github.com>
Date: Wed, 1 May 2024 01:12:59 +0800
Subject: [PATCH 01/65] [BugFix] fix num_lookahead_slots missing in async
 executor (#4165)

Co-authored-by: Lei Wen <wenlei03@qiyi.com>
---
 tests/spec_decode/e2e/conftest.py           | 125 +++++++++++++++++++-
 tests/spec_decode/e2e/test_compatibility.py |  15 ++-
 tests/spec_decode/e2e/test_correctness.py   |  25 ++--
 vllm/engine/async_llm_engine.py             |   6 +-
 vllm/executor/cpu_executor.py               |   4 +-
 vllm/executor/executor_base.py              |   1 +
 vllm/executor/gpu_executor.py               |   4 +-
 vllm/executor/neuron_executor.py            |   1 +
 vllm/executor/ray_gpu_executor.py           |   1 +
 9 files changed, 163 insertions(+), 19 deletions(-)

diff --git a/tests/spec_decode/e2e/conftest.py b/tests/spec_decode/e2e/conftest.py
index 59fb8311fc5b7..5d3469c4210ee 100644
--- a/tests/spec_decode/e2e/conftest.py
+++ b/tests/spec_decode/e2e/conftest.py
@@ -1,10 +1,127 @@
-from typing import List, Tuple
+import asyncio
+from typing import List, Optional, Tuple, Union
 
 import pytest
+import ray
 
 from tests.conftest import cleanup
 from vllm import LLM
+from vllm.engine.arg_utils import AsyncEngineArgs
+from vllm.engine.async_llm_engine import AsyncLLMEngine
+from vllm.lora.request import LoRARequest
 from vllm.model_executor.utils import set_random_seed
+from vllm.outputs import RequestOutput
+from vllm.sampling_params import SamplingParams
+from vllm.sequence import MultiModalData
+from vllm.usage.usage_lib import UsageContext
+from vllm.utils import Counter, random_uuid
+
+
+class AsyncLLM:
+    """AsyncLLM
+
+    Note: The current LLM class in vllm doesn't support async mode, so for
+    test purposes we implement an async one here. Maybe we could move it to
+    vllm/entrypoints/llm.py in the future.
+
+    The AsyncLLM below is directly borrowed from vllm/entrypoints/llm.py, with
+    changes to make it work in async mode.
+    """
+
+    def __init__(
+        self,
+        model: str,
+        tokenizer: Optional[str] = None,
+        tokenizer_mode: str = "auto",
+        skip_tokenizer_init: bool = False,
+        trust_remote_code: bool = False,
+        tensor_parallel_size: int = 1,
+        dtype: str = "auto",
+        quantization: Optional[str] = None,
+        revision: Optional[str] = None,
+        tokenizer_revision: Optional[str] = None,
+        seed: int = 0,
+        gpu_memory_utilization: float = 0.9,
+        swap_space: int = 4,
+        enforce_eager: bool = False,
+        max_context_len_to_capture: int = 8192,
+        disable_custom_all_reduce: bool = False,
+        **kwargs,
+    ) -> None:
+        if "disable_log_stats" not in kwargs:
+            kwargs["disable_log_stats"] = True
+        self.engine_args = AsyncEngineArgs(
+            model=model,
+            tokenizer=tokenizer,
+            tokenizer_mode=tokenizer_mode,
+            skip_tokenizer_init=skip_tokenizer_init,
+            trust_remote_code=trust_remote_code,
+            tensor_parallel_size=tensor_parallel_size,
+            dtype=dtype,
+            quantization=quantization,
+            revision=revision,
+            tokenizer_revision=tokenizer_revision,
+            seed=seed,
+            gpu_memory_utilization=gpu_memory_utilization,
+            swap_space=swap_space,
+            enforce_eager=enforce_eager,
+            max_context_len_to_capture=max_context_len_to_capture,
+            engine_use_ray=True,
+            disable_custom_all_reduce=disable_custom_all_reduce,
+            **kwargs,
+        )
+        self.request_counter = Counter()
+
+    def generate(
+        self,
+        prompts: Optional[Union[str, List[str]]] = None,
+        sampling_params: Optional[Union[SamplingParams,
+                                        List[SamplingParams]]] = None,
+        prompt_token_ids: Optional[List[List[int]]] = None,
+        use_tqdm: bool = True,
+        lora_request: Optional[LoRARequest] = None,
+        multi_modal_data: Optional[MultiModalData] = None,
+    ) -> List[RequestOutput]:
+
+        llm_engine = AsyncLLMEngine.from_engine_args(
+            self.engine_args, usage_context=UsageContext.LLM_CLASS)
+
+        if prompts is None:
+            raise ValueError("prompts must be provided.")
+        if isinstance(prompts, str):
+            # Convert a single prompt to a list.
+            prompts = [prompts]
+
+        if prompts is not None:
+            num_requests = len(prompts)
+
+        if sampling_params is None:
+            # Use default sampling params.
+            sampling_params = SamplingParams()
+
+        elif isinstance(sampling_params,
+                        list) and len(sampling_params) != num_requests:
+            raise ValueError("The lengths of prompts and "
+                             "sampling_params must be the same.")
+
+        async def get_output(prompt, sampling_param) -> str:
+            request_id = random_uuid()
+            results_generator = llm_engine.generate(prompt, sampling_param,
+                                                    request_id)
+            final_output = None
+            async for request_output in results_generator:
+                final_output = request_output
+            return final_output
+
+        outputs = []
+        try:
+            for i in range(num_requests):
+                prompt = prompts[i] if prompts is not None else None
+                res = asyncio.run(get_output(prompt, sampling_params))
+                outputs.append(res)
+        finally:
+            ray.shutdown()
+        return outputs
 
 
 @pytest.fixture
@@ -36,8 +153,12 @@ def create_llm_generator(baseline_or_test, request, common_llm_kwargs,
 
     def generator_inner():
         print(f'Creating {baseline_or_test=} LLM for {test_name=}. {kwargs=}')
-        llm = LLM(**kwargs)
 
+        use_async = False
+        if "use_async" in kwargs:
+            use_async = kwargs.pop("use_async")
+
+        llm = AsyncLLM(**kwargs) if use_async else LLM(**kwargs)
         set_random_seed(seed)
 
         yield llm
diff --git a/tests/spec_decode/e2e/test_compatibility.py b/tests/spec_decode/e2e/test_compatibility.py
index fde950c14382c..60c20ed7db7a3 100644
--- a/tests/spec_decode/e2e/test_compatibility.py
+++ b/tests/spec_decode/e2e/test_compatibility.py
@@ -42,10 +42,17 @@ def test_spec_decode_xfail_ray(test_llm_generator):
         temperature=temperature,
     )
 
-    with pytest.raises(AssertionError,
-                       match="Speculative decoding not yet supported for "):
-        get_output_from_llm_generator(test_llm_generator, prompts,
-                                      sampling_params)
+    try:
+        with pytest.raises(
+                AssertionError,
+                match="Speculative decoding not yet supported for "):
+            get_output_from_llm_generator(test_llm_generator, prompts,
+                                          sampling_params)
+    finally:
+        # We need to free up the ray resources
+        # so that later tests can use the GPU we allocated here.
+        import ray
+        ray.shutdown()
 
 
 @pytest.mark.parametrize(
diff --git a/tests/spec_decode/e2e/test_correctness.py b/tests/spec_decode/e2e/test_correctness.py
index 0536cc4ecde76..ab8d913fb894a 100644
--- a/tests/spec_decode/e2e/test_correctness.py
+++ b/tests/spec_decode/e2e/test_correctness.py
@@ -40,17 +40,24 @@
 
 @pytest.mark.parametrize(
     "common_llm_kwargs",
-    [{
-        # Use a small model for a fast test.
-        # Note this is repeated in the test body; to initialize a tokenizer.
-        "model": "JackFram/llama-68m",
+    [
+        {
+            # Use a small model for a fast test.
+            # Note this is repeated in the test body; to initialize a tokenizer.
+            "model": "JackFram/llama-68m",
 
-        # Skip cuda graph recording for fast test.
-        "enforce_eager": True,
+            # Skip cuda graph recording for fast test.
+            "enforce_eager": True,
 
-        # Required for spec decode.
-        "use_v2_block_manager": True
-    }])
+            # Required for spec decode.
+            "use_v2_block_manager": True,
+
+            # Whether to use the AsyncLLM engine.
+            "use_async": async_mode,
+        }
+        # Try both async and sync engine execution
+        for async_mode in [True, False]
+    ])
 @pytest.mark.parametrize(
     "per_test_common_llm_kwargs",
     [
diff --git a/vllm/engine/async_llm_engine.py b/vllm/engine/async_llm_engine.py
index 7c1eb2ecbe550..4aceb19b50776 100644
--- a/vllm/engine/async_llm_engine.py
+++ b/vllm/engine/async_llm_engine.py
@@ -211,9 +211,11 @@ async def step_async(self) -> List[RequestOutput]:
         if not scheduler_outputs.is_empty():
             # Execute the model.
             output = await self.model_executor.execute_model_async(
-                seq_group_metadata_list, scheduler_outputs.blocks_to_swap_in,
+                seq_group_metadata_list,
+                scheduler_outputs.blocks_to_swap_in,
                 scheduler_outputs.blocks_to_swap_out,
-                scheduler_outputs.blocks_to_copy)
+                scheduler_outputs.blocks_to_copy,
+                num_lookahead_slots=scheduler_outputs.num_lookahead_slots)
         else:
             output = []
 
diff --git a/vllm/executor/cpu_executor.py b/vllm/executor/cpu_executor.py
index e4436b2144bd3..da1b500cddaf6 100644
--- a/vllm/executor/cpu_executor.py
+++ b/vllm/executor/cpu_executor.py
@@ -109,12 +109,14 @@ async def execute_model_async(
         blocks_to_swap_in: Dict[int, int],
         blocks_to_swap_out: Dict[int, int],
         blocks_to_copy: Dict[int, List[int]],
+        num_lookahead_slots: int,
     ) -> List[SamplerOutput]:
         output = await make_async(self.driver_worker.execute_model)(
             seq_group_metadata_list=seq_group_metadata_list,
             blocks_to_swap_in=blocks_to_swap_in,
             blocks_to_swap_out=blocks_to_swap_out,
-            blocks_to_copy=blocks_to_copy)
+            blocks_to_copy=blocks_to_copy,
+            num_lookahead_slots=num_lookahead_slots)
         return output
 
     async def check_health_async(self) -> None:
diff --git a/vllm/executor/executor_base.py b/vllm/executor/executor_base.py
index c36aa18fb25bb..96cd18250bb37 100644
--- a/vllm/executor/executor_base.py
+++ b/vllm/executor/executor_base.py
@@ -112,6 +112,7 @@ async def execute_model_async(
         blocks_to_swap_in: Dict[int, int],
         blocks_to_swap_out: Dict[int, int],
         blocks_to_copy: Dict[int, List[int]],
+        num_lookahead_slots: int,
     ) -> List[SamplerOutput]:
         """Executes one model step on the given sequences."""
         raise NotImplementedError
diff --git a/vllm/executor/gpu_executor.py b/vllm/executor/gpu_executor.py
index 5ac62f02b99c7..489e66d586028 100644
--- a/vllm/executor/gpu_executor.py
+++ b/vllm/executor/gpu_executor.py
@@ -163,10 +163,12 @@ async def execute_model_async(
         blocks_to_swap_in: Dict[int, int],
         blocks_to_swap_out: Dict[int, int],
         blocks_to_copy: Dict[int, List[int]],
+        num_lookahead_slots: int,
     ) -> List[SamplerOutput]:
         output = await make_async(self.driver_worker.execute_model)(
             seq_group_metadata_list=seq_group_metadata_list,
             blocks_to_swap_in=blocks_to_swap_in,
             blocks_to_swap_out=blocks_to_swap_out,
-            blocks_to_copy=blocks_to_copy)
+            blocks_to_copy=blocks_to_copy,
+            num_lookahead_slots=num_lookahead_slots)
         return output
diff --git a/vllm/executor/neuron_executor.py b/vllm/executor/neuron_executor.py
index f406287f3c1d8..8a3b9cde84311 100644
--- a/vllm/executor/neuron_executor.py
+++ b/vllm/executor/neuron_executor.py
@@ -84,6 +84,7 @@ async def execute_model_async(
         blocks_to_swap_in: Dict[int, int],
         blocks_to_swap_out: Dict[int, int],
         blocks_to_copy: Dict[int, List[int]],
+        num_lookahead_slots: int,
     ) -> List[SamplerOutput]:
         output = await make_async(self.driver_worker.execute_model)(
             seq_group_metadata_list=seq_group_metadata_list, )
diff --git a/vllm/executor/ray_gpu_executor.py b/vllm/executor/ray_gpu_executor.py
index b6bcda4e6b18c..3eb3726bd5a6d 100644
--- a/vllm/executor/ray_gpu_executor.py
+++ b/vllm/executor/ray_gpu_executor.py
@@ -196,6 +196,7 @@ def execute_model(self,
                 "blocks_to_swap_in": blocks_to_swap_in,
                 "blocks_to_swap_out": blocks_to_swap_out,
                 "blocks_to_copy": blocks_to_copy,
+                "num_lookahead_slots": num_lookahead_slots,
             },
             use_ray_compiled_dag=USE_RAY_COMPILED_DAG)
 

From b31a1fb63c98fa1c64666aaae15579439af60d95 Mon Sep 17 00:00:00 2001
From: Prashant Gupta <prashantgupta@us.ibm.com>
Date: Tue, 30 Apr 2024 10:41:59 -0700
Subject: [PATCH 02/65] [Doc] add visualization for multi-stage dockerfile
 (#4456)

Signed-off-by: Prashant Gupta <prashantgupta@us.ibm.com>
Co-authored-by: Roger Wang <ywang@roblox.com>
---
 Dockerfile                                    |   4 ++
 .../dev/dockerfile-stages-dependency.png      | Bin 0 -> 118207 bytes
 docs/source/dev/dockerfile/dockerfile.rst     |  50 ++++++++++++++++++
 docs/source/index.rst                         |   1 +
 4 files changed, 55 insertions(+)
 create mode 100644 docs/source/assets/dev/dockerfile-stages-dependency.png
 create mode 100644 docs/source/dev/dockerfile/dockerfile.rst

diff --git a/Dockerfile b/Dockerfile
index e471a6e93b963..e8a9842c089dd 100644
--- a/Dockerfile
+++ b/Dockerfile
@@ -1,6 +1,10 @@
 # The vLLM Dockerfile is used to construct vLLM image that can be directly used
 # to run the OpenAI compatible server.
 
+# Please reflect any changes made here in
+# docs/source/dev/dockerfile/dockerfile.rst and
+# docs/source/assets/dev/dockerfile-stages-dependency.png
+
 #################### BASE BUILD IMAGE ####################
 # prepare basic build environment
 FROM nvidia/cuda:12.1.0-devel-ubuntu22.04 AS dev
diff --git a/docs/source/assets/dev/dockerfile-stages-dependency.png b/docs/source/assets/dev/dockerfile-stages-dependency.png
new file mode 100644
index 0000000000000000000000000000000000000000..b016531f1e0a06bb38b01b1989df932f161e3aee
GIT binary patch
literal 118207
zcmbTe2RN2}{62h1C8<P3Dm09!h>|^?QXwg#VPrLwB%92JC`B0=S&4+K5-BScNf|A&
zw`65+-t&9Y^ZefTcO38Wf8V<fiTl2;@A!=KJU^fBb@Q;={-ul8ET$-Gsj`xy21PA#
zrl@&e7A?e2#t*6f!9NR)4(wN?X2_q&>{vgFT2Cn}{-bFh+EwdtH_RqXc-qcx*E0rR
z1*scb-ImW|e79nKG=0?kh4TUrS}O+Sedv)8-0^79$yTXH1;v&x-mxlr(6O(x*|&Gz
zysH<^6)mscx}5c}FO|A<!>M||lCh~bKJ?{fr#c3|hGm`9(zD5ljp!Z7IutIbwko#1
z{fL%OU@;vA_4gGi7JOXKdfwmvl7FW|zA*fc|0`^X9{iu5C|fLmDgO6L+pWU=-~T8o
z@$Ub}k9T`5oBezl+cy0i883F>l4qxjpILW&iE5_O1hllYI=&|yyFBzeAjf&)@qQn!
zz5B^{-Hs}E(9=HHz<HrEWrL6Wv^|&0XrpR`oa~OXC8~S&>?u#BY}e8+T=Kz_Ws~JQ
zmi?z{URp@@ylnpRaF0inF`1M?WBkWf%gkTPXGf6u`SHOA{l)szqVkiC%mbbJGg4cQ
zC2f^r&EtB68%vJ0`0X&vzkZurUaq_9#TF@6^0AeMQFAlS)7h$>wz+=%M@D~buZy_r
z_}7T^%c{L~*;Xz0_CJ37m`~PmST*Bft+oWOLUqRZ@9|qPU-B+d<AU2;Pdq!7P`GrR
z#P&cv+Y?DIn{EyE)V_8<LguRQ&|hwLgubVKK0BS+@%TaiP_nCR!qLR4CcBC54E+Kh
z?ufqT!n2=(L^|qoUH=iFv(;x}+p46er`MA`QaAbgSnkKB_g)#J&ZXh91En*QrJ29u
z6a2f$RV$Bdp4(B!bj!?8bJvOLv6^<djQ1IzGR*7c&W2j-Bf|-`Z@L+mlq9sBH!nOU
zHC2RJ=8wX~?4chucJ<EZE?jsznPgs{lQ})!Y1Qb?{y089KF^<AJ*D`sy*MVdN4Uxb
zp1gdH%XPBvqZOyL?Wfj2{qTh39I4*R6|WboGAQ_F4SqQ=_VHMbZ?$=@>{{`QPi0)c
zDD$^g*H7EtSi+%UAVnK-4BxnlAo-3m@Uh1H+dpjbdAd>#Hx<SH$jnR*j1`FIJp4mf
zrm_Spno)Pz^h{}Z7_;l>+kkf0sZRZ~?>t#DJCn#g$yHw8pWDKe{U>r=oaM%f4z)FT
z371s8F!kfvb7M)-HD<XW>E5gz*A^}nl-NM&ST$T>XxlG8uI@6FpszKpla1XBb)6o|
zoa`@tZus`b)fKdT8%PigEN+e3L+(LQY1iL-vA3svRVzMnNLI_ChkLq_+j45CBKAO_
z&;mYs1IMS@F9ahdTllRorDNgJFjbl{W?vo>+|EI+uruK%-`~BuE<ZC?>M%1=uP@+a
z-BTl(QSaiIGxphB(1}VDseNTroI5j-+g>8w`*c!#@+N=yhssprfmr#exCU!-sWK<k
zaq=%k**Xem{n=eNGd<EC6L?a^cggDQ>vPAyCtHp-da#EsaSPMZ*N^ymNM4qST}ah;
z-K9J_2iUXdrG~t#e#D9L)OO5yVrBL=I6!luPx4B+WfBc;GULA<E#{_P-_(fnub&>y
zko*;S&6mfjz2q~-6LjpPLrlXdGJ=$WIs102{qZH{z>bSGFUcM%`?74>cj}|RfaIS%
zdM34n3RY`GPbm@OS;8r`Q@OuTep)0p!sYVOB;6C2M9GCJCFg=s-o-eWDEDLo!@%Iz
zL;T9iWd7vUxXw&<RA-pKDWWP@oj-rRzu_A5nVG4PL%x$ecDco4?GX`{nYwcqviSEx
z9Rm?FP9(DMDCfFN_2<zuN5hvJG^s9?iQZgSE~h_l20Om;D_fSzj<jsiA5(b~M}}fh
zs1@Lx$Vk4WQXXWshdWxf1)qJaD-^rOV;NaM4ILdF>#dabhds=){zud#dGFl0bFLyu
zuP1kA`u1A!q+iErAI;_qnTx+&NAGVv;d?+hd#AZm@AswRHIaYlQam`sFY4xuz8B_G
zUbA*BalJER{jCdPX(6&Z_TP_AcSKAJBxm)j__jQKUh<J&eyr%~3cADBO_Q^C@kcmW
z`bIx|xM60(b*2|T#*Ooa69jYn#Ey|MP?T8>8Kj?@lz{THkvhk-Kc1e5i;EMGSVip(
zmUnfz!OR`5m2foX)hj7K{zE0Hg<NE2cvT)<aqzE|F|}Gs9@BTWIqm%Qptyh&)z!F=
znJZM-b*xy~_wxhs_OB7s)^Bc-PZ>-E{T(-(OTS-4i}}p3xgSrAhQN_nwS!0qSmDju
z#>U3p5>1|;Ld3QxTh=>|w0=8sXd6a^*M${+v+>d`-F%?!b7s%0PQlJ(r`9bnMjmB2
zb)f51g=^wd`H{@hRY4?IQ|Uf)5umJ1cC_JI@Q$-;zAf3k&gN=zV=Y*cvXD^c;h&Pf
zpFHyA`TF=^@rpw->X$Z>Pgd@lGZlpgcE#H=G`aL`zH2Suql*+nIQkeXHx&1AoP@6{
z=g7Mas<i@&TP@2skpIB<zFl3wBx?VsVdt)0XY$<Wg2l~GR%Mv$6T=7|>Z~{m`z%F(
z3{eY{I{oJAf{w4Thmh9riWK;FX{*v+_vT2$e3X5?gmaEJ(z_-8{uXycDC<u?Pcpj#
zNdR-3Dt%;bH>%PyrptR6<o;Zl=SP)Qg<)<1g|qWWTcCxMsB685+=X)EOg-~Cgc!JK
zMbc+CI{N-E2ZUoVGni+Z2JYEWk*v=qd@__v_VTH|ruQ9X4@us|tD#$U=UiMcdH(oN
zMEDES3O;2u$<~$b%v8v?B)!+6PW?s7zN@wzJq3GOz)v&T$A7bh*$>=wi_d`=jVsEO
z$(Fi-@DWX}(jbvXc|7Fme@gy_j9r}_u@Ol!a!sC_7OT!v(CPaz;+2u(L*Ard`WL&T
zoj}w{l;M)HHlbba#?rM%9VYXcgjWX5C-1Ba{=5BWw&}gvsr&_4WAOL)god3|+5+eC
z!8jS0W@p+vwYS7DlTT2QpS9C3obZhXFo=tf7nE2^nf+7f&G|Lcs-4ID@BL%u3_0xt
za<z)Db?1-72A(c&M0RgYEiJR5_L<3unIqO&dguQ%yfL4sdQgn@%eXw9JzSOWGl|YT
z)^VhVwnQsKX?G8K{O<+fPRbg3-J6)H^xCz(Yt6Ifrrf$>4$V~F@)CNr8GG6PT>5yL
ziMpCpN3?ZsoosukmD<tdliA;!+e#z!9cnA|>}rLIM9-FleiTc}8UL+kxs=S$EnRJH
z`>5gNGvmoKAs$?pH)9$fOI@c2oRCC?X8{m(@bH{fxydRT8#l9c9(=HCl|OB~Ze|bX
z224463OhA2Nj6?zfe;@UYFl+~>|3mymNgH#85L5N^NaQkcCGkHTwKv69~lknCA1%<
z-&6iSgqYHejUL=3*BMt>{(9)~_-tu-M_sn#jb)r6KF(dn!BRHo%lj6KIrO*aUhasI
zY`MGVOQKE&wt36j8%vApXQt}g5xTZWagn<!B>WAYl(fP3c&)KN9`d0U%>}I!^)oJQ
zpMw2}-QwpCJ(=y6@cWh^WACC>$8_((Yt4t&iPza1!%Nb~ci?u3J(8VIpQm4*{D*-v
z?Rzu3knU)D#cFBW3(uz<v%bU32JBo%><Bca0moAL87FK|SZi_M+2O9Ana$qPmidhQ
zEm2-VA49}UZIx*e%dPgxoXt`fVu9P3oqz2a>p0{ZvL)%|Mg+>w9|N{Lzwq;!%y6YK
zxxHYa^F(!h3GsR0<%bR5wi$m85#x||adP={jaxGu@Mb{c*s(w|ulKiB1&f|m5y}~D
za_V)gFJ8c|d)PpmEQ{NZgMY0e{j2L_p=-+(2Cff2u9M9*SOpHT^O~=0I`4<eI<}2A
z`;-z((Y5+47SP;_Af#(oYvXDY?m8Ces+_5-mZ@7JJ5uxfV8!%%VHfqER?}ncxy^l3
z<3kDS^o0Vgnr`x6oM5V@JEmu&MFWiE1#8Ve!BzuAj0>XS7i4d}CEJ3WMqbx{=CrH1
zr$5=mK34AIl6GJRNnuo))?8$!Qo>HUjJ48seWb>!emIDv9f1IqfFY#OyC<HXd7+q<
zF9Uc#R$`ZP<V1w^7v-^b*Xi~QS(p6ljAI}6%lQJg#_=}IUiUA`pi=1NraBIZk4<2!
zw|N5vfBGz5pOs*|>bI%>gf^(f>8apSr#8+JUFV_iuSIEmLE*t)AEv51%JwjS#sZFd
z7lvK_9Y=!aX=nH&@U_LUeOo>e)9})tjC$5n{%C*QMBPYEb57%1kFy9CuV)dVLiOJe
zgR6ON@~(W`e0lKKy=%*)o;*95X?YsK`$dA$s`E$)KVMoHpO$qR{j+Vc@Iej{qjyQS
z7La8s`#0w1*~Z7k{cJO?*B)2$MGD+y7kac2(Y4Mfdz;_ky0To?nKo}4=}OW0*XO&x
z!HLY|(h&!Bo(66`@ukK>+r8qXqnVctxJHIU@9?A}3GGyu@?REXAgy)Ge4_9|n6w=(
zZQ1&<&HJk$!0&mlp_7i?FFrd~BUjW|kPlI|O#f~oQ=;B$TQ8fDo?0fAXQorFo!S`}
z_5M+weH-_`Zws4#_MzM#&SRchcQ(Q~WLsguowX0Tb)^&Rdh0O6Z93_n9giiSto`42
z2qU*QlN%=X%+?9?d12@!+1ej#cU`bAw3<2nFj~oV?A1y8Mt8Hi*RnI=w0k?U&ZPvZ
zQms^G;L|S745zXFZ7STdmj^!}Amw~Fer?_H?EPi9Rj>Ut;g;3@k9cpd5fcOldipfN
zWxT^IT=7}r-|MaU?|MT+Lk>gVkJb0RbnML*f3V9e{;7QbJ)h`Tmpi%3hdREV^%|NS
z8+afyLU4q6U6$Rl>fNubJAAE5Bx7j?zy+*nS2KdFJ(pRtmi~8jT<@cJPTw6qAA!K)
z%*GpQ1v_oKt6H16XGW{J<gI^|S;KCgg(DRBwdZ!HS8E6bW{6l6zXW&G01i8$?>aTq
z`?8%Fdx9e;Z7~wv|BFiGS~u2S{P31pelopK7i$|V=Y07zT*g((gVVOMnyVgEJk;`|
zK)_I~U40n4zLP~lq1@27Bbj{#YX_3-vP~Jj^h)jLcD7E+>i_t09E({OHQFD@cQC}?
z+-2Ya!OI8U(Ja!$>Obq!b7&19G+0e`=-cF1`KcoWeu$a=IMw<wzyWc^^2dp@tqJ;5
z=UYcXn%m)FP14~bA*M-jE7$)>)CmN*6PX#(p9z2&Z^PDqsL8Y%_@3+{#vg8P#HSi`
zcZHgS;j!#@oVLw(#M_*E9rbB`&c}^N(cj{C=nU?2BKxw93)orviilCg5~<Fo>+Bn@
zWVZMoSgcCRYGs%IBAvXWH5P@Oz!341Sz1sF1_&<!KOL*8$hm)S)BeXmULEj%Z~5ub
z8t0K(Wna9N#ImuUr7riqIi+)&#e5J4NdP`qpX2P?;?92Z1B3i1jhBQ;*?dHV2{KK}
z<YU*fR==1t*0yap=TAP9PKz@bi7yXmT}{(ih8lldLPB+u9RwIP-z8i!FZ*ls?`zdv
zo)mWJ^UA%v6wX=_t?Wk#_D3uE@p2#0GoOM6f0b_n*E|rX`4N#X^pTI8)JVOnE7Te-
zNn3<+9o^>K>CW7`zIqy_FtGdnZ<;S-<FqdOhol*o$!6|$!d7=c2f;pXQTDaV?$mRr
zpQsD;aT$Ch2^yKvnMIU?0&mU-56p8UedNYXNJzg>l_o$Go?qpUtlqCW7M-4Mdn_yZ
z%vT_c8i)2ULdOTVXZpAu2Fv$<Ei1eG`}glI5M>TY%TpxzggW+Q1{Cqp40rNHJYcVo
z6b-jfdENws<hwKN0672jKs@=E1eL~8Cr{pLvWf2~dl>CQb6?zY|BdOipZq71nVlrp
zq{Tl<;rYMt`zhcrpE6+YkxQ&pm2be7V|>c5?0Qe)e|VYvmEg+G3;~@eKSLV73Y>oQ
zLd{E&gk#BS{cRI8c%F82F6t>f`2PHCDUZ%HoiMetHF577@w)xwN|7%#{v!UTQzP}c
z{iC3{!rK`sv-djn!gb?vw2>v=`xpJ*SuGOPZx4J!7))Z2i1EJGKArm6Yl;1Lt&Hi^
z9-SFacB;;M_M>Ju|Gh+ZyKJxWYuj#Lp4{o7WF|Fw1)V?0bG*t@wp~wXXz)L5m3CsH
z>-nL5K3riu4lqVh*Xc<kdb+ZBAd@XX><4{AFXc0Kz$a&!VdS}g=ex^ga`a=fvLD~7
zp!se;dcqxh;$-By)4qZOkcn6C`^!DPmLMd5Z0>a#c}-(wTM+4{hO7N}5WvRtXF6S1
zh0@mYKML>|p~+AZu*r{S$C*zb{0_;T0M*leMDQ;ZByL|G{xE9|B1g?KK>M@*j>@hz
z;Mh?v*XFhLo3Y4=-^FvsK7VX37JB`3mbRpH|K9{hX=6AK$4-;OY}>T`VVgv}eCxq1
zEX778zJo{<0Jo3T6*^0pp|ssaDeGT{XNk2v|9|W+3uIvnVb;E<Kz7IS*>et);rbDe
zH+SN0YL}mJaHnyWswIDUacw?eT0dkDX`&KhCAcpCdYCmc)k|)@V4_vnHF!s<%noJR
zTb`f!JiGIeiQDH^wNke0Ma9XnCidJQ9+EnU>;<TC^$6YJXaL9-B5etF3OjckSWJWN
zX##Ugly|YYcV@cRYO=48UwL}8P`>R|XVMl47HaQ!*ID@o3z*n=l!=&0(Ci!~?53&K
zdp*du!Z(D-7SQBeXs{3@Nl>!<9T&^S$>hdnJ5p=c-u1i`;8G2_frvk6Kiu80w~!cx
zTenDg^elNZ$e_(kfBZ|JFV6#U^AaH3yJG|G>}wahu?<ZQl%56db7+=cZvLu+Z*K8;
z=JKXNhShhiW3A*azEEv>xJUCCo&m&Q@@1^|h4ey@d-=B<eX8WEd+E&rvNTL;G@vE@
zIDWF=*aO-$sIGmGt%8lcCHdgEYQ9;$&b`G5?tz5)K^5x~Bz(Jm@^gI)v@#{%br7IP
ztbAqLtvx4EZTQy&Y!B4Fwks`l8GcSvqWYnRwp?Wr`T#Y1vD(7Cq!cKUP5}9kNXLgd
z1ECh^G?Jw0w(IlX^>{s1TR)|DUsykU!=4+b8uG3Xt6aErjsF#ThBLWS!&{{4*dD8e
zg^<(;O<dJC6rg3S&&M@XNbl83sDy3L5cEkr&77H@@Z%xS2iYJ5`HL--mhfypz8l`l
zKRfxV@QbytT1kpJ>RJK9jC3=a{Biv?2Ii8;f3AH2&P>Df-#_EB`KC_Gayv>SHjxFv
zUiXW;4v3Et3k9Wk{oayFG0htPogg_i_;mmh5U(<pC<Ji<5))o2I{%vNMY}qOvqG<a
zMR8VD19p(5nwl6XrDIrkcnoTm^33$q4gz^#Ru1(14zN(G0*<q%Ag3w&{&Q_%E40&d
z-crAxoXKUQwl$CB&N!14g4`fHD~f%IR_54oMiufRUXdqMHRmF4dbL%lY#{E~<7;(H
zODhKIp(t<59pw!WljBvH5Z$Vmg*lE7c1X7RA3^p@=20dH&fK|EClhJ<!2mB+HjgOn
zPQQ;te8dL&@sN~9&n$Bn`?_j&bwlpV#ov@E&<<S8kN3d+m6lWBNN39;|DmmpyRg#+
zOs={u8%|FH2&%!NG5s~?V#Dt1T+HN!oH>)^$)vC&1ezX^{2`js2egx%3qV*%5Y?60
z$^AU|R7i@(g{ZP&2pO%v9{C7$jYHiiM%E1^w69ugRO#dl-zdP_PAI*nTQdoOMQ|%6
zKw;_HT|4b`sgO4aLzVzhXGZ@t>a3JYdi6VgvQ0c!eUz-BLB?PCrt2;OA-iSm^Af2o
zrE(J&rd#!=dAqAK>K|GGp$gDSF3%&Twu*^~Df<HX(?A#D{qQQspIHu8Rs{<U7|Z|`
z5PN3Aq2^0OOb^CBhicDLly`;R3iPyXW-50ka2co6%1~Mow*8Bf4Ls7UJDT-+FCTM3
zt)`8H(}l~{Rc>vULGBtr7-;i7Af${K>U*j#%Ykf}RoOobG<<myK=e!|8&w@k)YW10
znfuta?SZ<M$^^XdDBIZB$W8b8v<IIJF9H;0Qe!0ZetlF6pn;up7B!eS-ruHkAN$zb
ze{(hFTv{kMoa!`ItgnVrlI_n~kXdD#{27-{c|Sm^Dnwl;f_-e5tu}l(tyq-m>S!wo
zdqB3pkMH0|h%3H#P+jSpAdW?%peT6ZRaSCG1+S3VEDgp+0>^1c5U;x?jSIa$Z_B9h
z1(tUT)N2-&MSdL1of(;F1G=m{|L`<&6o;IX{l43)@6pjOj3(j>>UL6gJvBxgbXBjw
z=+2;abpo0h+t%ZE!7k4LWt=lH8BH(--3$DxCcQ0Us_U#(S85^KT{f}~iyzHtM;Bff
z6cl_4-oXzx1dv*M2&(W_o%E+alb=J4Zbt?wmY;5xfBfpztDmL0+uA-q2<&xgmmX_m
zb`2uD{q7RV@!6Ye3?!%%2>uKqik*{z+~Zc{+&!|g-I>_WfbInN7LpR!Q>X%gddX1s
z*mW=cbMaNj<H7D~yFk6Oj-P`>HsQABBxR!Bw&}qAtpX?!UW`2yAt#C0OUf4{I!bn>
zyuIa3gOhvz0>6_PE>q)^(<8a01UEx!8xH+N{Ph<3dt=qYp5b?AGN<1#aEB7-m+X7D
z;z6s^>o0`bnf`d1HC$y%mXDYNa_mY@rpTYQHWhjfW37RHJbJIJi?Cf}AG?SgY*PnC
zHX1bdCpI0bal|@+8sfrMibOtgtNjbuy5iQXStGbYhVM{#==k{fQ=N>YB3U>R<Pi=?
zROd}icUZQ380@Wgja|%4NWfXSfndVrIx16}mqjqRwY@;{BUcajQn;4rw9z>^Igy%V
zbyM~SSLXbhL)1wb$PlUzc5NfOn4DbN>?0oz#+O<o3lLPyvVWbrfJ4?uc5m%(@z;SW
zF)~<6Y5z`Sa9YH`_Vt#zP_|*Hik_XG92@I4cYV<7Ix|7rWmsA}<S{|1HRQHkB6GDZ
zx7x#d_Pk|VAwyCZi8-0=ZuHBP%*@PGVl?oV$VUkIjhx9g-7hN6G`+i79gyrgGaW2p
zL3mz@-UPLoRHOSNX2v2UAsvy0L|GcIyzsoa+-cHDGC4xOrV7`Q$EroBvp%T#ih}VH
zZaEpa%|YC*;5LUJapAHd!qlLaIIFf)tpD2!Fflw<Stxg(z$J)U6U+3N-HeDVc}&-0
zZ@CD$n}JW%oSm8}q3}WUN9@(Nii%iNWJx^&HeYXh2&mig!v>FNO+hUcF2Gfj_=JQ9
zykDVezCKtOpb}GaxigVybkB9!$yF%>yT2l5Vr748@eZ5c-&Dh8GBx^-zp_5(=SaYU
z`lEd@nu7S-3eEZeWMkVC1;8odq$--_Fpxezf!YDT{?zYdR(<c;#dXT(yQQ0g`E{TI
zF+29izxuG-Q6mhM2&T{(j|n(@3YV2`wHz}WNkf^Lf!pcQ>8txeBw@u67rtLD@<Y&{
z(du{vA6&tw`hiGwkmb`&xG3#p6gnixyrK2>L-PCPNHemX594dT@$aS0El-FLKCIHm
z`925?PxW4>|Fvr!e;E>&cHgY8BkG3}YD`a`GRqE@5en@9!BfXTkkWpDL=p>QKp;5i
z8x2KOvibH7O8|Tt<A2jZ?h^Su*kms8rnw=al9(32$b~A{bVAW;*EnDn5)zIG251Nd
zSOQWI>2BK!d+9q|vQo-l0DYX=c0?43Rp|Pm!m}!LvD=TfL$2dTmSHg1U4G>s$@&o_
zRdI-(@|?ZV1@}2MJcViJ&YjUoNl6<0qv$sXgzzk$8T<J0881dIBAI}0RqnoNSAtqc
zKO&)oc&+sjLRzfXQQDb5pPwbQ;ZI?9b-@UHZ9mUOe17P`r9De8Ow9NMXY0&)YLMl$
zh<-xs?fdsu*iv!SUaydvui1_xx4{-Q`d7<dwjLkuuHJo<F4Cql?c5O(N+6Lu{I!{^
zQQ4X~r6X@R1SAHVjIIPg)BC9U<#zW0krZ$6Sm%oxG5)3+7oKVPa1Xft;+t>e(~xmR
zR#g9`+uF|wyNHVokN1}eNBM}v#DhH@m<>T@-+ugf3|mc4-$hs@6E96ChmrrSLCoaN
z=8;&@H^%)fMZvO;wjpamYGG9CP{qZ*y?F70W7mb_&{wp#OHy4wPeQgu*w6U2nhd0@
zYffUSG~TBBayu*{(O<>|qB@w5*ufrxA>%YLFyib$#c79)s0LxNUAxGbpjQWPH{2};
z-9*rhTr8#fiJ<t<&?)jg3dCgDz+p3CtXOD*EHEv|!6cnlRaL2!H>bQ^X8Q0Qm6!gq
zH)q1K>RhbW52Eb=Is_BsPb7_5bY>8HaIWy?R;z(FnA2Aa*Yf(<bp8;_Y`-JGH5ZZg
z-B}YrxQ06`JtZB5A97$9AFBrY@{kS)TG<t_MnHnrchDs>M;uKCNMYjK*T*qAIdVjs
zgh>yeQ({+UtWL&7Nz~v3d3_Mr<w#Qs*?^GhP)RGxLQcCDZ|NQ-k7b;M;uCN`SKi+7
z*U-w^V9ggq-WU_5I>m60pCjoYnA=fAACy;=@p0|VB2<~iu8TE6@SV21dzJ}Ek?t$1
zYwH|4)6RgHat2#~Jnk=W<e7b66W<7tsR;B$3&ar}od=|V04;c%KW$r4e0u*cHE|Os
zmDrL{No1%)&d$yt<)Xli7wWP|oHDHt@>pl7gv=4sVnIqw2rLWyX!;|iFff<oBexd^
z3YQ}Mk<yP!k#WbD80;m=q-4owtsE5{A#`<|C;J)RgszK^0HI#q1X1U4&6VBU*iy01
zGDtdrJPH0Xq=CgUb28l#_2CF0*q5Sj9%%}mu1qn6ba?(v3$6qOCA{cQ<6DFq%b%wM
zTft)wfdzYc6Z)CQ^#s8&_8eI)D&-oIzJ|WOesF_2YCZsG*g<qC#b})K^4;HF5~hQ7
z&g&$2oK*UfpE{!dYM?|~7{v*_Vf5zAqqTmdcm&U3#XPlYlozmYCxRP9FmEE670D7%
z^5Ne93V*AqNXB}UT4+s71kbl-LJxZkj11bidA6|OxO#3w)AeD)*ajF@{Zx+~u}(in
zv~Q#bHCTL8-yZ`vBpYck@x7v=J-zy3F_8>uL=-BI7Y%>qh3}o!E!-X_qQwSDd~R#|
zD~)|lmq+cz{vbQU*8o9oBQiTr44YwZsL!UT@SQ*aL?%3Dy^^fl@Ya8$k=u{cJvEsK
z8-Vdd*hT}!gBq(mItEQX``et-yEg6jR>iM~8&tj^WigQ6Rh%^9Ej!mGm&bJhMF9uM
z7nmOUFhrBHhqe72FAsMGKk}A}*XVzenD|+4qU$SSE4e#mP&yaPQanshZh~{oK{tdz
zC`Gy5;RwX-P>_>U|B%oAn@s7bsvMvsenOA6XJy8!xw?g0?KdKHBKZ?Q0mmjy96&uG
zHAjvd@zaRY#G-KQ;KOE~Ba8Anx%%SkZNw0WtgRZfeM57>ow;T>GX<^8xa3PTkBB_@
ze;m{($NS(FGLVqspL(E~<bY5nPsHdu{>7@qauv3S&*?U9yPm*E5O<KlAqrNsgo{6>
zEj0<;z%|GQ$!sc1sFZf?*S2PWt^@tW!e_C<%D!k6l0v6i#%vNjC^~BpkpVi;=}9u0
zg;VW)h1{jU7L_eDg#rb&C;l=-cqghcF*CQs6DT?X!i3k@R+~AXMQY$0_vCt_{NJLv
znL?+S`rm7V-BcP~Ekr0FJz;z!q<1PVG4WuWrV4P5NZ~p^&`pvisPqc#B>F%M{3^&U
zoBXGhRQe_AW{N_G#6U=RMhh9TJc@0EY!PaX>_B!0+4v&aot+*~8rV3AwYcq4m>Y@v
z|KFfS?VYRhRqiF)P~4H&Gf12Wt*GHYsQH?#pG&G(=*+=i1bJN`{O`n6O%CibtMVW2
zcJ3m2TZ;66t1Foo2tU@kzs9OGhK5^$kbyt8_E-0iy2UcN(FUYHL`W<aX)z&@wd^SA
zO(AP8^t$U=C`4k=4&(BB<YFdgv|1m_S<ENj8zlT61T^ZkU!cB=B%rG@K(y~E=vu|7
zl>|Yb-O9jaZ!nALO!)o^YxWG_$e%zwx6INbhk-V+Owz4KlI<LWfNhXs$?19YNa>D%
zz&lLJB0q&miz^c;r)9h|`H&BnY=35{bJA?%-A?woTspn`|0uAz$*M!l1Z2vAJljsX
z4@mX{aT0-gB|J-bJ{!;Vpr-8C`TqjfsD|WhXlJyVSczo&N>1W#Zf$eCyXyor1D)9n
zfySfQ)KqXahIN+|fGDsDgcfQZJC;H0m%I#^#ZsNiX$l5q8}R>z(^QHc{XZ}oW$!oZ
zm?=8{JFLYS782b9yjT~5LH_<9_CdSzh1!dH<HuiKGCYZ@SeR5Hf*$GY;H=BVJE5sN
z$BGViRtVEGEUSsLBG?Kc{rj@sUV^W@*JU2zsPPfc<W9DduWND1Dw?tno9N>kX+GFf
z*oj}fxG_QZ`825<q7M5UZONo-7m~4%6sy7m(hV0&qz(wM!*!0mQO#*Vb2`EtfFM}-
zbn$)I4K2OK0WL!%guwTO_(`OV4gwXHJ~R~S<XD-9oIe3&zm$-#j}mbi;zAKZab4ex
z7kx?$(jei5fDRvXCVHy*rfuVq(<^FL85H^Q2Fo~D5_E<%Z5~@P^p<;iyZ%H~h3)Tk
zq%{=DE%ogLJL(K5dG>1F7$}vOM{C7*wXo(#+*2mV+?j7Dpq|&0eqQN-H&gb06Ma&&
zRd`V2Q<2GSZb@67k}3NmrZ2fuHW3F0{FxlD$Spwt618gk6ehHt?$6_pkPyOr_j5U%
zqBWIXU!kT1b>o)zi?8G}=!8rHbU~USncvS5MUA_tw4-#Ft}!S9&4fTaDoqyEzY$CH
zGx8~uY?wP$o%<OLpMAJ=7U>$D4e?yggXKgpjC0Bvhr2-&F~|@>#qMK!0_ke>hDtRH
zHqFISwN%HCKRcyNYR9CKgYg!DH9rTM?y)9u((Q*KX%>VwG=RoGVe~h<nLnVbbf2DV
zmv4upX+>H+by4x|KnEdO3IfRzo(;1(N-N8}Gy5`2H@B&9K?gOpEM0p<=5KsnBlb~z
z1wEB#JNnt&<%mNT+5<@X$V5rK9|A}FpSR4y1om{D*ma@uF3~7UNTW+VrbgH&!kw+o
zKznH^O8wB1M_THqyX&7}`C8C8T0|lk8k5j{`RbRaK-!=PW;TiLm9wPukBXs%S3POV
z1t>j)h77yWNpw3UI58D@S7E2^&btN*>DfxX1x&;DzM*JCk+unMTdmXlzg>Z51Si{F
zvf?}^`hYPAu>p>jw$U42hp6~DF{3VoU;_-b{<jEpC%gZt1_EIf-<V&y^y$fLDT0tA
zW+vM|1@+CO+O=DVwa#$o>DOwkJhA(_ry|du6VHWj7KnzNDY$qe)!U|E^?`-PW?MxT
z85c<1DB{Ww4DywIT@W6nsqL>4b1_%`vb9TCwXJzic5LgAlZ*^kyH!tnXz7IQ>vrAR
z(uskbYvSg0o?Ozvmfl>lx3GK0(Cva`97Li2@*2J~QdU-eY17FzkDi`o_3B0P^77ZO
zUtei|``$ekG=HwxQ1l3s+b$;dUE`f1dBuE1(S7^rk1H<#&%qwpeSLimOihcgGjAlX
zZH!DzSmL#R(C)rbP*AW`4vK%uix+!=JrAQLXEg`MIZ2g`QNF?(HqiC;_1Ta0^Y+wb
zdyS6TH>Orr9`CAs%?`Tg9uvchN|+C}_6>ge%-%+H$BrE@WPd?fN;&tHZ~pxGp4ZCD
zuLK0FlCt@I9Ycyjek%LAh3&nPCwYC$3JO{+Auat#H`8(yI7q~@MM-&P$Da2(8XBxX
zPabI5-5?~SHW+mCCSz!5XhT=m5(Wl_pa%~QtmI`;?-&{%GS$s-;^0>c%`5WbXJKL4
zaJlBgheZI~@1c$!Mxse9War`%if^!(8n=m#iCJZ@iitNiG*Hsg(v?4dJ~XRNFRQ4i
zm|M5ghjew<p(kVk=JE2?D;3UVxF(Bw92+O+9i$5b=sET?)7igLF`r2NT$f9>w);T&
z<SwTk+_>>$H;~Vtk&(iiEBG%#Wzn_Li$9j^4cJSl{FB2Eucp7%N%|h`^YWzxMz;_V
zXLpHVf>y!}RIr+!m&#oW4h~Mw-$2`<RiLfhe+CCvRs{%ZK@?um@cXy+){m<-`ceU-
z&{E8F@7}$9xnR|W{QUf?p%N@01nxjA*I8Iu0=D5o60XxSyUgou1LAuM_8g7VSdcr}
z&xFttojNi0+OGFfdkN=;qGq(<d3t*b{aFpTk!Dgp4}|^dwQKZHW<IDG(`<?MMf3Nm
zQ>XIYzNPnK+m??{U!MXRy<}nW1|uuWv^x9i*Ds~j*pi>V;o-~7&CS6^IgoVRQ6%6>
zI1;-|Mn>ij>iVWTc)EF=LuDLTp%%YEr@Ol|h+P4qzZWEqD40RjqH8qMxV2eIY7NA~
ze5{S)7K`ets{GHNm$I?3vF_Xd{vQ*Q%@98+Zf<VFi12VO&{lU?R-fH<&nK(xr^dxx
zriPB$U%tGNmv?VvZ$pDyz3U9;j~_p{ZKiMEULxZ(y8OzOD~pyaQB+W%z<7Aa_dMAM
z__Fidm-)7~w!3{jUc3<VQ@KeS2>J3v6I12_W53o}k<2u)Wa&~6$Wu<}e1^{jY?s=&
zah@Chp>UGoMmv*p@7}w&p^JN`$#TG;Cy7_tLk^8Asi@>blWOSf+|rc*v`pLv(eZW>
z7j1cYRZpGbNx$%O9+vLm)d*ysoSA9ImoHx`t>aXFUtG+DwiA{_xdAL8A?qp6p80MU
zg~b&kN~M~VZ=A$0@~3L@@+dAYt{|v5_ewh8E3U4t<=N|n0VQ^0Da&%49U&#t4G#}*
z7Znx!67=8pD)N?t{cSh>vGt);hAQ9Nmln+pZEcSe8Q&=?MqiDLTn~iFfY#v^!&-^j
ztEBCE6j6#4lg&EgCx=GL8{DpwEE_g#$d}u{fBz&h#gn6nA&+nH&c>RT4g-8R7I3w!
z?64tRW@@`#kFLM4{>l6gN)Nt#{VL)*?ey~XYbL)NkX-IS0XKmD$hv>OqM|$FqD6T?
zaeaRp_ZZd#F{PX?W(!y$LSl)V8&xhRD=T~a#0kb#GF@F=WlvA+xc`u!+){8)PkpWr
zZYLornHt4SJUGDN=FOXEXWcs7TQA&YT9+l6ZdQE*JoLb74i*-Hx7F2aSMaMT9zVXC
zQrN$r0UG%mG&Vkrjg5^n6nik(*xT!aa?r*dJ61rRJDy@dht+x<*GMxErDBaEM^X*n
zE}TUBO1_+_sj0oaecrowikRS9R#w(O5H4>DN_(#snTJL#H$c?u_gGl&32JC)Y(HDF
z4%G}br3Ta5S7K`6GI1Imquef|bR@Wjg@tV{dbHbpameB@+Kp7)P^vaUZ{hYGJKo~J
z*!FW@ct|MZWFu3&_ru-Y{Y_C3<4A8k!(B1;(L!$5cM$p?CMOG9SYsQXBJAK?7}v>@
zC-ZaQZTZE;2U^{S{G)%ON&p_b0)pUtJ3G7kp|pJyj4271^+H73Bq_;39Zo(ei$r~I
zhGFx2BV*&nuCCjtvEwk|k|imYttzOQ9BldkM|nch7X<kYa&lahfuZ3Q9CXY3@<k)A
z`RmuOVwsnhlA*E$C}0(y=wt}h<+=)Y;Wuy59o2{8E^Rw77uy9443Chv?T5QKsJzO`
z)i}7sudA!e<9*xH^VWHwmmty8i@$vNf`cDbX>Snw)j^VNm}ADGnKO002M4XsY+$9`
z$@0ycH$hT1f+urbmQqiU*JWg74Gj%9Se%N!*x1;J`!b*kHJ^H-{%B!Vc6ND^p7hO4
zn>I%I3?XJbguOzYTR`P~`m_X+(;meCn57s3UMf79y2~I>kqiq#VBh7-Qd9$`P#CY3
zz~k*r-l8N;4Dn?4<qe`}tTi}yju8PXhFx#t#;B&^(o$wr*LNe|P<aIf3(3`?(SDb_
z-R37pKJ59V%k$Q)d;lPi_R<J?O7-ADWCYj9qlwzA`}h53<>vN5=xT%wQ_uM5!64S`
z+O-S6QB?fhkF*HeZ$ur6a)Y<5U%#G$^d2aIkU{*TDF5wSl0^_9^9u_Z5EwN)Xt5yL
zqBKH&Kl0M`_bbdUT(~qocrGY7Sa}_MB(4~(4-G9X*HN#dqb``4^`R6Wr4k_cd(Bm4
z?%7YD1XLiHg-gz95rr6)kJzx@uLzqacN7Wz@<`7e#4Gv_+E)rHD%iop{m6(kGJqW<
zq@*@(-@cqOFf&^UbaT*uBOe)Rl;}R8zyWi2a(zuce2GMArx`^g&M5uq>-z_+hnE<|
zw`~#<t3h%0D=96aBBAt>U!pPMyTy3x<blOBvy0QhyNRdnG%2HQ+_*vPBKO_41wbt#
z=f7><(D)YHj6(*7`3x-T0*G{CqobpdXw;zG0J)ikgoKn19r||_J4Q2hlQ9Z1!T&`r
zH7MZo*hpXz1KQ_L!}_7)JwkG#+|a>j0GxYXT7HZ3ax$2E`?6f7*5Ev%JA|Qc5AUu)
zbMa!ROY<mAL|EKTSUb&RgM}s>_Z&j&uWxaFhi{Rtx8vEDU%XSN&1d$@DszOcci;(h
zrw-1~vG3dX>C-13?^~XpJG(&vh=BvvGSEDnPc3vbHCC@AdKLz56ucV|!42+m6Ats?
zl&HO<<0DwShR0q-MQzLODlt&N1*jR60mB6bvcvzPmwC}_^JB`lZ<?!DuU6t*NCu%E
zZrfXT2hnLBh1_V-@r5_-!cUE+CvX9sWn40gP)j>?=1e})lZ%UsNp*U6VW_2K=VO|O
zvZ&|y`uX*t<zl_8EGLDuQvj42CHff1?D%WjZPa>1T^!ikuc_%jH*fV50r&1*K}q8w
zLd2ip;e{2MJj7U<cw}VO!9Z43K@>SfHi4tLsp%>r_q-3<FO5*NQBqbOdtxQpF^e%g
z`^AuaWE_W=P-Tfa;#-fWC^TVuN{h*YLcXA*Ja6871Lnv>b7$bycnqW&O;{3+!&?_F
zS?vZmWdNJP+wMv2B*w6jKr!s<0?G-;CQ)`uOQXYP^BjG8{0eX-9fe?X9c0`2+nBXz
zX4it*Eo^PXPMOr_$cl=J8km@PBEgcN`8x%J*{H6rZi0jr_2>~D1(2Z#9g3H1FJ2!*
za;L|cU^V%`C3`1)xaF6fJ$v?M6m2B)maUVZu=11w>Q^i)S6&y?idR~<iiJhFb$V(d
z<@xhH+S+RX|2&`+<QEh?-W;o4qB85rivF=ct(!J&f?{OfXT;~UoF%{l-O2AjOx%ZY
zj(HKfKMZM3uc*m}oV`R?Kj$3^V`T})gaHL3(I#-o($aT!O$6W0rTK^17R~!+FNRnZ
zwBzhDssS~RAe7whmA?P?-o?0g`D;6KN&%`Z20kBo{^ldHzj<%o(18oiL*45+18}^b
z1Z}1p1&gOAdgZUXyFWy9((u?#Ms~UqT?pF^^XF3v=vcOB`^0f`6S>BEUfu-<Lv|7f
zNs+*aZWPU-oyA&e<BOIpy8$qX#2@?HU~Y1|31B*wq)SJU>^}NCA(>_ix4huck;u}N
z8{E4QRp7Gk-vcBBXNN=kdJb&kAu^%;(C<~&*4CF=KdwqMt3FEh^hpp-p2%;v?K(~=
zKzX}_qp@0tQgt8yX2HQqX<a?NRO=269v+@2mnG(0+zK_46|8b3?7nB$S6YW3ui{5f
z!nH?_Hc<^&FA0~)m+xlZJo{C-z2VI@a&O82Z7-^$>o;z^htWJF(Tqacs>yZLZ?$^j
zKghz)?t#e==fkNV1%ym3uDdkr^1LN}=+GfU()V>&TV#Ff-(G+s&6C1zNPwAWC|Fdj
zlYZgecGmwn@zzNf8w4uWeb|Th_pqW6g*T>o^-smC?64#FAT4btm4`06$mHbYez&=m
zcFK01V4E1}WuOH>eV4HqG9Pt|mS4X-K~?8bY7ugG8X649E|QnvE2KqZq)nxR2Lq&5
z(tc!k_3G7z-d<+P<i}GE2v{2}aB<CbOS$FNfxXSAy6Upm78Ml<awcnh@c=HOfxQUl
z;b0Z|*;P~TYj1A81_VV<K^LHd`Z;SfN_>8XU6rX61#kgEMcV)}y7EN*TypS!ABgjI
zv#On7<w~5hz+*XhA!~Z!owfXiJQzf%7LEWPw-*C00x#0)u#fSR1CS!vmQp~&lfjuY
zkH09)t;J-RgvIVkE*blj>-Uyp4FavygF;7UgLafJ4*;38Z5MS{Yk?iw7*f*G-l3v4
zhf7qhhq`iud)%G<<cLUpvS8RC%kt$I`ZDU$rAvp;EvKzt)FciGH^KchJpQ2&cSy3Q
zPHb6X-BqbhqTmw=Q#7Nk5j#Jh`ZF-FXZP-_f9)%-q^xW?7M`$#!w+{apcH_828V`d
zc^L5Q^{-zhy16c#R9Vzsrf<pm`m>koL?%LZpgYCujAS6_{Pc@5H{$oIbZP5fk-P0@
zr=B@BHf9L6{Ojj)e!|RbB<43j4FpA9NDX%VT!COqb9%uZl=GP=^yJZ_muZ8=&e@Oe
zH9mXR1E*)!0}P<QRRJ<2LCJA%$&{mvdf?k1K761kpd;mVz@n?{Rtah<z`5usfWk$L
zjOydZu`yioE}=GVsy7hcpRC?UGmU$dV5=55G!Pvf-2kas!($JH(%PmA)jxkKuVw_M
zT?svxE#0ba|E%+<Y(!}a{Ud7usXVf?v%fWxyss`|hr*rVgZgAvmV2U>b;6cxt($La
zD05q8uEQG{C>&Kfw)vb2Z91Bo=+|INkC1WLXvpK=jWnKbTYP^T@9pdGzFVOqUZEo_
zvwP{Q3QzsLoZR`z{DzSPpawx4-Zs?fR^}EF5rJkQOXcO~?|xr4J2*>IoUcpaWMkvQ
zg||un<ffh`CjNI_SSW2yvra+q_-WVN_}Ey60|yS=e6#_=P$WdZ{B%JWSHNIjPp!1!
zYd}H4>i`aT`HR!j(?7grKy0RbIrZ-Sd&X)M_5w+m5EKy9d_p0$P)!qy{$3yJaxQSn
zt`8Y1MH|6Q;63P5rOU){;+H~!fWdn~L7OiedwN{m9l?{l`j5qF@KQ~RR=#S(u6(CO
zIc9pn7gdT+^6U$cSDA$s0k#m+ufKjRZDeE=z093_Y$wk|Zxd?o_a$fpi9ebscp(8{
zNh8i3Q6WG5;lqcE4ev?8ewM&_KMtfq(|@$ri%r|x^V|FP?-u}vWe#_Cb~dB5#Hy&s
zBby^ESBJW!#pGz840rXj(@Y?uY3tO>{qMK`qv%jPJ+oIs!}s02<$q=vXZG#ecelu2
zdz0If<4E2lCl5G1ML(W|&2Rtvn>Gs8KJ$|}kvR*mHhTe=K`y(2rUyfusb0O^fP(ye
zlMq(UHDuM|)Q3kBbPxM@ditL>Frd%|fOHc#h`A6`(yjbaUal@HwUKuzhv@u6;W9{H
zAP*)#pCPE;dnMyKy$(kg-UFS|QJ`^(5M8~Ta<JyQudErNM7{&eVS#L>JBb~c4=rq`
zdEGjI>p<(>FLi~$LME>eMFlpC-2YB~W^&UB8IUOni>CSW=FNkhU%7sLeo#<QBszu&
z07V9Q`^jqKmsg&ii$*6VnqkTI{Vj~>MZJcb{_5;}q!FWnq7)qxTq8IJYT4}7B69nT
zj5ear;Tqb{n_xc(0{dcA0u0WdzlEY?-p`*LZEbDmx$PiDj1E=kyW1j{HuUr)4O=Ys
z4MYECaNKtb0rQ{9^fUFaj`)VDP+|n0N?qt9yIXYXZgDG8hdY36R#sL+clVt`@>80>
z|IprY@Y#$|$$}xgdDYJWOU*GCt+;4rjn61&r2-G3scaz<6fY#G$om5&Vbbha%>@)$
zJfKgl@#6@!AkrWYc{Mfbi0rmzr%B}fm*040WOS8Ap^j(=$Lm`DY52!*0@Bx8T!HuY
zzA4}}H>hD7#l(glmG}h*-vp>@j&PlR0}_rj8h$nxYU#Uo?;5atl^N#TTeoc^;z?y?
zrPH_3co>!2g9mF7G-JM73~UT61{_<4Q#lJbI5?sTL4WU{zXO13Eus3TnlVAExw&!5
zHD=2B;ZgvAcQ9l+3cA>8d6%p-x>f4u9JqMy-@ku@FnB0X%v>@bo*zP_2U=(0;JArH
zK211CwI9K=51qUZfrH)~EwMqacnj6RDxkACCt|^Z1%?yD-K)jUuY};mEGs+Ybe)-*
znVfd~ZYcyX_83bMCGU;yIxZl$fbFzshA)B#&`Yuu;FYlJ>U6U`s;W!Qe|vHTTv~b7
z0pJ4x3aaB}Zde?G$SF7Q1=N+7`5t>pq)!5VsZnrLScErDdn>Q;i-PFSR1MT*JZvhk
z9Q>Jl6n3mtJcjGK0~y^MhE%o~Dk{;!@$KF4aBqyO5pa_%IN>RS9Y>r7K8r&Gy7J>_
zi6nRc&Qsh%9y@nB>*|MA%SjYV{tOK*xU0q33Mp_O+7S(mjIO4nY+om7IUkF$2S$zM
z_X-r&H-g5`x?fE(<|hP<)Kf*Q4-+3fx<-UhurQM^(Tun6+_~rPf4$I$yBRtdVYFw?
z@J&yR=-1>puZ2Yr9nk5v&kG#Of}p(5vlsOq4=*pK-#-$)p!Vp#yhKM9&&f$9-Mro#
zDg}r?J&G{gvctZ-2kwT3PL{XN%FSfqqiWXdy5I?*ln15Nak%S>uF!tK!I(MDhn%xM
z##auAfJZYZB|ZIzi9!tC!v12_vG0Lbjh4LMz_<67GPmY4J>-70@8^M6P!t~@AJ~z4
z=W2wq_h=w@7u`<*_c3qTvIW{5(W{8?hV5;!l*B_CZljm)O<mo|M2i-GU*D_H{~J;E
zp>_^rX$_&LhmnyR0$gR5!*df>D-Zw_l2nOm4gcGVrGd7ijs-VT@4M!Cs{nJwEa>iP
z?HdGdVcqYai!NY8w95bmE$<fJwd+Q9wrp2-clis`V?^IXIGYnN5D*R{(irWy5kEOO
z+0fXyAq@i5S#?Vzgl&;iAC?n?g)i-adLyKtvjonL3$#BfkaHTnlsX*-us!z@+;;o+
z?c?ufBTw4Lx;id+807|GqzOjFx<C4a{Nx{c)I^t3U=VvRUw#FY6&Ilqvl4m_%7g}}
z+y^Ohs|3lr%KffLW=7(?y-t!9P~*N74+^bJ(2%a8;be{Y#oG`L|M`7a5vbB)q(_fB
z;W~K!>*G7`Sk%>HVq=Z5T11;6F&8L^9IS_YuihyEH5h{N)y<82)bW-+SP+vLP%hFW
z<jy0BsizzVf3X07^YJ*L&yXGJ;TnzuCz!eAR`NO1=b?YCU++Z@?EhDNZf+UQbMvyU
zSRux>iKSe;-pOwZ#~yVg7ZEJWLr!*fhC_!+jl^yc=`-cRPtgr0DvOE)>d$LJ*2g)%
z8)b6#f7V~D&(Uj1eH0aya<Nv5atjO7yH~pBcY9}N@u_uxSM}e2S=9wQu4xVw@kca(
z?u;THiP-$sjC)IW_eK5)N|BB2?H=G%Yk{M%UMx_S48J@=c>}bU-$GGS6G$Jbw)FbI
z;__sDc{HQR|N0W+RaHoOSv96m1o1>zW9$a)n7?2_oGcf0JmVtg?%lhuE22wdYyoVM
zEC%8<yRR<UFg$T+-h|IuaW$prn?62;2<&7j0?goZmuzgb;(GzqH~3A>6jmVjZV3oZ
zz9XWE)aANCN=n-VpG(S{2BO~s*m`{!NzhAX7*GT7#}i&>P?dIWuN-q+J<unSDT#;$
zJ@PGVA$r+=wF+8}!xx@{Xl6mSk_dU>3)mgKd?h7L5<!t*jPMYYH3u%6e2)+K;=JmZ
zNUl}8_p7%oCZgqgwE%IVzUPO9hAOTI9?(cQx~;r)7%L0VTz34nq9QixiAKyV4QEuX
z7BVq0Jq*LxfEHJIiD=$K#scJ8WVUwgTFBDumvy03xOby+(u4;B2>LiTy<<d0kfLz#
zfL}9SQ@PdZZkzk{>js#y3FL2hzB^89uY#1|FSYgGE%q%#+=^ayWo}+P`O(b2x|Wur
zN&30VD3tCzu3fwKj-|YCySR8Ebd7Qpqh3V3MnDHUSqN_MP(xH1#=f^ZbW1|QS2&$m
z$gik)1dN+iaONYwUxrRWq`%Z$u=GX2Ud8zo94a5?l7fcs0PiEMAxU>cZZ$zkwI67^
zQ+BaBJyD}jV0d6?Nb6h;bW2o<*H8_hsOS!ix0X?E21FYL)GCkshn}cs2-0gNw(g<9
zV)EsE1;9snyw(<y*m~=81Eu~N^_PvZarB43<7{&ZdN_4o+b%6HFW<g%=k?4?Nl@9V
z$vNYTp;>BpjH>77=U*E9rLLQ0PoN+`%wA}z<>*Ky4oWo8Ov~1^HD4jv5^13(+mRi}
zR!K?8?aCE8!DC6c(clz?l!)S`P=Dz(vbEBI1N2ndxv%@;1|U+>LkGx1=W}1%=e1BF
z@=#Dk(7YB9unO?3975SQNZd*y=kfuM0evktoI&mE+Y@zqh&qS@@_6RKlzKbK^LSvF
zd`*_a3LsEY?Azd1WN_iaZJhUMz=_p2cvR2M=^b&X740k4Ux|`sRC;*$86js#)fFEc
zygt^~>k}M%1p=u?;m^uS&nl>gD*U&fsC?~ZqA=#S@C1v#roA$k2N$8B03%hvKn*Wm
z^u{i{MQEwuF^}83e}6OD+g@6Cu%erZW$juffMuew9zA;WS7quZVAE^ZlLMURQ||)p
z5Uh!}6V)Gzn|h9u0~GLwFqhS+=fTBemK_m@nR;GQsFC&9w8`q;<H8RP&I{cjzizP!
zMcL5W#>RcIDh3*~^PKOO=A`4P^B5T!;WUe(SH?KyX>J8b00w?rU(XHi-2-ikB$Op<
zb}mBZ^_P;_sEjw*J2)7^dr0YQ<EBl9STxA^c}SLU5V5CEpUPbh1)4L)hXD%Spc!)c
zs#Oa>y2vg>IQ5fS`Sw#E?uxql&}4Kt)8{cz<@AfSw}3=Q73ku{i=*iL5<!murn`R@
z_j*2Ac|ckD4G_pBIBU%-8&YH@%X(*xSQ(mUq0$@uDtm~^P12WkRW-F60MCS`K(Abn
z5{H0*0I=;El8f+DBOL0(o9Q8b?Huj$RPi?Q8zeFu={zef#2Np!sv$cW>+0%`r(Xzu
za^c+W-SbGf1QJj)T8&Zq{$$lO1eNFy($zfUpj~EbAjxc~8o{$?@@i{k>y1&%^FeqP
zSmkO6Y1JJT%fz_+C!AwB8{0yZK6je_SOO7X^PuieNY~3*S$*4$f#y&pzZmXQ)M^<j
zWwRJMN*-XLl`I_9J35-@x`)SG)Fz-Gzk|%_2^~{_Qzsqpcu_SROE{$$Z%#FOzZh1%
zqrw<F3%AOH&!XPXK@@xh;{pH{(qO>J$w~Aka^i5H1~BX`+I@UYXON?O@R&F8ifPmk
zoA6$D8R-TPi{m*?lBfU%ccy|8&1YaBHAy^nLjch-c6Vv)FId<p@)IP8@49}PSDvoD
zsHwRUQRfD5uOV8jk=&mZuyWtp^!|GahQC#vr;2b0oCaEKwqnH!5^&>;w{6|3*bNc+
ziP$cMfnKi*I0l7!f*5SfN)}Z_1ldI-@&LINAXAAzJ^gU1Y2)#b9M{dh#jWK2sc;3|
zgq<-lF&jigScvS4F879xj++nzBSB{u6vQecw-o|gTXhE(oqCC0-ta;^Rca|Ju$jZm
z>m-)L^aKn4^tT=$>IgJzw(dg*4nSa<hq{p`9zbb;{(5A~fG!7<@#o8^t-3Fl0|A%5
zz$16w*44?C8snKiOp}mMDK}K60f$6OU5H{22w~Y>k6pq;3;fzeFilOz-2qKvwUE*i
z+barI-^xw6RZ%;Mx<w=mCSObdDiODw(_uk@R-Dp`ILtww^7TafMf@#wQ*U?o1t@M3
zeJgH{fCzw{8{Dj^rO0nHn(NR+x&Z9N2@eDcwN^V_fM0FLu{Pb24zTA2SmI0PiCqNM
z0U4sZFI&kNPQ4Y!kO|v{Ch#lO-+$%1!op@mFo{bIR837y3YzVgaEK~?>jl+)0y}+Z
z3?MCAkv)!8FT~lhfi80+qx^XJxm<Eu3Ds(>)HOIXr2t#N5wDwQx^2Y9(aPr}+u%^}
z(*96kCn!};EBFCxNfSo9V+m4&5!xBh*K<0lkc7oGYi?m<8_^F`03-{N+vL>FdsZ6s
zCI`>Ulxc6-yLayfLBYjzbaaGkLoB<34L1OR$!yz$e)Kg60iSHV0d}1tda+2TvfY^1
zyh$)g<<nDh_#!I&#!yKvSz9Y2mMmYf;u>h>qMNKJZUjkM@gZKv4xL}kT|A4~S*oyv
zg#eKB5V2Nsa5N$%kdh0I3teGc!PkJ(gI`T#aJV!d2M@2?+V(4=E#+{k(O#sh^_w>@
z0!V#{LKu)}s==FiSrC>%47DzY0<{v!Ujc;-6uVohN5Bh6wsvPs#~lh`KnCXKK4`hx
z0ErcEeut74@oee@BMXs5fMWteOdefCphG+g*{mD`izkgs?u^k;?+WpIi>^u$9$wxz
zH8r|1BH+3F7ajx$6YbcfB8gK~RrPL2$SnY-Mg(06pj`;BDX7Gv-Hx@VrzevyBk(A?
zN+MFs3=G^Lawx67V~ddF#kx>4*xJm<Xde1~h>nXAXt1{N*Mf<8)b!-QAv|M5*<|Eb
zIX(J@Nw?K0Q4WCAOUTVgBpbGETMG8VA1>#73n@lHUw@`K!h{9tuMx~*cT_MGdK^Sv
zjLOlsgk#K@oBimY4b#&T=15bU(1nAR!#(!)5-<c)WuyDrV3RP{yWph)M;?<J$P=vu
zHuRTW8fZIsp*npjN%jchsb~wxavWJ!?=*1dn_+ugn(GgB4_JMcecx^qwDZJg=lky}
zIGmOA<B9s74rBBK2~5U{qvQmv;=p&eb#S;_X9Hs*`r3T?Zncu`Rt=BCQ87?o!6H%a
zkN}Fg0l@e6`}bAoecE<JIY%>H#-V>c2tFAug^6;L(imE32{yu}G$NB+va>5hMeo+&
zOgO;5F`m32fu=jRr}2kK)IvBmJa_J9OUo$&TM)Y6LVWH+IUSGlTq%4qD-S*J-^QK(
zqHV}JZqPwU<(j+~c@xgZfFtXBf6of7PyF!&J`RL#{xk+=uG{eTM=1V*>Afc{a#rnX
z*f~b5I8q=7FeZbrurQ)ii-Q_m27PgxuyvxrOGt3Tg$>Qj?qCO-0J{)X*Px|}*d|g1
zIgo@JZTV={xu~Y3%1`qwt*x&=di00}SGPBgBD;`j!x{}uO}7ljVZYE+d=n_n2q#M@
zx4U<jL#rZnqNsth$4*H&^ha+FHiYs9hFBz?B%CukPv5C;0c8N<AYt3Jnw<SN-P?J!
zN%#|Hw3m?^sj(0vQ~Y`HC*v|4!gvGho#HZeIi@7cN~;Y0=k!RgDx85wP`o2?PQ6VB
zK#e3xbo0SC_Iz*}`18jNbR5{mUup!-=)UR*Doeq=(I44`Pd0dPyC?wjQ;~qJpkQGp
zM(KEXYE`<K7y5Ye!KxvP7u$rRNq12~-^9cO+NC($3tr+@hSd-=<mBX3WW}lD&(GX3
zF`B?*6$l|!=LS)McovCsXXwn{0B{!|=c-2f(U4Zb|5wp^%Yb>YYF=RMMmP&iP$#;l
z{{VzE;Gvg-Uyk!CU%0Rpz(qKXVc9xQtb)qwd$ya;yg~HW>oCUQ8g+CbLbanlglI+t
z1wBcRc(ALE9k3_rI%%A|$r?>X;uMt$iY+)c<pafv3yK|)(Xd~8D;6HCm$%f;amw7>
z_!hwtd(X~=ens{qD}qrFH|Z#Rqlhkp#j2rSR|0jH+m=k6aIrjU550gpDk|!h%rxx2
z4~E%Q_2Pi$)Nc3%vaBL1ID{+{z)z8G&7EddD^U#nHa<``c$Ra0!k0oQf>yFVR|W64
zd_#9t<Jo}F(5K&ezrms0ua}|>M*7{{xsGTeDWEZey$brl77OF(*jTT4W{x#$;x!w0
z_gyA8-|Z(?Q(H?ajaVNS%^@Da>AD<eFPM!nX=f4<xdPFOpgYa%O@d3&*3;0`w2{XN
zuF!zzS`oQs935j^gc<`C_jPB~v&H3?zq`2twg+gyUR(G)p^K;?$YJjR=IJPW<9_GP
z((#dA=Jg4?PucD*fE?NcNVmL%RFq9kJDrS4;&po}=mH{OKRzgc4ZDL@f%rk)IjZ?&
z5JmF0u!R&zb6xeh#LNSwNT&e=N<K>=%kxMUd3g$y0wfNL;jR+_B`=`4s^=HNI<|gN
z{`ljSjcMA&+BM$Z-h@4%3f>5r8F=YdTW#AQ8rex`rcwYW(a(8c@FkjDQPm=ykVar7
zuyegDB0ZI<^b}g^<QEyEwf<Jy^zr~UUZda{#Dqxj`Y7eQl9JNWUVM40tSzw8fhNy9
z0s{jzL=bw_o^Yuj7TBc4%g^5ojbT+4G@Sx8A@*-G{{}FRo-NXib>8dYdh~LV$8TQo
zdxz&7-9h6mJ=J2FX~KHIa8^I){IRW64g?LA`lP#m3{Rc9j)TYSFIi_G4e$m}hJMA$
z?2WM|4@odJa#SlRDM9sLu-{Y~{cY;`9vjhXi$gdnCTNV%i+jg|CPxIfM@WOC*vjjS
z2YBD3%kz5s2~>4(D(z^KXWAy|U_l*Hi<UZ>n%gvXOJ!{@N+XNP<NHvZc?Z$_f%y!I
z3pZIKhkI(b#4qzZ_;rLCW$ktHF1jW@>m)V&|2#CoVR;mSgUe1nJH?0|oq$_$J8#~;
z{SH#8A|tnCM`tJLrHrcYKuBKontAy5<kKS|-SxTpmFbXwQA|*&S$(VGh!(3H^*y;1
zlrCS)1p6YW<r3w)jgN03nvC|si3k>g12<yX&^a?$PV5L-k+B<33i4!?v;0zCPL32E
z#&)Xb>lvaSX0sr@3|cL*GEsO0N*?4+6IYx@jf!{tiL)46&$p{;2z`Jl{(;!6iN4`a
zNA2c{-Wx@%tgK)xUS8R=T|`yxH*Va@skXj!DGx@6C!-^atA<HwnI!AIKB^>9)B$Py
zrQ=9anUG1{`nxYaoKt64=NG)AqWES1|Hapt$JLyFfBa(~nHgL5C?rXi>{*iC&DNgm
zOSY_K-<z?GkX%JVlF*(IvJaAMDJ7MJ3Q5+IO8lPZGGl(fKYsV|U1q3zKli<y<@I`<
za|pPR8hl(m&S2+3aMu343=IvlE^-tCyrYnlg(T;+W}~1x{oxcy<kqQoG<d_ETXre`
z6-@Z|839mq_m;T0{BiK$<UitQ%Wa-(XR$?KPE^ZAClBo0x%1d+f=tB2+AFpK7Tce!
z_F+`%nFUWP!m%2QQye$Rb-=0D^*$Xw4vKB{=fJz`w|p_Y-?{^Q8A_qK2g_YAj)cNo
zK5|Y%9?tkP{n%?kKOjYa{V1j`LB6r|R%@uC`r!}yQdY1kG~TOQn#wxuL)+cEbYs81
zvM9ZGy=)3M6zxy#blh(H!kUi>2R97gwRjj(scT#}*P_J2@~M1UK-BTJt#gJL?A%;>
z?fUg+c!~|zZPI;weFL~_RX*_?f1EgR;vRA!H?#{!2H0%w**D+I{_EGT{k`?bzt6<P
z1ef13p1bwno&1j?lfF4a^oeA#wvD$~Y{$`tAs8S@986FZwcu%gsJL-RLTq>W`L(y*
zQ~7wEr-&u2pO~->K4z=WT4d9ksl%01fBez!V7(U8I_(J>#ZvkmXiW{(26C9{1@xP2
zQ-lRD&A}ZdrDs+A2mjOjf6$yU$i-%^xh34z%P)Iy*IS-vAzCnVI6&TD)yvH6;^l>}
zAm9++-rP9C_Eny$saq$BYY2@SJUj}HcI6Y$jbM=2?&oQ#@SuFL`u5+7KYXY&BGA_B
z2kaLYvhc%!?c28pg%4k>wP*N7Zs*?_8Dr<V7#_!pf~a}DnaCBG+>6kfv~eJriB|9D
zP+5O%+g3$^3pmJcEdO-Nv7X|V)XJGJSOwjW%6{Qm$m{mw&gh-owsq@zXTvH{Cr_^5
zv!@YMhIg}J-l?khGrl0AfT8T2^|fEWela0`^j`MPyvy>lJ;t706inO%QAh-|yGM(H
z_&tS+LL_OEw=DLn`-`?(bI*4W=>swQ!SQ5zhwBs(1tbPpwUj_od3Whsc}+IzmYvm5
zn_@vGwx}~ex58~*#{wG<D!>T;RmbM$pWwW07f*bX9^F=M_emmP$0zQY=||oq^gKL*
zdxrX}`AN*(Z#8OcsnflxXW0tK^z=9vyNCqh-hKciH+Ofa*}Z#r@9R6R2=_|6lf@0y
zTGgv3es0Kj!Z_Ea8&88CJ$g*yCXyU)Ae+7(XHYsC5ln;C2CpB4@3r2dEf?_Q@!|a<
zCR}hQA^W8O=j9$UH@EeSKl^xk-`n9}=K;$lVX$)i4Kcv$zlo(k_IZmc(*?li>1d5_
z`}coFs78&7BgEKniaHZ4pUq1_AJ)-Q+cI85Ok(l|ZP{ydcT*X+@)%#R;)^vHP+eBa
zp=eeuZ;Mtf@(z&+Zun%iZ#_(w*V}<ZVUv-V)F!H{?Ag^tAIvaIO99LpGIQq42)oC9
z?NOxx1Q?!#*4IRXr-+iAw+v{i-Fk3bY-~N>LYJ>!mQU_=$w-$ti9Gw%y<PE6uw|RA
zO&Sp}0PR~8*_{R%usXgOm?CLs&vxzF*;$lYZXnTd>O_x&KJD8Bv;zv)ElPOoP=^4t
z5;s^;NH!@7ABFoF=F;jj`CrhY^HaVNPpn-uJbl^3FF-c|#@Pb(^%)HNvmtQ%>--bp
z;o(8skG#=!&wWj-A8dOYQh=&}K?wTY!UDsnH+V^9_~KQw{9ms=fu9KYgtG)Icbs$6
zFSP8$$XQUePoF<G_~KWyNw?tz@gve3w%q%y$T20yHuh`D_ygP(WBp$^QjMmgn%UN}
z3Q7+I=0lqf-qW1lMD0Ybu!^#{mya*>l47Q|Pj?6PKssM5y5RL|ubvTLSV0t49Bdn_
z)N0kHlH*}_Z!4s&lI!u_l@$Se^78Ux%(igEc7H}mSG#G`-}Lg+9W0aL(9luM@CQ|K
z;--Q}|6^UeyuG|CLq-K()o@(_QzQohj;xyw9A&rS?>v4{OQlO{Lz#tkJ7?0Vg`-E0
z7F~{V3_~wj3tgLlpzW7GwGB8v*kG}86Akv@gR8Y$->Y}Z=mH%Dj+GO%LYwMmT_mGd
z6c}sUh;cFZMtju*uRuk1a6Ad#xcIsyR?`n#EhaW!hp4N5`*D9k1P5>&MN$dDH|QHx
zZf$#x+!Lahw!WXUyHM_z@8$6X&Ep~$yA*P=t$NL9<#omIW0~J>tld<y6KWuT)oI!^
zk+^YO<W2-UGh0lY9J!P1ngg(jCf8SxrMDcs)v&gmnJDG4a1|A3n)M_QM&*x<+G&65
zvDeo(B<a0-dCL2^378sJgQN;>F)=YoQw<{)G}G4hn$dT}qrS=YmReY7`So2JQ5bdj
zkK@Hx!5{vndSPa3eSLwFe6@y6^ZguVGX{Lo!Y7Bvn<$XdK#v^-QzH}HBUesHeL=!1
z=#shpvuuc|;3;0V5AYLDNGZ>(DL?!2TkC<bcPPIeSRi=m>N%6^MIX;RzuYF-e%jwv
zkSU2#Hcd#_S+#1_04+WG&cu7)Ru`ZRdw6PY6UDydA3n3_+}dwuJl+NzaW<kyL{J(u
zD%S1n#ST^6NJRchCtt;p)@`?71%6*(ZPATlg+*{aInlmFowl`SS)sZzJh^>}fg@-7
zk8%WH8kxGW@vq8)m-m_rp44eikD3MmPJ~^`wYsZQ`nh;p+1cmcHFdGG`BGjk=mH-d
zv}(@ezTdz&MIWLZx4)Y(|L`Y9!nmimnl@<R+>57B|5H&}9iZb-l8gD%$hr#CR%$_0
z*_J%Aeow6yBq5iPdbk>>;X}GQh6Y;hs~q+!X8{i4(r)tAD<CyB`Xq=BpKo00-rchP
zQ7i6nz~o5}$HljS;Z7*`J+s#EdLOzQw{SR$X)XzJwxo2s$&xpZ9A{qoAtM^pY{+=8
zlC7TMQ)1l%23{e8A+55hJ2`lG-2;m))*+>Ng4&~k!c?6m;91ry7uTP=b;r(UMZ+^w
zXDIL9y)z#;urfM3ti`VUQrmGwNNWb_7dt1BkS;)(7QSr;L;+PrzXAjGg9rBy4Xr^;
z9KLN-piykXjF44#9X_2nI>j0nGQfmJo$~Y;dxzW}icb%h76h*C^mFEuh$gtVATk?>
z2?Hb82=f#&)I<|+k}u@Wz;F&7KKvTSKWc79H_Dc45_i=Ozcx&B7N%`^R(zPyIF`TP
zLQ~}D%><YtMHE70=9+;}<{*k4zJMZvS@feWspE>mnUWrO>G!wY<^5M+%@M<Gcg8yb
zKKu;_TES`jI5u2JdaFysZOws*s8~P}9W!R?4b%79ZWi9Zbm_DFEv{_z`DIZZs~?8u
z7s-jxx6mdzzWY|7yBL>ZrsXQ0;~c?x$){l!7ImVB-5h8hn`-ZU#%}ps-M(wKf-UM%
z@u<kcHE7TPmEhl3u2cjxyWQ0#8p9Ugta~_~)W>Ayr#Ym#SNTgx0<ac`e;-zd?i977
zfnB`K?A!0&qOOH{!z$SN@OBSXleaZuP0Th<Npie9I(hZCdxIYx_BKi#9-C#l`0Jy-
z_NqNzkt1ZuU`-S!yIagz+MK?2WO~B^Rxo*%l-CX%G^oOj(y)(T%U2$*>!Tg{!rg8s
zwV*Da-%X}GCTc1<$ia<4(T9kYgojU_Jk%NM+IUoH$BrFE{`~aSD=ofIwC%?hNxR?|
ztj2oxeUpb@O6f2o_}O;vtQiZq0ff8NfJziM?#Jb;S6e#dMuN9gPz3q=+I;5^+n;>*
z+O^88$-pU7S|Hl+(CzC1Dbmu|IBa<P5QJ8|(n~I$1XVfXnybauy(jsULYsifJ8s_|
zjw`Byp!#oU$0%y{I`uuU{ujgMDqUa3^_WZ9;H+6RfB^_g49vuNifYEKG<1~`<M1rO
zV}LU!aCFp^4W11x@nHZOHIV68XEeNNu-fqJR+7Aw`s)nZM4S!Ze1_>0YY=TTe>#AQ
zF2dV(C)!2|*dc-oAO-}t8$}k7w!Ws)-}M0@O9gYEaba5#uUbv<ByborCFy!u|M$Lv
z6=8747i0&OeLdJvFk>N^yAB?1{wB+GuI-(GfV8Non~qtyAuK?`686dISMIGYTFJ!4
zB^Yp+X@^%2_r<Z^0XXA;xbpJy4MrX5Y+lFU*S5C+bI@SKd}_PvQJ{Jd$ry*62rrg`
zH%p3|f(NuPsuMfV$u6{2o2xL+$^7tjT7`ssazUP|uP?&C$yi+E^d9d%1MG_c8t#d%
zC;bYt>#|<*Du?S^uyEm3qPZyN6r=MiinIV`f2;ew)aYi%*tf$jQE@0PrsdSdZyq(G
zjo!wiM_WrI$@4q~mbwnox!w$!U&9^?^aP0q9Cz0LJwV+*IB+}b`;>>OsBn?B@T*BP
zEz_;3Bo<JH9E~XB!U|#b2O@1yoLU(S95RG2(~!9(i8nr*2CJJN-5lR8jq`8N^BYGc
z#rK_{uD#cMX=rZWe{*>F+>5L0wJvBI@j7HKv5G}eC@O@Y@SC&u_EBBxh`|>IX8jV(
z_02g~q!k1GlYRx^zQR#uo%>_5xS`fPd-fEyG@ejVL>628X1{g4Uevn|Dmbq;!(Ie^
zs5}WrtsU9OHzddZP1&YX&+M<GagkE$@NGw5FYekmM89U6#{-f}4#&osZBOdo6mjp$
z;weC1%JEKRnVFds4L<HYZSGd9T1cFb1gd6U0`X>vsF<PTXUxp*=r+hf?hoB++fyh?
zfV68wKFOua{e~LhOae|B)w`=5I`sLFsPv6J9gfEbiy91Z<b?CfDku+8nWBwrr>{4?
zb%ANmo+0h3@BMKxhwcCq6;z!1<0)MF7^4i$_FJU|guflO5FDZ(b}*4DQ2ImYkicBU
z`3V}P7y2Pi{2OTc?z|0`H+zk_+k<j|@9d_biFbW3?}b%9ad&lkEHIzBa{2NKidlJ~
z4e(sh>Nx-s2smpLz>gS?FBsq*-!9~)pT1u>mczNv+Pg-&;K6Mc*v;kV616_#A+0jr
z`h$oC5!`L+zul@^-ar@Yk#}!jy#gQBHsvibOTekK6OXE&@i<bUMU7xhij%26tc^|a
z?in`aAS%l%OGFAs)yH9vmG7SX5tMm&ScB+mhqe_Df>j?>@b2A$!*Q10J>Tl4ZymeS
zb%Fy!W`d>+Ut`v7IA2=y?-J=Y&T?2(nzv5nslm%#Wkr!~-H$EM$*SPL5eLB#mb
zPYILK8y@Ot8ku1J;KFd>g9Z*Byz$=M{L<d5OBZ2E2IQ?pn^M1N7N|n+GOPGRP|`1m
zh?af%Y=U^-jk=7^PuE%e7F(qf@bQttD9)Tk5v88dLHqZ69h^s~M6%Z)9np<QXWhce
z^7!@@oa`$9Bg#waf>=SqA%r(S>RJZYNqSIxxFit{fzc5gp=#ah_gl+JK4#w8%JHym
z(vVpX(A_6GjTQ9f$dML=+-T15wSU9zMeK;WMad6ZgzBR5f-h{dILv70Slh!rV#e(8
zzURHC*4_ee-z!UGhQ;f)JLB&mR!Qu^m$!A^M^ySSBLfn)nPkO9(1*E$=1#L;^V@H~
z**U-FO*h*=Uc)G&q|gIx%hC<>_5VtDpBDW=jTARIW|%vlHd!!qgab&{=o>ikiP0IJ
z?bzwlL$F}c4G%A(7_E)-#0x9zg0o#eb=FAqE{u7Wc)jdOdU|7=j&?e@oONDl7d-IO
zvsR<6-PvOIzEh{wUE~&)k7A=G@jlh5VTCTf{;w8y?bOLS&nfPUVcEHZpF6h;y6L5z
zaHvNoa+OwVf<Bt<*^Mgk@661mno`xQHDx2bJ_pZ)%HfF|W_1dKpBfCaAho3jS5psn
z*u&<$2jR?sa2Kn1J4#}K+Uk)y($1rTq#YpkRO9YQH|^2muXMhcMbidNnzRk^vaJ<6
zJM{Qe#qq!`@P&iK2)0#@l32Lcsd*xFrKkkBuItDQat^%xC&SYGf6!|n=(=|rU8FUa
z#mBecb^LOrrcL*c`*_wv$Fw4LUIsSZ`^a6KMDN;w)daj2as6x?ds7Lk1gK)T#64D*
zo;+r;k`?y$4YRTZMXwuO{^g5T&-QwH=5?PcH}?1TzwI7`m$2C1t9|?5F{f3PhNMV#
z`=?Ng9kArCMe{20B`3h!7DmK{+6K39ZgA@O8tU<qLf;}|o#1s(EzE#Xn!LrRNfEZe
z>Y%1gnv8vEBG)~y>bQM<R+nm1<RYB|5Pxn5Zv6ZD^}(->xqBl9`s?vjN=}Umi?##K
z;x9LR;$T+={L$R!4V0s!6k5Mz+_-%&OSJWg&om`qAVzKMOa_lH-Y(~^&QJ(l{SXT)
zPdriew`+r2yS1I*P-E}7;McQlJYT1Vbu_K}`yY>rv-SlpYvyZd*{a>N?Fq{xt)4eq
z#`?9`mP^?WSgh0<l9D_G9*;<UI(636U#GP(*pPW?t*)fug*~L<z&R=cME*K-m2D>i
z___>sYbvVuI(7cUKIMkA7+{;uri>_TBp2yh+~W&z#pAwx)4lZeyy=0t8%KBD^ZZ^$
z@LbpF4{O~yt+S=Y>i!4%`;H1&WtsC)W%bfuZLQy}_SG5byE-nsVY+^f&()1l*()#g
z{c`^J?Kw-9&0TnILi_P6E=*W?!7rgKWA@5Rr;E!<mYL0OvoYcBwt8J$4-r#2PLGtH
zED;XJwQAJ3j}>Z_S-N!25@PvXxkXQK(@D7QLMD}1sD*Ut)$8228JbqZnpQ=MYH7LY
zA*#<Ig`Ho=VU%GS9a8Z=i48O8AYL~U`AQmj^5(22)EQ0ddvx>ol5zElE4rKMmz0&w
zXq5b!V$O!Q6x84KoU@=~Z2*I}mJ>H;xfo!X-}Sm{en7W-q?zlXOTEavZrNpE#ky79
zUq&a?#KiX=sYuZbZcTd0eK*85#?3}AN`q@_l{Ig4yy-9F$!(0wztQB+^i&CDc`s%i
z-ddqjRi{18%V~^uCcCS&BwDPyVflf1_3P9LpyV<Lk8%C>iRoq3-4!xZ&CtZeicSx{
zw8Z-(@-d|j5plX}#}PNA!VSA#XfbHAg@uLnf(6H*7^O{*HEGJ(?5+ehoi5f)*k^hO
zD&_eydi;Le_V#bBdPXl9L3DTNo4xfuC2Y`VLaODon_ecL8qh^LG+!FDfYRQX*|NLV
zhoY&JEL~sX>gpQtb#9n_sEOgdyLU&9>QiLvS86<S?E$+rAl0URXaKTaoJ$}B&vNlz
z51BR1@o_>*O4qoGjXF0QGWkqN&8DF#Atz6sG@c0L+TH$?GzvX6mlU>f26^XLkG@;D
z?ziloUdf^>K;E|Qk3CbI?Z{KQ(tIHWd^4fICo>N5L^6@=VkWLRWYwi(yCpxdtsKhW
z#j>w(Wj?uCOHq_9)ve#keuGxMF6h*lZ)DG9O`_1fgRbrY<H&IHS<`3CI7({Vqztsj
zxATmV9;u_l?dg2Ka`~$ld~(NAY?pIb9q*ll^B2hd)cPw|uG~gLIEU4l=Y4G{eB-2r
z07u@SXod83j-RWhJD=t>x_K?Oe*fA)7Y=80R0}DtKfbEK+hKxaMqA6#&A$jqpyylT
zZ05w}dT018rSfpB!Mi(md7=}qyU$H8JoV!7mt~zhb@HO?Q*J&w!Ry(p0lMBk>~5)J
zRd_J}L_)R7l^d2(@sq@^+H!HEarlPx(_6M|p=x3N>xl$$7%yuWiBm5Q^jvHqBR7g)
zXPN@C-LNkDl;SW^n!}y$vzFzHnQ$=YdFxkuXM9Q63{XlqHvQLU7fs%kza4e6JL^(T
zd$exTr>~`s^F+J$QKwI*$_#>583of9B3+8S2;SfOw_0;q_FayogqgnCdT&4l9Hn_t
z(S_*I#;!}3cK<YW$`mTBk7xL;C718@yWyQTzD$rEM(w&4MOOr8_l+wZy|ds$j%piR
z7<x`mp5^hf?B67dH81;X`}S1EY91kIeg(V<Zn=jJ9oGJ<A8o^Y%;L&&{F8|JBj}4C
zYiNC@X@?Gx#v2s=<RYpRuYwp7PS9zSPt-Rqz2)Ai#O`6=21!MG5~zNk`T8x4K03;~
zF2`039r~9>H<G&6w6M51!ER!elG69m(sEDjQaaw;M_4{@<;4`|6mm}~V^AeQDdHZ&
zdlxz0qaTAEImFbpc(&8+d`h|Si}}9BReCPH`rv{6{0l|9OhP|pF|NR<M~~5*zUxs>
zo;=ZUIA)UVqP>yS-zlvx4EL_`CRx4Ig^WdY%D8?fD-Ou~(%kn{Wdumm;1~Vf2^L4}
z>tSUdY6mWL4=*=Ae*eTliE+5|X!Jwl&o8qAOq54_JK8ws*|PwKF5OwShQ4S8b7utf
zUGm!JD-&27mT_g4?Q(o+brwD*6lGB9n{#g$A<>|>)le)zyJOyG+|<vdm+FV<kAan_
z#dgW~h<>>q9{uQ^@zO=?u-;OTcGIunl(-+nOgG*T1k`MU(_Waed;9iLyW$pR&(DXm
zs=w@y7MdGkKKbVD+g_LfG!HvNQ?6FvPqqE_w!YAZB(2@VOUu>c=g)s@JaOUDr3sx9
zKY(5q!707y@pT^Jk5c-Rw^e=k_|b+cs)e_Uv5haWYNyLrj2Yt#6*cAc$`vb~_h|O^
z?anV>FHF)u8dt+hTYL4FwXhPWZ+m6{z4dPwaYiC?Cg1)QrHd(@;kSG+X`;pO;kM4s
zA)-%``v#}viH{B>2)%gkgOoXvGj;w1PkDqyU^Z25<X=8y@QVqw3ervtp$SVOgOv7!
zR_KBnE+z8PS0k0{f(4$Dk@NjJ-SO*_(a?>s$iC=p)<hI0hn{6;d;9rCP<RI8rjqy8
zstZISLx(a>mEG=~(6G2i*RFPCTge<cFO(Pn$aFih0BTD@_mgsHNKHrMUE^l#il{l%
z8FVSCFYL9P3xbvhz>D%+O-X<dt|hmPJRblKAmiF%oHIbiVS;XVdNQ9a@I{9S_9BE@
z8$ORwz?%pSgJ4100J4rMDK5yE5q8`4>VEryZH9AU6Os2tf(5ElvwG|UA~Ufm-~Y9V
z$&`T+rKU&<+YK7T=eK_sx6`3}&3n57sW2XQC*&K$(i&|rArMNU5lkWMi&3|3_S^tj
zL(;==Di*REoM2%?F(OsI>Div?+n_shL+t2i)_2>1x5PR#xk_U$t+@bsb(H=Oq-iRC
zHU%0n1Yy?==jad!Ip1z`$N2e`)xdE<le~Hqb0RQ*UA5K}zR8U0^s|dY5!uO6#Yc(V
zkV7DBS6=DmFOSx&ZD4@F=eK6PA3p56>hm$g1>uyijVHX9M_u;LAM<zwqT3m!-ZA9L
zTanH#2BVWb#fK+i*u8e`IwE`e=!_NE#sd(7<NMxbXWP8S4N^6*cvnCif(j#=gi*qo
z4i58vlRZ;t=8f&0dSb>XV)V?1YpK_YM#8b*bm3sYgR={s{)x`>5ZXw7rOl|&N^{eW
z52Ag0N3HQ4IH1zwHZ>tH#xeG&VX5c5)obp;5~t)(<F+5aUAZqvzy;d&_H@npi?C!7
zRnx0^{}uFGU4#@}Wlg@&l>ubqa*f(BskH3{T6rvQKeF8&bVRxN`JqVSFZD!VMX%Bb
zX0`Q4D=~dYrQt80Jn=_{(5T8gR_4{is6))9T04t7X)$u7Jtm!GDU#eUAHBo_rhtB%
zR#M)%JFe~!fRaj$vLuE2mEWNMXfbpT2QF8E3NU#z@Op1fpkZEVmOsOue420t&gVY7
z)7DL6J`}IYDDFx*LR~oTyv;;eXY?4iHO*ol3C_5!0C8M?*0-;x<*=|F?IxUm*s$)$
zmxosM^7Qye3$T59uY;M4ggOkCklL_P@quq&h5#pu*{;{1!BMW8SW@vET3SiWZfZ#g
zLr+t48PV4?PMd0oi8N+NL}Isy(nwS*ZQHdQfpYQaMIxG#&!1(x)q#M!d0d%<s5?B%
zj7B!Kb#%hWqVpj&)2Ry+rYEn|UC(q|o2u@e_uk%T5&88}P>><r?8hV({c*W_!ZJt8
zuZV|@Ph85n%hOIXF8heQW+ocCrROQKHC?Lnq|JC;TbDz=x477VUiP^4x6AhHL)(ue
zy;|Bk;p)`^B#U!TO&km0Y(~c`;}bi+jYrtph0_6a*r+(?EztFhux3fHHBI}j`Qj1t
zm|s{7(R?dPxOfoLO9de<yCb@?#gsZue{d=7($}I6MQ<|4vO?GO29avqp{C3u<|`7Q
zslJp)k=-uroq*%x4Nm&8Fui_1KLIS1A$L2nvtOr19$;k4v=OKeA(i7l?PJ2+D-e=P
zxyxK#kK^jrWTc&&Kl9O<C`L_$5RK={GJe|k`i;%Nf$I^Tgkf8BGDsbc$Pb{Udt`m1
zxr#-<KX(2F@7#xIkh@agWL(b|(`-c^?)p0aR)<T;bWz-$ZJNelXg?x}<r$IRSYF>Q
zv#k9x%6p9+n_=v|ckNQ-QGB;v!=A<wqWgsYeH$wsll`j6+Eg-%)l3*!+WRe?{^LJ<
zv4V&XpKxIe?RJ))f8ZSbidvCG!t;IE#VZK<y2#~<T;y_EtL(+IjW=!#1>0-u5NjgE
zJOu0^w!yA1s=e!8vb_BBIUp-e>S!t=Hp3gLvtz(c#Sh8i%~>0~A5)F^`R&6OcOt_}
zrbUnG3WPs)SNXX;WayxR^=j4f=hDqMXEB5Z&D*?3x~|Y+;~Xq3+-13_3V3vC6tFrQ
zf3xBFS=X~WW-r~3@V!ZF)3m7Q=oEC&DS(3+ekX`*E)*sW8=Tm02`w*cGoNJuDbK~Z
zk0H`#loYSc4?lg9gPn0|&sg+L$rKnLFCSx7*jy#oS%z!pJ`~88faFC;Imp^9PAmTP
ze1R!FNgMZA;4kw&SB1ZOhhNC;(Q?fa<|ZF_aV7QqOHKtXH$rI8YSg>;`Fo{psx!zf
z5W>H$q2W*}U5_};Ob*&1`fJjdoX>r#`xIb=<t=GPr_Nr^w9u`TtA_FuO?KA$GJe&n
z^TgmwZoOx1FO<DGw{(@qu80PQM7A~mV###B14nnN)j4n8%qv9eHs09%?jPr$6lubF
z=`M#mHMun-jMOWg6x*Zohl+~g<%A8iP^qloa*q$U(DG?V-66|%*X6YhIBE3qkJI~b
z#ZLEm!68USqg}|E&WcJvI+9Q9Y|Z|kC|^OOA=o>4uUO5cYigY}Tl0>(<U1*6yu}_F
zzPaH6RWDy2!=D>6-@jvKY<^tfIb9QbEi%<c>y|)6E=aAk<n05>Pf`vJs8){bj;d-)
zfCntSEs<ynQHz{2urWI-gV4m+Tj2*@AA++NjV4vL5mdw5(5=(R=EzpMKeh*iTb!39
zjWv_7S8pFa#^r4-K)RD@cLWnLn;bIz;_7n{?16|c4b99ZfT-9L@Y>OJx9j}J<ABJm
z_{@bV&&gdQ_-$uIYwjK%-KKYtt4fTVf{p1qD=8=9gGK7vjz@>@2_QVg>f7;C_Usp)
z#||}X*O~XsrP&SX+Z*R;3bl~oqoev0YFj%w1+Mv4zEmOrfLU&kn!!QMqp1f2+alz;
zVUzJ&<1Y6kV<5zpKpSK0;8AJauq_bYVmV3sJgA~e286{4U4A`!1#NM~9}XHk_~xBE
zBcKqSBecu{^DiBfaRILq4{hS&)oau!7$-IU;wneG5L@|{vg}|8p)jNyz6{m+_TpJp
z2+ZhtnPa#JmYAtEWd+^GpIOk4s`Ji%oiQ>2T+mR0tEHKv`dwUuoEN0Lp9moZ!Nu-3
z?X$vG@*mP)(8}DWPY;TILr?F?sZ08MV=<<Z*%3!^{VZ_n^LwuSdR|9ICkJkfPcACR
zNcGu&P7itXODQ_wjq4HNk~bMOZ{Ck{>$-5E4Fp3W7Vkw~*3GSPRt=fF6~qah$Q?^%
zBeq`I#)Sz%`9a^>qvUmmbTZ`1A&@wNg{J$e1xeFA7Ss-*uwW}%zy4}VHtHJv%3)SY
z)YKt;Xa_B)3uLn^bFs%YST)odAU_33g}5Z#g#;8xr2H`<@hK^G{HeXUxf@mBWB=M-
zF}B|X`&$*>EJnR@i9(-f3UAvv9v?{M(>+43tuOAzHCs8H<ey$(S>uZ*Pn~)QWYSrJ
zLsD*8MegD&>LzdAxM9hC%+1X`j4_yvaz4c7=-u%}HVk~C&BQgIhJV<Zea}c#UFdDZ
zkr_%!@<Dn|_i05lBTD^OFyr(Nom>><(ZV@%<`jZ*%zk=hL+SIq^9!l&ZUZVK2hbtX
zkKs3A=zcP>0bI5e&Q-gK7sug@ACL!x5P}=FX`_;Cjkf@Z(B;EkdTeOKZ#V9_C`j-G
zj`~Yv<M&vE=(XS0?C-mF0tc7!$9=5J?5&kPk&YJm1d`wmV)pMbv(lF>$&E))0j}1k
zw1+-4v#4${9cl5q_R`qthiiJxBX;Dd@5Ga|Kq&+g@+o5aeQ-1Lyek&Eu{AnJS5n?o
zjnoX2blcm?z~B{Og;!{3+nt$oZnoa$&EhOw=WS*=M?MD%bcg=(l`C!DNxt^uW!}b?
zB9;+89shp)eeQ=><5SbCUA{0GPUHF#0?^wvFqr@QWnO(psJ9ubsnx5RMGSF2trPcZ
zMmWR{2U<z1%>$afU-A2o-#WAVy_u=ve=OJea80zLQ_DzWHq@6bxqKm)<E;+tH0`F(
zzRGG_wpE4Qt23W*rLG0Ys?>pC1$*~tn%VR7ZUGCkKc+M}u(Fb}pfz3MFyTAAPA51l
zv@3L3xURd|HBG1XFK1Vk9M`Y9xqbt%JGDMaP<Adpr;FmIZow5wni$zq^JLoVah$WE
z6{g4W?`Oi%`(`aWprg3;iYe*ZvcrMM=OHhpP4M+kT$&HHYRE4g-@RkU<S_kZAG35i
zjuH=DKqa29Pi4i@8g{<P&TEgpe4lG;zJ}i%1WRKNDwg}=#Zib1`__Ev*t^?`W2ufS
z8DB;5Y%7|&B@@gh-myy@g85twvKtKfDW>%Cec7hVS^IjpTv?WVD<vf*r;BGG5JSTZ
zD4=d8)fNB19;7($SFHJ%uNl^=lt`<PsZ)FjQL0p}Dh5ESX3g&O(o`-_oOiy1pIRM2
z+`4yPRmHN7PVZ&+9^Sq6A~eRR6uJ977P^SpM?ruH%SUOT*|iIin@lZZl3hoW>hmOu
zi@HFaaNV7XV&T4bVuyKaGy@eBy}o{{KRGfK$l=4MPu)Cd8Z?$mVQ_?Vb0Y*8>pFVr
zb0gSN!!~a|eM}iwPcuZxyx+Oyt*X15xmR=NlrA5XsQFC)j<7wUXbRFVtUOHyA{PyH
z(?~`e2c;#l9=~>P5cHwMNYU#)uYH}sm*VElql#&JVWH+g=H%v1b8S4-8r0PG$nZug
z>(QfqXtg+EWM#$hjl5l6X?{%X6cba;;AQ_Va=FAnVTn&1;;o#(Cu&SByIw};u}34J
z-e{+5s0tY!6`*NzK5>}lDcxE$(AAAZu`pCk`-&hoYtO!4)Hyjhf#ix7YI4}vIh-~f
z8u-kD^Hl3^iEV+>a%Wx5y|Xw$*yAHhr?-WozD9L*1`isP2LfWyx}u^zgWfG$aNDg_
zABu~2U%0S3BJ%L}Z@Ht4Lb`2s`m0v29x-y=ULe%WhZCZmy*H~kRO3Ot-miEJps@2k
z-$V1VESz_~uOEpr3+O9*zK_!Ph8fGtS9u3$2EGouo0cX`|JC~Z#N9_7Z=k5NBv4WZ
zO1^;9QS(F&63a$q(q>O~{`+f^$KEVmiMch~dKYa&p)`q8qh2FmIo*5ws2>ZJdST^K
z32GRgd66)9(h{9V*D^E5<K)$W9L3H}epB?uq8m-dX0}ohX4(;w4QIP}ncqcAI(-TM
zY^haJS{i~>PaVj(hqdXTmhqeU)`{qYB^_2JNua_$<mRoA)1)XqSMam-x5U0~ctbM$
zZ3U(apZmx)9VaO9^-ncAoS!sBGtp9Uok1xpaFlpj?>&2FEz#iv-RY?R=%F9Q2b$$g
zVG!G-Iay!M?ikx`&zDvjQ*cJb`JnyBV(Q{mNx1lwBO{vZw5g!DZy}w11=nFuZw;(}
z7)XuCA*XBR?J7@BNpuVeW!R_Bcy&%eLAW?SDxnZ9cQ$RRD292&HMR(sB}(BY32MQ!
z%~)crn43uBn|R99)yWi?6=a@sK+U>*)|8%YzfUGR4taN<+c57nD9BzPHvJLpArl+w
zcV6>lnbvn-89RXpje`qn3g(4yD-rTzxD1uC*g{J!@j>c9qyra9p6}J1fCkiII9|`B
z$gEXl6<zh{@AFeU5l8=uJDE-Js}2N72&UG;;Alm~rvSiB%pR-RhI8N4Y}IJ~x@#}Z
z{7Pjh^6g-(p}^AOT8-!?-qh4}AjOEl5{h!z9GDdTXVKFuRnpSZ@{sa)b7lGM%TB=3
zF@9@G*>Hw+20)%<jYzEa<Ezh`He<%_X}ivGO!83r7#Q#*?MzJ_VKh>=nL43d8Tj{i
z8*}YgRFu`ENs?{^Q1_Kj5_s?lpRDb}d1~A_W`>i~OwNSGuwk7|>nWFq3la<bHsy7K
zk4bifeTk?3_d}+ny~;c%^BY^K+_!AGPtGicrsI8Z*2PaQS5T=zNH^Bho#*-;HGMOZ
zIJgVSvlCOrJCj43IH#Ws)}KXW>1(?1h#cwZjp^Sh6)>vC9VfXFlB`Odo96V`$2U>|
z$xlnSrr#;A!-L2W2!x%&KD@^&%!V_SmDi5u1_gtQ5jc60P=dxvlcYUOH3yd8bDJ`a
z4|7fo*ua%_nm<1hIZ_C}BS%v^N+N9rDeTh^rZI+GGP-xToW3)ngQwPS04Efe%%q^!
z=yGGt#c_+LCR0f9soyXKn=taTr?gPat;)#pi*k03GHTg!fVvA!%YPg@KK{mFLf=9<
zT3QHxRK#H7JW78xPkxx-i5S)D`d2d14>s?btBgDYwxf=l2~1lG4s=u?2t8koWzeoG
z|3b)Rq&n-G>kfK-h%dcopO#{&O<8C%*~$JsEA7?E*u(6rMzUh^S`Z3(6N-D)3@#Ln
z3s6@#w%FfrpV{g$lyr9WD;~0~@nL^||5c~Q0*Js6d4(4g9g$FNH4IsmL}Pg)e;+AJ
zGOoWxU#8K$dVJv=EqUg+8Y$*-XlyJkJ!yNuvoS@ZknL&mwvuMo7IfI+;qd^L=#C6E
zbB6`(wVEOv2|o2_5r<GccYt-d^IKKL(3S+jnmDwG40NbkDgekc>kZAQ@}YY8fddC7
zta=adN3g{yI(Y2Zu|H>OW_()yN|n=WvW^f2JL=Ek4mwV!`KDKUjQI+x`lAt>sH_R8
z>p@bIBlv!wlkE0vGM#Eyy!*SFV0v7ugx9pP^7r8!gh<jrY%oksd2bhVX^!FcJ9k!4
zv<6U94o+9EZrzrukcpQrO80ogLG_20Rq56)3reLct%Ou+k{}=w0&zRbJ6tivumX5{
z$%0UAa+|V~G-rhx9H2f)<t9m3TN9JS94E!?Pl&S+v^cY=_EYNvcQEbA`tjq(2Cd~S
z6#r#(=?$k0(|}XS+l=I|1mHvQa$7=u_u5-x17LLGac}BEdZEu^bhqX<H&Xod1QCQ9
zNx@>bQjr6tBCO7N{@i}jYVFa9oAg0eEKrQ87hQOl&FGFKn(|tCj+c)$Ji^(M`%81r
zaUkT~di|~y&&)ZBk?n(NnywCn)rp>;H35j-grLh`PqbLLI)NiwsYZ?*DXbSQb*55Q
zDlhl#Or*4x{%ts6{qedjPmm~;qvp|k&)5+}-zNc)k#-DK3#C=`I!*F?`CIMvlJo;;
zS&>QW2a|=dy9bh~`l(nDXF$Qj3B4v@$jQu{)}3l#u9Eb3!pefKhsF?`lRhw|sP;D9
zz7r^A+^rvT_N+6p?DZxNhO5P7P<)j~_NCURm4F=35AWa0_jxi;xQJ1`->;VfIF%8z
zh6Pf_N&kC{NOx4gRjGg?PP1o+0{h+o@TTdk;@>d+*yMHe;|`}E!B9=MFX<H`d-*z#
z!VzKQ(^JV7yB3(AeOQt}EVlL{hx;|ozs%k!0f}UG^SXRv#L6o$X?sw@8i#txIvQ{_
zg!ViFCZak!A?hpn-}~pueV)kWq`8vS>+F{=gMefl7}(v{31ydSPsMx!lY|E19J)`h
zA3EW}S)|AU)#RMrM%Fr!CIN&K-g58IdYIgk#6npx3LVh{z!#}ylz7o?iZN;*R5xdr
zy${frGf>Lp>*S-ObmfYCkZx9lGeYWfty_(c^#05y*)ove%FE;L&@QwwHXaUybp**H
zMAMQb4`5c3FRfjR+pA}PT0YYKGE_q$arsOrx$99x4Ksn@xw*SG$6I<qKy{sTS)sV<
zaSsUO?X$>iKNJ>rF=<46O!L}sus*>{yLfa3^aUN*y=l`BzjPwYR2J0fbw)<Ut^4<F
z5WIgVD#FR`pt;J1Kjd?HxvhKvnNxJF?x>mAq3cnUx2(oO5Qc5s98X(X7mg4&q4%w2
z!6fJT@Mdjpzj%M8U8!dv<k$e7_dbEm)~Rf9W5ytKCahAFR+hv>Yd$}%c4vPt2Vkde
z9>pALm=JMSoAV}ZgccG4jC8w;{YqmHElxooCi51-bk2Ik?GhT-``E;=Cr>7k;x~-j
z^jni7HO)SogN!VncY>n}GU4R$g0i?qZU5+9`ts26(Z(FrIes)Q`2O+_6sj376)C`}
z@cTz7Qp;y+c2RS&IZsj2RQv!2DwIEOO9-Ks!%Q499`~h289I5%#Lt6t)sS0VIk%%A
z%Dq#Zz4d7rG!vw3xt~cbKFQ4#v4xN*<SvG{-0srXk{rrK2(8U|mYN|9)N<UTv*lzR
zh|*M|wg}GFP4NEn3lv3_IxztsAPH0+C$hT-m&yj!yA{Xh&DjECFS(7B;0xr25uoSj
z*5XPh=tg^#Q*vWURKvD^b^*INmZhyZygbq6MI@^a`hF`JB@?TAe7ZNFW*)kk<hrAJ
z|9vw}u@w2LGw#;6=i(4piK#kk+3#pd28F^moT5D<F;1HFB}#n5=8I;*7ZHfnV#%hS
zv=_sQc4iEC!mcSbsdTSWx$;XGnWMB{oiwGtc57^xZx5FMCfZdZDQ_5ubhU{4@`|~E
zJI|dvx4C+SCy%wstWZ16;xQcE`;gL!1Vm8Z5c+lgQM{504?w$NGzkJHy@zu;zG|8G
zeln^X9A?9cU@2DQ&NX^B`M%dgR9jFU2!xCJe}?Isyu4s&1`*G+_d<WInD(NLwXmh5
z%t{7AEqC+i)uYEtsHq1{=AUogpuz3{cbk)gbd!o2eZv7tv<}dA$?zO*M)9mhBK`pK
zJ{nRONx<}yF;H<uOnzlb%B{dfhHo&Gnsb8yeBHdSC+@@U@boqyWh)9_9)yXye9cR-
zJPRveEv(PXnL+dtq6?uqHy@rjiMTwO>gg7D>NROHTl)_b%(HRnr@utJU&_#xA9pr(
ziWP|J=zf}sua+TFmqnXdUn=|OFJU<6q(6Zq8J{*As%qh`Y_7?x^ypCeco8Y#m<?K0
zc1wDT!`Axkx0LT_WBbq)=T&!*%b@|dFBh98dI28xg@bgf<Pr;{W1=pmOG_rH=1mzq
zeE3$v$v35?xS`yY{*Mp?AGfWQDJEIORS}Ud)_mDe_P)WJxW$jQUs}-j>y`2|9-4oj
zct$t1-lnqmBn%-C+t)9GmlJy)hWBYu^2E|>oY2GQqqGTE3JVLzUVQWFl~59)+zJVC
zxKn|oEWxBE27y)F)bQPo$PX=s4z)yY%Y=Y%D&J-}9?Y)_e1^u{k(|>3pmX@pm7%m=
zqUSF&f5vc$gvX<-z%RQi!I2Hu+$m~w4yX-5MhFq~pzlL*-TtH>fiVpaFh=pIsSZ-6
zLJV|AXv^WpXghRgpQ&K8)=RFge0%P#S=V`wMiZNext1p-ySRyIUw=}u>;xm7@_AJe
z#m$0r0L~vqHSpLqV%<88lSJ+@0+yk3^YB3RxWXu47ix8kC(b>OSmzb<>My3hr2Z27
z|F*Og3;7blD&Aoux;qJ(N}Q$FV#tIGrQ;_ilz*;KoZaEk_WU!tYoE$KxHQ%LJNppP
zwtaC+I2rCG0Oj6hYrnXp(B*Gp!PCdEQf*e#z_*mcyodz%M7DCd&ru#z(h?cTTqmk@
z9!m^Ccp~f<w3z#{yV7K7aP7Y+QkF3ity>>$!OT%!movxGxm3yobA3pNL@L0k+vCM~
zeY60)empptEILG+yHl%Igy*whC`zdwfWmcnSzbXoj!#SWY(Ah1o$*(V`g5FeBxCP>
zd293cMTtXngE;}m(V_R!vgkp?a^d~;CM&YHbw3FNOR2gMJ{A#liBVxi#VL{#>P+3H
z=<enCgOIg)Rdk-`kv%dAorQPeOHR{vle>n($}wW|mMu2iOSpqp;}0x83PQ}i>+0T*
z4AuP$Md`d2v_?V}CfN{;zY8A4*r>JZ)uU~UzC;0viXdXY->MI)saaqW2n=lz3kX;Q
zzIiJ?UWIPY3aOj?33n?LjTMoAJ@HsB9Vx&|;VC3`lyHlZ-46Uo_Bc{z`QxObzpoSj
zMi8P%{$$vti><JC#KB&ko|7K*t)h61p%xB-*&R{cVUKSx`R4UdN3Vwg+l5B~Q7>`6
zdh1r(EYo?nSFJkDclfaXAcCxt2%UCNt`{Tz|Es&Zj8-Elt4>$=G9LL$3XlW|<w&()
zGNPxURYB)n3A7;UV%kda(t!YWpvS2#Jr|gCVn?KHjg`CYVdoFeFfD2{cW&!c7Z(>{
zr=nU6MeR#q<h-g+yLNXBYsEah?yT26@|S1No>>QO*SIvdOF0=kw*8|8I1wFfM^Z1A
zji^3f1c=D#7(+N|Nf&-bK$G8p|D7~or#?B1I4cT*jlI6^+rOWc4v@=9tU7vfAUl95
zw8Kyn%$q^|{LMvWYc{B>C>DpB90<2(>G_D=r%#_IL!P`OOec-@^3#2r{u5z*piFfD
zy;(ZCn^Wv22a(=FM+q)~i?&s)1ssUDtxmL&#$@{!MvVz4(;pY5yQngdcp#}PBHGU3
z-`Js-geki(UAi<24EM;Mk>5d5Vhd~HjA5dM;r+FnHVwzd^PuQfjqJmM2xA)#(j%%1
z0d=a)pylr`?IDB@iMYbydFMzvN|9vczIx?s@l8?gTNQFr<@MyT3)lV7FZu+wT$|%#
z!cX_fx_fsdY=<W(jJrA6k^u~Ri{TmwmEY)ipC}X6%Y_yN;@<$fk^@ycmyK$yyqT;0
z9hCxIUn<ueps$O&Ic4;Vkt^v$wwZYGJe~WD;ROZXL&}WJTJiINeWaRSw52-=?SR9C
zLZW-zpd?+zfAk>@C@N1puYQvzwAddyb&w~SYui?>TE)%g9B*yGItLIS;DVz5VxIf=
zmsSQQmuUE8|F?L1vr;fj1b?~xY42#+!t$S0P)t>XnZVZDT6od}gu7)NH~Z$UcA?_Z
zrbH24y^M-$W<#FnZqQNQ_=#V4WPc)MA&v;7C;M%WN{V@~j1TKW47%aLMVdHBu1=MZ
z_848ot<L7}7%abgBa|8|dPcUz6-ivPb!#$stn2yfu$VxYqiY9pA+ZVV1Fp$Oxvh-F
z!4{GR86oxpze;5w9;VSUEhA%Z)8BaOJI>KJ0a^Ke?cbfcaQ=KDK_21Jxyu<<6|ZIg
z?uq#tn)K%5`1!;x_Lb8vt-a*BqAZ0yq;(ROoWKGc-MwR1`BeWd0#PAOJ&^lQa$aJz
zc!b)4wxA2E#DRc>5xZ!e%!KRC6OtS&r*(2Xi_q_%Q>$oiq!5r;@{VCex}qdY!@l#^
zVGjwScaweafjCdh!{HyU!>}F4yO8X;EtXB4gJ20{H3a>U<c$7Kw89RPL=i9%bYw96
zR-0p|XulYQ8Yy4qg=&S3;PVIm{QM0+!{))YhZ47h+2_xVj5=|`hTafje(LoD7&X+K
zZw-BA7z{nomC7#3k;$&6$5E;|5TtJt(V#>n9F3ea0Ut9ZTa+AKt5XFf=I1x|(ZowM
zso`zVULHhh@sfbibb<|%6Ts6ZgY?iVg%I=nIQCux;k9U{)uk8LHr5({Bq*hT9DK2R
z;8!R5L=uol3`f6le-<;-ZXy^kw!#ZH(5hOsO<%56RUY*+ER}8J<@St@2M<_ruqVP9
z(IjVg{lDSk=gX3^$ZK9UR0L9qLwrGDsHK)F<0FMi+SFdTxhdS#n?#0m+rCG(`r(n(
z<`x0|2ayGpY^$l9IsNk%#X15Kk(h{u;)c$BeBKdkB>ufGZaA0_j?_$apv1IxoU|XC
z&NIX&knssb#m;Wnv<XpG!LEg3r-{gT1SoO`Zpg*leOw$Oma)qlt;v9|U5&yy1T5fU
zvr1gZpWE)$1qt@x+Bde<>qkFzj@+F)oWS=q)J2Gi-?bh1V~)JyKa<4dCH3r9xO}35
z!*ubOv2p@F4)thv;naW_LM#tnX__&Q$p$RvlDLlqKO({s&xbT-`5L00zy7`4h9#u_
zJZ$?ziaz%n68+Zu#$fXj8e+@_oBTye{;$b%`*XLKS0L|X#DNDWIbRSG&sJ&u@{1*|
z4c@f%%9Ulki#|RWJAW2+5vG3G<TD@$gV}^zHW`ZY<+ni^Hkd`UY%Kf}A9&qe4>;p2
zoyj~>AC3Mk=Xt$;G$(bJfx_JDGfaoxt&Qvm+G-D@uHD8;L!-Xsg*)W{&=8x%=CP{>
z#Tf#I(0hu09K&k%DYU#;%0Zst5vwB%0{n<b0dhn6^q&j=s0po376FChPg=C-FFiU&
z&aeF)b+~V+M6vxJKB%{C8&TUpgUtKPB8nIswTUnhA#nfq)ks+>olXI|5&pD*-nkjY
zrsP<n4H7jG_^d4vg;v-%v_ZJ!QRo_OxXL8qQb;GT5x0u@HIxMjKfT2pJ5jL0Hpy+I
z@D@Q$9yK#uNYd8mnTc087#v9g(k(w0D5%fJ+not%t#(btzob$qbcA#?VV@<I_%1uE
zjiiJwJA!6`@8A4Hq3f$)E`w|uV0-S%m!oIy5xf@2BlRKWNikw4`n<bbX~Too|LdGS
z=|H(!S7HWi1+Z#p%8HDSG}HjqtK?{0|IjaiobBV$(e17z%g(u7ks>#iMG71^8oM>H
zy{K^sEN2M9HLE_!H1VIrh3NAVsWq5q8b=VyX^|Y1)P<<a-8c&{NPjJ`6Zi!XJcfSy
z!TNg?<&ERNzCxMzmUm5$S$Nc{avkI-BrDICjR0s)0SZI7AlZ;;KsX%#tXoG23t1E>
zp0ltZ5!|Y@*Fy<iW>R@30Tz^{wEvjXI*^#p3Zk^$cMC)QertW8HS#K9S0V0#Se7GB
zqziTbHEK%E2(k|riaRY5C>p)!M7k_o*xB72SRR};K5VHhl4V7q|68_PcL`Xh3O_$U
z_*H`WXj-5(?YYP><D*EX`0{eO$u?uBM+bAq<GDKsBHWU;{d<$mzf$%*3n&cFfh-8B
zcUN{-cr81W65oT@LLj(cBA7vSze_0;w^<^QA#*d-*B>adCv4%V7t5C~Zxp)xv&N1d
z`D<rI#nSkv$uwMstrAFQ(V~-b4meyqPVTa8d-v|;n-La66y&~sJ?>wdrM(N4fe{~o
zrWU&2Va%BHb}xiQ2bKmEx2CZc0<5NW)qK$F|9sFY#3e#4eqTg78%OSY9B}IN>D`PV
zwW>TNYP1N4{tfe<%ws#rmm6+qM!KBl=YVpE%7}h+SrFF`#AA6)RY5WQ^mB`RS|YB&
zwzDUE#$q}-vPHC{=L>vauVe^Q(wN|Xqq8&nK|!3c0Rlykk&#E_%}Jn7qviM0nsRFH
zSkQ277i+jm?H?o@yBs?JDpuSf7aCJzU@+J|wCGeS!<K`IM({G2Kz6|k=<bK5$+72n
zp||YH5JtM<trowYbDy~CBw0K(`Am=x)Fm;wI*>FX1_R_pB;`e;yXx{aqkeu(?TO%+
zgdyj|5D+Yiw421|)_dRoi%R{t8G5Z*f%{BtugQ0L3Fw`SY7?25)y(qDhO1uVk7lkW
z519V*n$S|wE3;!VGMOOKV-af+ZcEvaW>by+v#E}ZVjyN^9f%A@5K5R%$Yz!Vx-u*X
z|G|@4kOlp~GZw5Pg%N$*)GQKFAxe=%wFC6(9qo-)ge0L28h>u*^>1izo#%X`ajl4*
zz{^J8t64>P(};{o^omk*1OHPF*_GzUau&;>6$6N|H9SMCHt9dlm<GP3UAOKbvA%-`
zJF^%)nW^EC>ebd^Bx-EJz#TALQU*s*qir=s?lJiTWVoywXb{P*o@zzTjv*&kmo33)
zHGDzPpGRDeIK~S%=3VfK2ykJ_QJfa-P=e89ow00ISMeImzTAi943dbEvNAUA?x)@~
zen5@c2JOT4-XDhUdf|%M(EGW+US|JK^x(5i6E|ta(-1|(?B&ImzNIke#0h0D?ZFQD
zXAkaoBNIWR=-9}5&^J+GRft@n{=h|wVYEmAa30_kucRL|vBG>)T22ijG+_Defy_1s
z(vWRlFzVpHDUW3-if@f6sxTBgfiv8l-IZ;1Mx%o~WZRTlHEVAF5l$6vgpdg(Jaq>L
ztyHtOKdveBePa-Wm~-cDktUFKd4s+>r$&Tp#6>7{Ir)hLYW5L@#xS)&*eq(jPXj8t
z)se*Z9}7xw2n4P7UkfS?^vUu&JPX8H-~YLWkM@$b%WjdRNvcW%@VMFY!m7G{bk{?4
z{o^=VoEE&Dx+--J<<#&EDYS{_#CCG;MK4cLhKxI1BBgK8dFnu-U5ReQA$om5&_uW0
zS&$A=0YzZ$n&IsYL?eL>Z-kWp-J|M?+wz}}WPaA{#8YYxwrVoG0xAJaFKnAPe|`s=
zT^&-ck>oL~n813FkA}Fg*hIr4(X$d^c<NK(7)%{^i8D-_jmW3jf>tU~CseK3Y~R~3
z#Zs-)Ysqk?bU1J|h&GNUCnt}+D~SfRzvAbhFvgK)8R&y#r1}55vu+tNbh?E-4#AR0
zPKZ_$E$NN-zTgufBwgfXTn<zvKOBI+QyCDQfaEG`9FdfDy-mxdZ>u1ADm5ER^_&Ro
za=IHS+G8m=Bde#5P6Av3@A^c7o=Now^~Ch&>R=Z(I!b(!+qnNc)Oi{f2jTcJa1_%x
zhCj&goBxw%7^ElBXR&`Pxu}NQU_J1^vwpSw3egXn^=;MZ4C-PqszcR6(gAI@%bHu!
z+(pK{vXE_vPM{4?P~7^|8*CMn%8<K={qkP7Zb0qUEYR|wA*E8{!dOc{w)=(^tJkcV
z3go%){1E~^Ir~h`$)Vtd^$sNn`*Et=Pm3r*!}=2?<--ZO(qq_RFYShskxPcbIC)c1
z4wNr(QH>Vu6`$v@pJSkyi4%&rszz<v<n3CBra6Uf3y1Dqof75a39_D3>EBpe(LO0T
z{NTZj+x|I+j-&9P7~bd4r)&Q)8l%f(T68>vyn1|NaRjUb1<(+qu<UrUGXM><`D}II
z(#Ok(aE5O!-AilD^C;>&s8*2F6j4+0;@k+u(3xnstNll&G+QqjV1CQ1%^`MIR!f8m
zv@a9iWM1K?p>VsxIISaK;N+RLziq!zw$&qK+&8nYdkZbq=AN~j%~#B<V}5v@TcMHR
zUwRkfE7Yr8w9GR=@2@I{m)Uw<)$QH5f=QPhkG&^U{3U5X_4T!W3Ebh-x5F}@tLwc~
znblYIojol)rub2=@BU83moI*bp1T*;Xy+>*5=erbF=GGm+usqMOZ%$#sD#6n5y!IJ
z1d#Up(Cy!=vjXPLpq))b2>^dXN|3~etaelo4!^UUCR27nmi;iAq;5OqEQ6#wSzS{+
z`+$B+VnyKQjCh?ItNzE!-RrjtflQddS9aj-&$J=0*#5ivyYPum>Ub3`=SgZKL^?%Z
zvs5mw+?iJmagm(7(QW^P-5LUvZL00^iVcXQ%6oK1hf80MQ>fDhHDcXEg7SIpmXd2K
za>Dgz6~0U?Wdv8}J}<z%s1>QK(x|%^<s}VIsIE8`pt%?Bn?a!`SCzOzTenSHLu_UT
z50Ok~6fr9Gx3t>Sl7nvQMhk@>FV}UH=rPfEqQIfiEp9-jdZqWxrf*s>TJ;C|o|250
zmA#mh{H-%mi`D#Sw>a{2sj_zl*v08|Y}H=gY<VhQT4cVNF$;%40BQaHyA)S<>N*4G
zA<KEe<|}sw`1WepFAbDVxs;uXo&u9|l2V9F)Yu2jSQ2L@I6^B$Lb3)s#j6blf?v8e
zRQ;21DmnIaWJ%^QhpY5tZOLP;o7#kpHW52cpeP4|JRgy!cR|zxcAGml@>2|DCS&mI
zHZVWO`)E*??Yl(yLxQUg<lXZ5ft!W<WU&#tBgaH914tj6`MyHV87jjBaTjnzWZD?z
zRvD2)ceIc>RA9gUau~><O?BF^`HlYbto7iDdcysQZfQhC0lz3Kw9yfz02Z;0p5Em#
zwS`-99J9P7SSagdO+xIx8l`?0+D1`Nvu;aWS>1;PoFS2jq5(2i5w`4=dY@oZG`@Dk
z-Qro);_ySrV)l5{Q<7X?+-*Ad;jw=5%)rVt7WADXbHaiVAibjLEd`Kh;*RQgh>dBX
z{||)9nZv-m0<V(;0rYSL6$=Ui;}jFVAjs+2uGIs<#t`A&UGS^89?*@~G+NCMh2{tb
z>})ha4%g-ET<P129c-nNQX8(`j=Y;XB#45(>8nU-C;^D09KknlbOk+;ljYI<SX4~{
zww*b1W>YCsSzhg#;b$dBIM}8&gJ#p#55@GfAC#m?<Yis9`%=LteB}doJ=8KIKmGY<
zH8M~^k~QIc#fpX8np+TJ5k*1?ms8L@++izNU@G-^f&wnuCaCSZ&noCV%ws(r9b=VL
z2td*J<}IgP>tXD~cy%FUt}~Z#k6&Fyp8*@AiPa&YH?S>7bK2j(eW33pK<gu%q)?6r
z=A~0}@(0gqp8uk~mk+%`atp0_NeElA{Mld(<RBv!5C5ZL6$!L44N^}l+z}?mcTN_T
zXeJy@o`-}`QWh@7c5i}2Xs2h#&RViG`cf`niTWdR^Q-t69-4U>!J6txN@x~SDbHUU
z4trMR-7>-E#1d(12T~^c68U2Mr;or8m@qLBeZGD9C{F9CtZ}n}|5@WhM01Wi&EGgw
zK}oJ!ycPCMl%3(f5N3`Cv%DD>H-u8=I}41ZasjuNLRr?Y2?F*%Nlb+imMT}Zh<9H`
z<CufCPj2`lHJ8F^*?M>eX$ng~9lm{M*%ulSH012J%$QisZAct?g+k&jXR;-7)-^Ae
zgKP<0BFC89r$=60H=r^WNayFxGk4#-nT}9pg@iy%s2DGx0%@}6tA<3n7_`~6qj0P<
zK(uer@JB{l>7R~S{_zOlvNn(n4Ih1){;uS-slR0q$`WxF5+oBhoFG1p8@z1U8TbVO
zcL<<PTvhF+aK`!CU$r~^I)HF%1$cuLRZE8of&n>54E2hj*5=ncZy|yBmbc>KT5oAt
z?OhO}_+xLTa)&|y;)2ljbX!dnTJ&gPf<~dAb@76GgAVhoPqLD=`~BRKp^piGqh};E
zn?1-=I610q#-<kXnCJ!~sjf%g(R^&G#dfNYBva&`rV@q1Y&HjJjemVid9uUmxRXkp
zp-^{xK6!Xc%KP^=t*_)P7q0r`texHFJQOUPbsoTeb?s%WEOYu?QX+yP4g##y4!n-=
zwZZ0pRGhvEEV)_JpO!=^V`|Aux)<U_h_mu&f0lM!>UYq@9VH2Hy7lFgG;9!yO3W$x
z69*(Fv!XB_%s3o?r50ErS<>;IJ$;+jQF0oQCCc5Dia1hjr<KpTDNojFHE16u(HP2L
zM5luy&K|_c-8eDyA8cAt8WoEQl=8Q<PN#A30u)gS9yICiKhus6c7^{70RVc51|bo)
zU}r%#0t~=%ft%RQM}OB<Iz8dyoaKPDpKv~iC86f9ciI{p-UHDfLW_X1zgXa<crG0W
z<u{Qb-{RH584NPuFNOdBFvTS{2OO2Kg&%Zmi3Nd%F&cf$pSQC0H?L67c(f%9EX(-B
zWC4xorH?5=IcvXZ5su5LOadA~LHR`T_#IcRh@Xf2^7F{+O(oQpDkhBQobg8)bt4iR
zz>S^Z`SdOnNtX!67BQ!J?~7HdR>{Jmxv_`baR4?s(Y3y^Oh-@&@|`g;u7^ddMlbEX
zFX&iQL?zQJqJ-9IzbnlcLv;jWKZ`hw`*>=3#K6sUoEad;Sw9MJY0qYMZDXM<TO{&W
z(N+NJMT2-`o^iSQyrjh-cTp%cHkq=XlRm<|4*qBI&r{>JHBFowM8nj4kvTG;V*?TQ
z@kJA^b}S?<%L7P>j=6Q7Z4`c;)zK(>2;z4|Fg(qXnsUJ!L@|Y3!<+tD6eIQMxCsZS
zb}*sgTk9ydy*)ks;nRmhB-(Q5iq62|GPmjo?<XYyU&1T9>4-{*v`*fNe98vFcu`E9
zRu(U%zlTQJzaBGOv&DCzI<`)6x=oFJ0ONf+4@tefyTbvQqbq_gqM<#0Z-EZPXT!8i
z>GK>j(0sr~SqLMmfPqI+`Q^Kuj3NoE5GI_ZwSCbt(fRO|gW-w|23J#_OCe6xUmN-p
zcaGhT0T?CHE>X^CBF0L}&?KmvuCN3&=D1<78ByO|q*LAO23AnqE*x<kW+V?OFVE(v
zK5xG7$~4L@^B(mOb^{lBD|wt$0dw@js_SC;q=_M56}|k785N+n+i?8ZlX7?QZ=m@T
zjyIETK5#0(N!9$dmx|tt)y$0|{yJu`ijLmWhd~PXsL&pHrJ9mcN1Ow#dBh4)FrK*L
zi%=kB-rCyQPE!`KmK_fMv~V{(@d^2W=10QkM~xPGZZ^#n=r449<v2ju$1C#NNkX_H
zjKwJh4-r|V(~uy=&G;np<KLS0v;~Ga(X0ux=4Gr_%$YMj4I81&AY`0PW?u@(6dp$t
z&mw5cmqZyRCY=<wK7`N6**XeupLAhuYC}`kKqB%z5o_5XW&8eM{ghkVh(y272>3T<
za$9r`X{N<4<2~WC>8%>feK2CJIJQzBv^SD(a_i2WcH<XrB%>#ulF3)$afnq83KTIB
z%&DAPkd}juk%N!vnBRxUwWTz-lt_X{yU`3Cfi1^SG;ylp_S$PGtrjInoyVNP7?<pg
z8-HyS0CAEZMaiusYn-O;{G#HyWyilEh>|oP<&M;_cIvtbtk+_9`*O=po5=O~r*O`;
zQ{MjebOZ~NfR?y<mp<^#Y-n^u;}ds<S5R&^ek(~>D@z2Wp*p)Xu8e?rL@Nugj;1_g
zPwCUcp%q<^O1){*CNYA<@q2WTiXQq(QO#@=;N;*t^`+ui?&ppcgNA#+W##n$j|nJj
z(rZbW8O;oA%jgK++v~>rN#4ItoS3EKZZF`QZ}<5T9MoZIRy-I!CKOw3TECK_xAHal
zme40efch%1u=}XE?9MKlwX<a<x7)qwYyrb^5LixS4XF?fSA|V|oO44__C&*>i*&2|
zyhon;<V_<aw8fc>&^%nH|32Jo{2n=`G`%2%3Pt_UZDR5E7gdRXgkW4m!+AES>I71I
z5#;dMijy}|Q&Di2;-6b(o-<s*b<+r*S+z5mCaLt5g2KR+4Jt|hubAlvep7CoM}8`>
zFu;X6kdrYJKOpCUwPss?MwUua2M|;swdtF793gfIg7xf)t>E28g;7Z>YU=nMNSx9!
z8d6?M=Z0}=!RsYMpyHwn?>O|5L?AqeEU|tbf_BmA<>uwhaz!j3wuF8^gbKsCRXg=X
z=!<jP#SxPCbsIM&WStabet)^0Z{0zE70y2hAmPFK?b2tAz`yy^N=h>;Fh>DB$az|+
z(of9TF0CD0bO$QSg=&$gV1&#OmmpS!(dZo@dI-+gozl1UNF1b#(g+VS!W@JDY{`%$
zS~VQSj}23kO&*aW$;TKBsiNE%32~Q%W3mezT3cmJc0pH*0Xq*>bSrE`4UF{gQGGjC
z>kQ-YzW+GIoJnv3VRo-Xu89Tp)u~4?(ayN)>8?M-D;NSF+yaMOwQ=$d)c>rwEzJJy
z-)ktI0|bWSzip`j1&tYlW}GuOdC6$S^9bl|2nmup5FklP_F$2Z#9LKRTyKKqCR4-K
zXk&9KbR}Se;p$AdrA&$lw*48#Z1+O@FXhL@Z&!+30IU4O<UYlOB4$<tSo;(4zQRPy
zcL_cR#`pF8_O+t{#u5x<@jFY;3sIb^+_!DBBent{pXzE$wl?R#t9g?j`F=~-Y~e0;
zwxr7GDFsZT0oBhu=UFO(+xXfQ%wN%R=m;^e${NA2)ZF+ssij=j1|z&r30XT82JDhR
z1-?3*j*fd{&#c|s_TOXD8`H2DROPUKFzXLqu#3b(7UZ@o7&c;`@pNq5g;r70Py&kw
z61O(*`|Zn#vvdwpp^!^~!6)LGCg0%%9hC=?|BHD+1|xYHc=R6PS*2r~t<g_oJ><{H
z4S|dHMfZUa&@AB#7=KZBtEGDqS*bj2Z(}pF$443`lFg_-D{gV)a8%-jhN<5?z7Pt8
z7MJrE^yr?TpT@OWbs@CB7tIjnA`ES}C3cGD*`!V!NKU?^KK|7LX^@Gjk@EQf;!X=t
zLvMY7@1Qffno`{~k2!*exFJ%nsSd=Tg^C)5JynLAAO07*@hqkm+xg1+>JL7_Jq+cx
zn5%OD7P$P586w!*oT#{klL1KQ6g9e&u<aETdL`e#L_e0<Wt1p(HldzplEF{l%G?i;
z=opQ4KwedsuY6O!WsC}jMQR0ANz}`T4w)0-wvyIr0niF6YtAye)kls=jsf@rT^YRf
zRU#536&db9*|5(aKOra27s64O%DBbuCUy9>Jem}|%q{~l=)@y^K2WVVYX1568#nI&
z`e_M?K(n``^y#`a$PD7AG~76O$v-+0LVFY(Ezow!3Ii8G@@z#ap}C8<7<B{jXDtaB
z8e)-p5PbSHt?w51P!7)9w;fc7VuYa^!9#jBk&v6rIGGj4?g_n)cx=`5O)R|Y-Txk;
z-gEIE=zl%+=?dXSqiyr!e1P<;P!-oP$9Bzf9jG*0${R$R(J=Rz=8JR&^S|)<*%tMi
zR&H43yRx$8BU}CqA9H5aoNzJ&2}s?R@Ptdq)S}D`@+!HdHT%*_l$GEA`1RBZ%Chs0
zzh3Ql`}cpOfQnvA*gIQn9B6*1ECbyhAac_tV_kwluMmU|r=P+HNKT5C5xLy?y~gFK
zh9q*p3pNDFz+DlX!#!kKdJQNbv&S)uzEt8}g0L@T0&#BtuQ6IKy_C@0v{N}V<hns`
zMX^SOAbJ*+nu~oH-^GD=ccv6nO1QMV$7b~vx8yVD&WWyC9Y~b`RSF9YhLE|t*7;8w
zlhAX)6SpE4Zjdb72Y0OZSY2HT$*&Dndf$CW5n)pP-^DeIh-%+0YyO|*b0lIpWfp-|
z{r-#9vN|+drCY&m-!(qja{`v^n5-w4xyJwcINHb-f%kil*jicfbg&H{lT!K_z{y~6
z#V0OH*6QYe*UawKp^CV}B+Vh*@}JW@gS2Kq(uFO#prXSm4D)QD-NyP_Df32tQF2a}
ztzFBXBmZ%7rC9s?Wn?L(JzBK5!gu*Uo9|fM!Id7OqJK1POySTIgt=f9s&U5EF840}
z_LUx(zB<(vy+}jbAkQL`GRmErb@-3j?wyaxQwLV8qUEC#PY3@i61P*oJ)XVyf`R2$
zaM7&4h)_6f@(gk^2kTXH$)daT>Yd|{i#YQ6#J!WVY))LgWiM?_+#^?G#vPAeCj+$^
zHx&$pe?AmJ&a4h3soSGd-7U^;-s89yFIJHUw7Gcn5*@+fJ<zobbGd!UxaAA2Wyxp2
zhTs|NXdbqE@+P+W`)m2G$qm)|)iym^cpwx2T1tLIOM@1B#*!Y2o71DUS!cePRa8vd
zb$`QokI5>p+~HG{qV%5dIR|s{{pC3bFqJ-d|3lW9z~!8GZTvP1Gou;LGm|9@W)u<=
zSwqWMyCaf_RF;G!Tcu5E%ws%=xXa##WJ`#$L|MjCQj|(1%!CSMX`@u{_uM>VdHcMd
zdFL^?@Bi}qo!>dvb)D;s6E89e`*0It#RgR-D;t#GK;vSqgsWr2(ydtN$GrWd^EB_L
z^V;ZE`<pa}2<|;NYRu>{V{S3YL-srwh(H!8)pf8ThYtI~ljMDWjc(Bx7e=BFnwrB`
z=RKn+i_hzmSH$6v(?DO*)kj~s1qFu6(RP3QukS;2Ko0}>S!uRYX(OqYY>O;q0Fi&n
z3^1?l^ml4I!@#$t&Jd>>5|g*L_I%&&_oa|&t6mzzpvmJCQ2Ux)xZ)^<0(yg-!+;o3
zzUz6xx6>F|kud+inpzaL)3es&ovQo2B47$dQ4~4fjvv+X-21(UefX&Z-AgurHpi5K
z!fkEyZ9q&lE{Ss~j=Eb$27vDDgdV!{bOVHeHtZR7%S0|Un?S^Xj$uR-bzoE|yvBTx
ziEj83g~aqMOi1Jc^3LOYiVa%4NHg$018=&h{=iNYDD*5!tK#*e9MjG`jV>eA7wKj9
z(Z_+DEy((XC9fnHC^{Vj|Bitu-Ve^5<A+PwCF_%mW+-b8_=`0$9Ks~9xotbukFmqz
z_uE4;>o+Lj9SLLxZ$GFHx}=HtUrUXZ<48rFOCOqa*@N6=`z#~%=V<6Y(M0_|^8R?g
z;RA+<%8VTI5Vn^4MbHgaMea>HkDqo`U-BhUD3pc|fK^4n$b7s)YfO~{-vJ1aUqgFR
zCE_~HO#_Y2S%+nage(Ed(TPW<|0=)9JG}*7#X~zkCL>}q&~|r}Xuv;=IEtr1s3*$_
z%;_3bxU|oJw-RGFlpTE_bh-<D{^HYw08e^}#J95Jvgg0?5-e5=3`eJ23=e&e;36<C
zVoLy?5eb|%J0)YPg9pNTX@(MCA9=chG>G6>m<}4r#sk7liA$$A=m#MTmU;~VH4mV>
z!<m!(;a*_;sU-f_RUVv4l_*d4313iI83M#Y)Y&Vz2j$O{xTcC((u~CMJa0nbMPM?q
zMnokR#VI}HR+339VMba?iDhZazBc_sTKw_D5G@`?T)%+EN)-w8nga*a;juqSqx97h
z*rYJ|MzdQaw9{NF&O!8#rzZzAF62=?&e|J9Qt&f}u{>vfIftzu&jq_@h?$!oRz9c{
z!lXbU{65Txm_|w<2x;HNe-95c@i#84m|a|a82y0@h?;b{WBuNlUqx{x&6$sB)W+Da
zVATR!)=q$3`r94NKd<g)wahj%02dfDC7DjMKeQ8*8I0-DXE}v^{|utoGu>Dzo*mhQ
ze-O>=T>$8EzWVmt_$MB3_^%_W-pjV10P?s@^J^eV_b9_>ZW%^%e#}L_cd?g75iL5s
z@yD1DdXL^b@zk(iyw?UHS#SCkcf(jMJ)6UwY0+S(T%1Hv8ug^kzfpvJIfhBRa`&K`
znwk@<izJ~uN^?<?i#<XSolDDk27RS<sb_IvFOA(C;5qS_RLw*M)s|CYbk&@{mfxc$
z?>e{`24q6}0p2ek%(}9Sc46K1r-U<gw5l^oW>V4tbx7@BPn|tC_cfyRAdLY7PFQDH
zPztQ$dfHG{CKx6$K{zpoIi>+GFgU$k)kpkW+fZOzEDh(u?;Tj)@T{KPGLUs2eflz}
zSJ|4u2e;NYH0f0H8lpx6cd{MH7VPR1-uK2nX4Q@4{Bpev1mA{5rgS@7=X-i@xKDSz
zX-(s9liy@5i-5uLk8=!e7-IBF+(l+x8FG6L$+h$40g$abdZ!#MKI_8^{{y6PjWEFT
zj$L88@wSb?C7N}KG=gGx=zKXsOPMhdK=B27iRjw1q6(v@C~)i6t$mB{(f)OlsRK~^
zQLp>lN124o=<<Kpu05t#{j~OtMD8)$`=(swLbg{hD6uUmQi7htgI~`0Lr_hw-C+I?
z399fm#SlZVt8P^OUw)%`16EjCsL21Gz%q%8Co+iHXNxMSW-EbxqrFd*a$5%o<1j57
zsj38jqD8Yi(Tt-~9M$>NTU?(q6aTErApbYRc%$QLT(<Ue*^K?<K_2*H_KN5R39{0a
z&Hge|X<WP^1tXb6lyAdvdFMXQ&emwQYSo3-2<fjU%pv}_roLyM*g@b1M$ykxMI|@X
zj;5Hs!Q~zsvN`lPT1n|3l;%z8Dn{zS)s5}wSGkc!3vDGJ2u3;+Hct#PyGBQQ6#XCv
zX<3j8TUZ-%Rq|J{6+zb?qAe<L6dBpn+)K1$@5n%RsrisbYAY$Ov*6lyvDCVwmZm#B
z7j2xxe(|M)T^>`@mQrjYVOpqvr8x14&nl130$qz>nt+1)SvI3pGKojHbt>i2HMU}L
zC-@iZ#69TCfY9mVJc2?jt3D4}!K~?_3zO>;8i3Cp^KDXqBia!Ubs)CUD&Zg?QcBs0
z!YT?B%_%49t2M|lc-+ftOS^8u%UUgJ+v4D(JIr@gAbEk2Po=1~#Tks{t_O+CumWi`
zq}gYFS&hKQgi+^m-yL|$aXYEKG&c2|I*B)%3qmOVr5HBO;}!Xo#pg<U7}bLG<VoNa
zt~zW$qOreJ6+DsJ+~kHm1RViA0D<F|m!D<|n<OrR)MuVGvPCi2mlhS@aeat)X|on&
zits}HV|w{v^x$~L{TbS~RA_Ye+hkmBW6mh7-Uw^MXC_leiVIebp`aG?G$dEA4#UO%
zfFD*FPGxm)hfxiGYZRDwlt{#ausXBjDZyI<U7bs#s9g;GeT=-iKG5njJL<N6moAq}
za9xUD2(htl-@bj;#p?qDkH2<^ws!ux9~-s5+kA?aLN{v32}X`^Un6NGv3j#jY3n=-
z_+87kPtO*UYM;}CN7@|d|E8#;ZuK;5kgh-3H!V47BS-Db#=y`<sH^<hW6U=hcL=$A
zucC5Ntqkh2XcSiS<Yn4Rzvb$7Bhl#Zn0cTQL)ZCkbIL{V?{(q{`J5KG>5=L~gthI5
z4V_m;)Onu6D`)Rfph(8zUSLp0FYxF<4vHytR@8x723CcgbxIG_o)62fkZD)DIc{>E
zxvu_R=V|`4c7MzWHS*1|9pOlpw!hl%d2!c+-TkyTxd(+yh&s4LyHM{t{D$PF*v&iU
zH&UTpL|U2e&k?!wQpmqsTt+9Wdy&@guQW8q9Bo9V>Dp&DW-LpuPeUnh-Pna$H8-P&
z7Jv?iKR%OJ^;#&moFuZE%dG9qGw($e{|(<^;j!R<F>BWc`KE&bMZQ>nTq+(N(rqUV
z1()+3h@i~-P|d#w2r8<n98i<96YPDVt;m#=bN;NsytRaU$_BzV*zYb`0!da}0tWa|
z;BK?#9y*SoY#ejZl`p&j&-`dimRv(yI!cCZvRLxFpl&3Kx(#V(UZ&~+8pS906Qkw_
zl_HA}Pe}G)ux1@y&?CjTA*KvIwFKBIz4~?<Vq52m?X=X1T{<q;It9tJ#cap37WbTm
zHEsWxu@BoGcBo35(_f4cmUwtrB>0P+E4_9ub!5=D@b;Kd<0DZ99smfnhgi3gsHl)Y
z)2f#ZW;dkH_sdStu8mPgi=vye0Yu;wUFyJ>#URn%u^8Hg7jA?BzC`e)E4LC3e-cGq
zeC;qRvC<@AJtY29vN!3eR3?aNzsU1kd`TLnv|IFzsKljhYGrE&v!fZ(OA!#DaPh_3
z8(3<rTRC87bnSqhFnd053o&JwRAUDDQ(h7;S{i<BX|RpQdwsIYo(&J^g&sksax|gY
zw{4?JN=o|Xf{q4wf55q39VWQT><ISeekPmcRCBth$~@`C-BAstHX)3OyCW#Tj&@Fc
zVC1#aZiz?={N}g$gHf|v|BAPA%rNXh+AmHfu7~YT8BCp-vH<sWc*EgqG6pjs3Yspb
znl98OKlhO41j7E9_zaq+p!j0S;P!eXUNp!iNW1>>RR`q6-?Zwn_3>6x(bCmj!;ko`
zgQUImXy`?hSGVo1ZVN?fA@qArHHj54;M`mvC=VO*)PA^_%!*jZylvLYoLoD_hKZC8
zfJcYXuuKV(3VYQ0_D5ZUHf(<INL16HW(@T}merzy8G_?*oXV+2Ym(Xi2dZ0(CGCxu
zc_m!htc5eB*$YEDsaC00G2)>m3{}*?N9WHzBn{U@t@FB5Sua#G))6Nr_vr73Sy$$m
z*;U{-WTKgNb>|;c0oO~DLrV3v*o<jp!3oAoJv_2mx#9EtQAq4C7W->#@OL~KL<H_N
z?&XP<{~*|=v2na-_bPu3ian4zDl=Jys<I-m)U0sagK2AP$aplgE^7Ik7o+NFUo2fS
z;<V?C@wTmdaC+oK`<XPvyd)ug0t0v=v5~CP*)KHHFs)3S$k2gC<rklCUJD8Vb#39Y
zDfnq7#A@F<+7YDrQXnDi!h>y(B4F5&_`Qj_aCcb=#qHtwWqH@~$z`@h_wePmNaXZd
zy3Pfm)!6VMx=K>{7=)|yJU_clAv0lFIt3<kJ&(GCKmDE#O=QM-?JHisOs&kwgSzZ+
z-~ufMm8dD4=G}$&p@Mj|vxr~1e(RYljd=je)i+*{l^@`RX@%RUHL)ZfI~=87jw_9R
zWJHn7;XKbRjef2!zxC=&x-zHU+g~(hRxxWFg9y{Jet05ch}kbE&LbH@6VY*ygAjko
z&$O5hTE6NRv-E4p9Yw%Tgsb8W*+wAV*(zKrhrH9gvZ!*K`d7(~btY23u*h=`bR^1U
zvJ~UojUEKDE`@N-MDyNNDJnmfyS~KgOTEPw#;584QlD%ZG(x}0UZDITAUN1vKaam`
zUDvfR3k1{fV)kccVwJn&tgks6sx!qSdOI7{ahx2PI50SG)~t)Yo;oB8msJLUEBz%c
zs;;fim?k#=7IDk)DtLNe(yh}TE<38l?FSS?|N3HCR?Xf+X^Nmu7_mJmB2&;QxI$$r
zA}>-&7=seA;F1K4{07xoOx<Bo3*Ww@WoU@zEzWt=$dR-2YPDY6H)NFDX^o3!RV!ld
zi07emwfB+kf!b6DdQ*dqrGBdUQu?71y7-x8(jC(wB`Qc>*x;c!Y6$qabloAmW9;kE
zZMy#Q*pr?g{%%>@smQB#RD+w8nb$I$H{IGJGW1`x7}2_$-`)jl-MFv+Roa`CwCX?>
z+_jA7nNYESfeZThTMxV9iFE=3!9f=*crsNxD>^A7Fo-*_dT@0~-m&6gp8aL+Nguya
zFQ|;1iIx4Y2%!r9&bm8y?%b?rtvZ4}3&HAjmr)+GGG1^p%=i3k(h}}<>0Ok%bBV*U
z0!OZI?_4opa%y67N9K9tqLy^8ul0EHg0jj5!NpSh_F>}%<{-`Nhf$WxrXcYbrLbLU
zV==VcVCJ7+xn!<#Kb(I>4(XYV1a)&&ZOyZV$4;d6q)-i>J#=bjs-xh+@&3rp=I0w8
z-8=5IjdC?~EuH^|$?KG-TTc}|S>67SfrAHMLkv{w*`uq8zs^}hQ+TeYZEXMrAj(8D
zzCl~S4pjdcnHd4BhpU6WoBfYCPobz2(-F~FP~Zv?D<wBA1e8Vnr?t3SH5lDe-l_KO
z+e`ECZM%R3>ONs%QIH7MB(5+tEzBNN9`*H)Da!4ds=T!(6PEBiaF8Fxdz}ENhj8~N
z;)b#NclR*leX<oek+v$Q(L?f5!kdQ#hu4HGcZgd9vsCCu>9{b<n3HwVs$e~FJdOT-
zs8rOk-7L>|El#YJi-mGZNMMTC)1-a+rG=-04^G+;N|gzWc_}tYvB6&0AC50(N8QoZ
z`7+~wS^%FkI*frKHlH}*nQ-_Ss0&EqZR_tDQ#?r7(c2m{+amJ;>IbVkgXst}Lxw;z
z@=fA_TlYWRkIo4SKmN@~TSrw=&79ouhF|keCEq~?L$a|4h7H~4OvdB*m$?0YY35Py
zFvHic^_)jssI>O>SrpWo%7?9=ZzRTthHG!x@&D960YmN+VcJ4-w(2pt)5Tty{rbl@
zaOt8(mZh^J(fc0IwVWr|zkm&kQt}A}!$fp4ecZSFef+>OvzlL4#9d*gLB)-*E5Rf;
z6N#GFdtdC;mq~#ZjDuIl-a}DE72Dye_U3hUUh?ab!+l7<W1syc=11f^QTbg<3nM1b
zAv7av`STTpqf&nN4BrwGYpz!vo)P64Y7DOOhh8qR=Thu))OK3iO}e)1W_f?7c`IBi
zQYz1I&Hg2xdV1ZzoVed=1c<C}$;q=RCb}=4KWFd6?2GBc-LM+2J(@iG*}h+tl#R<v
zE)>SR+D3)v5Lf(jX;^udup%kxQxSZQx$bgwvMTHZCL7zf+o&8kd*QQhJ(}$BroJ1f
z8Ka6upM8FzxtNf=D@0}3knoP8XFx;i<Wv8ELEi`Q_7h)~s>!5URp^rT^O1&2dJl=a
zOPMU0uCk`OM4m5sSBJ*~=-g3Su@R1y&FOK>AntI+fi*hs1hJV<rL!NeG~;aJyraoO
z=&!_INvOX9qZ&VX-=!T6b2H#;Hm4momzTYW``ZZePBA6u17y;!9Oii*nF@V&C`d^D
zD9Xo>rfocNqGp)q(}jRl7V!-1MDK$DNE%dWM*W%o$vcmBEx6e0IMuHB9ue&`fxK^V
zX2*ChHWWx$tX1Rfk<~NbY9#A?5`|UA(+UqlO8;!*m%E}oXQzMr)S`T2hq0|F_R}6O
zTC%d4-XgQo#I2ckpDmsgo`{qzu@;5loV@LMdteb{0?X!%0={GZ>%mhvP3NC#lZ8ph
z)b!uA&sL-l&NEm@2sq|+7Y)Z8<%Qu^vKVNPI6675%y+=dKmR*_TW-v~AW)9Sc&`d-
zAkGKU`cLW43&|400um_qv!nKO!G0ca=@x$!gSk#<XeD|!8P2lQ#%*Xh0}S-VqUv~;
z=4oMrYj#EjNySV;4O6$eldn@%!%JibGCHTqm&Um$(TI`u3w22T-qC!L8>*zi8};-|
z{I?ywf|}Sg{o3|%2j6W&Vq*ViXDZfPT(|<U5D&`1@imtGqJ_?~S=QWlUnl~q6{Q%|
zlD<wvm$}0j!qtl8pfaNIP1n&mX}UGf9{5e;y&NZHlFoFF)`x#zvje}#j}FTlTvj|u
zIn5@sd%r0^mD(Tr{IkzywHf(G(%8^}8y~)CVmfva&R-Nb$Ehr(&mr?{&31uLSguoE
z)|8P*&ASo#o6H15e8F8wuPf={(PFPJa5$Ty$5AW1FTr_-F~cF${ODWTqDSukH5*}D
zca6thRho5W`|}w`CnM*I61amLfksA~&DLF2agQo&GL0N8EiM13@zOf<FW~urr=vco
z4S#7x6k`h4-TdW$ukH%c`saN6Ec`DJX5FMuKziDG+Ml*YK@)Ghu+1N0G6?O~d1{ea
z3O$jTXQu0xj;MgcN^thC&YNTBzU@13?WOkJ5~x1<7MuI*w4b-&?iuGSYM@1@9mk~r
zl^mNNn}&)#!I{y{w~eYv>y}pe^5ygMgOH>tj*Q}{lX)#H_H^5yrXz}^fY~e_ozU*l
zzUe|+ffbXlCa7g3HRW$i(9*GG#{n>1%gj}{vHQ{<U1+2#yFJ8&dQh1O78n<^B5ijx
zIq1?deTu?}M|M*P_L!_tfy{FCe0qC4FkOOgL*Ui4V*P3_R#8wh<B9()rUx_8e8-*I
zmxxkF)Y?a@n3X9$^6D>sl6#^9(<VXj4le1jK8<W^OV#3?LswcKoYeD&-QH*Hlb08k
zC*j3UV$I=_Qa#$|#6$F=eT&EXoR8RsuS+*lICbH<)TZu8&9X`REYCk0dxMkS5~`^7
zvo8ZinN6BhA9?ypQL#8FA_$2oLw1JNS*hy3@UA%KQOV4OftbDW)#Q}adZ)|-J&E8M
zUemQU+|8IKxtntYs>H-$uz>+ze^@bILR9UVUp;Bik>Z)<)Mq0QAp+uA96JmaF2bE1
zjwsBI>a{wC`Yg}RG-6Znze+t3CM6!6fb=Ujy8iykg)KNKPblA<g!bF+7BR#-Gq%l=
z6;Ka=hmNjYaj%C~S-Q>$fYiPoq#R1BvkT><EfCnT!I$auy7tz31@t4{w>>=kw*9pm
z?L1nH8a3*G?~pEHwd>a&r!+tRGt;Velis(OIK#C1h^L8^Iy<6ZC0ejduSbs?Y3yH!
z2kqvkuik9xtE*GmVAv1UG^mDHkid79bs$j+2>oaq@VI~4u@q9ozKEqdy{0WI<ix>g
zFdp;sf$L8Ae%l8&CL?-KD$mjCY(Pa)W$3U`qpoON+MMn7K;-#|t6aC&i%gkrt;q#I
zWC_YZuMOb6*PpF1@*H?K&S`Z1`A3mV87Vl<fFhg3%X{|zolbyrmC3G4gY1vb-KZ_5
zrr;pQ<pF%D#RIVN2;zTR78dnJRqwk=cf#fTaOUC&(nTmz##wyxYqgm9vKu$gPUc&?
zBcI7HXI@3WyeCP4%NoSI(mq1J{90<pGyx9wS06=xSD55i5c@h8%@jwqxk{*@+%~NB
zt-pVGti{l0*T40*?BFSa&U&N~jf5si%!L}k5PL`FQ-;eq?^p^q_SE6x{1#oBv!V2^
z=eENVglow+N~T4Tupd(f7PzWbgTY<4x<8$IHUk4d*Zh?0<7r!>Z+S=a#Fx8Zy0otw
z14o@W<@i!+%(7|J4o`}(o9vvO*mL3SBj+Fel5y%+4yTFuHK<i9=Er8HfDU9;CEjJw
z$Bu|F;N{HN=GwGl9VcokGck6)V1KR2khrHjOIwd1GI%SV<=-Nbu=FOniR&ufbQ6J^
zC&XC(5hLF37%}Y0ztT&CPKb;7OWoaXF{q^4GC4|lnM8J+TYaT=ra5{{7m)y6!F)re
zer|W`is&9~Pp~$4l{3huv)BNCjLcjTZVYF#$puKzfn6rA@l>s&!5R%$=Gk;|7g*6M
z%?S9K*e=XHbhU|@Jf1K(wr0?5Es`ROI=VO}pv;W(qy4BKB?q=kI`%Fhh$(9amVRlg
zIu6sL6!$17_18Bls5Xu}cMU5Nn(3QX;-FzO|I}){I7Qe6d9(NS91d)x$_eoVC_hMn
zZrR=)V~j<Vp{bvgNJ%I<GgNZXeO^A@Wa7VE2Jhu81r`S<baWf=xEWZnEv@C_Y*w~`
zb-={+{PyXZ)ORRRbg>7y)Eirn6AczGs;kFV)iqTFt*D!5B0a}sbIRIj*+Ei^+ZJA=
z#!tI3V+G2VeCv?JY0ALv9Stdf#?z1@b^U~=Bqq;Fh#)!D9Myn8jh4x*KU@DA9cMB-
z2>k_XDp34ba+WPW?U>GDmu;jp6)8SkL9u4>STnO5(gU~Jg;HxGJ%w3yh<_Cue*Z8M
zKIB8ADHkUfh^Zrma}3Y0+mIo4!qm_x1IKz0{>q&8(&PIF%?RS7_H)$HfwWn$0W1_s
zC>Z58g-_+mUC8T5XLj|Ne58|o>m57F$n&#>Idttv3o*eY2if{=Q20ygxwv-HTtvE2
zNmje3uKm)I8|Q1iD7I=5ZF$f@Ao&VJ^9hGp)fjwr9h@}x<}QpaV&jd#H{~Rfq7k4&
zTxvlY24ow)TtPxB<7(9;0;jDD^on%#Q&4~a@1g|W$6csJRYFjR1^*#WyKzbur8U)=
zOQL^cgSvkK$94fP$cD42TSS(|fN5u7>b%-PeRdI$)#ARhbU~7WtEbhFiQsK{kS{L<
zd76YBoO{vbZ+EP1QYIV!okOsy-2Q3_s2*F_N)AY<B6oV|q=l2~%0~2m5KXYCwLy;~
z=BM{o3`r%2JwP{zwEj@is>&zY?LOVCgEe76Fs4Ua_wJn&e)r(R5!~to5@teGJ24Iv
zffL-gSUCtKCo=v1uG{1NK^rIB8Agu$1SOmbsY2ClHe(gCN*HPT({9D<F<$U^^J+1$
zsZw;H2}7sZ#?@Ac6LQRe<}}mT;Gj^+8xcT2xEW9o56n!}h5-mN4?$SiW}eOHRa!WX
z9PHc|YQ=@*7N#BO$?iQ&^;dE6oYY#1djst~uW2El`(qzS{=~}~48oax2+Ho-KEl%W
zBEdqJu?a7;hPFJd5GEsCi&JqKPR=9w>rdAxRRFZ#Is)74hX24w-M+>A&FJ_>A!}_t
zX?)n$uv~hXA+XHcL@24zw(nIh;QI+r-CjOOS@kFx_?XMt^7szRvp*Yo)675;^L|49
zggFA)^mhaRnw@Ld9?iT*8pMNZ?pA`!oD#bTL$@xM3}L)x-r;~L;SaYz88GkG)q$Sn
zp@mNx(EeYhpKIgeP!Rype?7sdne>@>q*peCGskj3;k}?J^yiKj3k!|1Ux>V&Sg}w9
zK$nc^xiDM2jwPW*tUjifLrFHLw#GsA*Ry8}YibM<w+RcqFb{95@v(M>PwpKf7i$ZQ
zHY)m&yN5?VZ7ms!+-k(v$6%ncAJs1q@+GLA{<fz+E&LWW+5sFUUhXk9A#G5Xv%Zz=
z%B)yyoKDVws`}!~FY~e+&l^qh*|Rj3=91~5q&HuE^_7`NeO=uf;h1x$LLQEfGTYd9
z+5}47K%`J@kRhiSH?k<E7N>#ewP3zlUrMDJ2+E=j53|Z%AZu;M;dA|Y%@>dE*NZ?d
zb5&`g(Z6M3sDH-DJ`u0Qw+1D*K;6ZbSZM|0A=90xB8SSDWkiEAzPE3&JGdJJb#}79
zzeEEubC4vNSeRxzF_Lbv?(|2Hk?kg55KBJq0~fCRE~k>{OJtNOQ0HELTP$rb#z6FL
zMWhrPAQ_<v^m#KxcS2@AfB8cCto_)M(y|1aD*WU=z5rLkIAKvCtr^tS^O7Ow+&-Yq
zQn2cqM0-(#qN+Kt=&uXv`BhZwBF@}q)H3=Sy__PP;uLs{5aI@>lsgzXIVWaxqewh%
z;&vTF>NELS?1CV2lTywCF*=lvg0O*$hk_wZl6Ehm34DXNl$6H)t;)K)til3y(ZqYp
zD%MGpCi`BO#8bd7Ku}S5o)w0QE<WP8!SRute(^IdHWfWMN>a&lK&f>^ZixDH0<53J
zK0zvIzH$w(Kie!IW>NBEsWN%QO+*y8KPjT<MJ9%scnVD5tO#*G{P>F{4$}NUsV}&`
zbkjk&h*ycyijTa7`2dgjUU}*ePK+$+C-QGtCX!{um7=quWA_01oNk>unet`{!e<#A
zJTo^mul_ZO^IUj#oMV%vK@aGs8(nVv%0vXF*wjR^6T~^~alr}kgK)|{!{X`{ajBQa
z#iyO#E;RmWaDlcvoF~Md1;Eo$^ngQF)U;5{c4H<PB7$(A$GUuV#?wW#e17o^O~jDR
z(Sy{pZvOd@cmL91<og=)pZ{k1%Ll!!S1S<G4i;Tvr$UjI@{>9A{a?Cz>}7F=z7mGd
zz!0gj1LhREN@?0l*{JZcJBmYSlnrL`gk>H1ANLnmG~Q0EIC<dle}ampEw$`Fz;0-h
z>pB{_1}$p}ADj-HX?QBi=&!9oGqTbm$9CE3SbJ+s>g}lL{tt)PZrpzNp!3f&X3k#V
zyy8^A{UpWDCnGOBe7*b1V1t?4K3ny$V)@x+XVdG$?!69sR=>)3WrfkI_m|o21O$MN
za>|q`O}jQxgtSI-mPydlsjqYULSf$S9umsroUjP3Wn5$AlAZ4*>#03qsFf@A?{n?N
zcf60@Da-h`qMPdp{{}m{k}#}kx|h#u26d6(S^>fskh=22Z>240VSzEF)Ne|K;^C#R
zh}NFV*JBBGy1t?q*hZ!y4E$CRIYb)QTz5P}z-j$U$Uim-rl&Z-n-^5G&opFklfJ-K
z1xXch%@e@4T?^!T@3ctby;ypWul?r@KFkSgZFtA~1yD=Pg-c5msVnHmDCjS@SyPm*
z>LmZN&#v2h$^Vjrl#y5!Y!F>Y-hGs~FFT=|5eJww{=P;9lSJ|pUUz7&c%}3D^=oy9
zlE}FLNSbzKQS4d$;a!VX8Vq%`N;VE7Y#cvtqDVas1cDo{nTnv(1vS|3KQ!4FgP@Ny
zw3|`}!G$o>YZ2o9DG82OzQYr9w2HK>D^(@Gi3Qm3_d@0n7m~K)t_3K#wlYj8XWRAJ
z1=RsfK9`5o*kBkE5g`**+RCt{%`en1W!(ptmrI^Ct3bxe%S&;Z^6Rg^L2`6aGWaJr
zWtPFfe+Q(l=S>9EEvCs-5Dc|6RrdV-;W+gj|L{4BV}gt>e5?ECy5(9^cR{$w_(9dJ
zM9+ovthWjM%G9Yf<7x@3qgA9Fo8V8?w<@i8BH~erbqp~m>;HenT`;-%1*D5jyRgzG
zBA7hlx@zT0E$+ObEIneBG%~1fq@}#o0DHt6zuo-lpQX@Y<bnmU2<p^xd<qJl&|hUk
zm(NX>n$w?hi14BgMn;;_4<vKk=o;lenX*v?@uHhhJb<Vh&{5tinXCvfn2DNACZuSW
ziuEcjpSY}QjHLm<Ja5O44_=Wkf=9B~4dU`E?6WMB5FIki?6SI_K#jvLGz&1eh%gd1
zPZcE+8#F=cUuwy&l60@k!`fPNCEXm|b;I9p$nO8SbH`3>8v*}RI7<;(Aog19=H4~n
zJqJnFr;^DKfKXPo3d?BOr{GN1Olg5fk!n*UgWQYacfVJEt@5Gf&{5KEa+7*MmmOhg
z_~LzOZP^=zwoGcw9OrovN5(T8$Fmo8R|ITiwe$m*FL*+{9{);V`N+%LJFomfIBT!9
zgO@m^7z|`;k(qQTldR73H3HrKhFEMp^Meh(vNv)88562<$9hHTSXls_r6EP`Ph0d1
zZH<?mJii)KOr8{k-zIIDb1Di14H<KkDMu}Ss>)0f&AosY?LyIiw@Hb`aOqsh`!<kE
z4+X`fmk7P^FTTFMRP(Z=<(*n3(9^;o$KgD9!E9wPgsfWT6_u;MOE_@$0wj`*+h)D?
zAJj?_86r-=;Fyfupt2%ql2P>M_ft#%cK5-1(lc_XGoC>FhK>>&V^Q%kdS>0XUn;sy
zL>6$9tMaJp@Lw4!uDeXptpdff*taRI92~+$fQJsNolF$kE}jsOhj|W-MI9R>Wq9HT
z5y5)k;WIDS(Bb*BGL?BNutuALgB^KMrp_nPOD(V1aj)Cp!BbUzmpm|oM7Vq3TCvC<
zxvwl8WhPCg!uLFU^r)?TRa=U$y<ph+%ge`6N1zgDx|bN|^qc+pvZ*`%*;&?|01zZK
zl73yKObL=?1h<E_jPtRy;yK5n8IaUN6(v4XqKIZBz`R8Uij)2_2}rQiaR>Ws8)JLc
zCmy72olG3+otKv<Ht=lAuF84X8?dOmLr4FPiFts*eTxsbf;Lc^3`{1G2+@|wfAVxm
zk2J>eL0^LeSp$?EdOn^$Mts&bv*VWR1=N%x@d0*pN`oEOQ8fl<&+9jkIXT*PXMHAi
z2~L^xij4UNSv&7Tp=kGBQyF<ds1rO7rYBBc03IPD-_`rUKw-LANOoalC?!X$K>g)e
zFtqf>{aK3V)5S%O6_gilRdc>sM^PMe2y8x|f@dHY$Y{xT#EnyFMII$C82cz@ns(*%
za{BsxuYl!FBZNrutkiNx>A$Y42H4%AzTNMn{g>iSh8ZcXxH?oWLdWym&Rs0!fy!`t
zI;E7Eh+UaxZHoI@HWwH0_acBY;Z%ijRCl)0Mkyv<-|`d{sck>J|0lq8GK)}h0_mI=
zFIMCjTD<V*y<~w3<bdy9SQK+D>PUE{8GtJ=ebI(zrEaqpEh8VXPo(7-Hp^WO36!dn
zY*NCh_I`5oS}NLZzffoV{J~+fX(*hBwA_OdjaD)0LCkO^gsDH{zarzJ=n^v>Z82uR
zSV!bfJmg3(0nelf6XlJz5}a<gDAmb-+R25x;CE8Oj3p~7(YM}sFo>d@-3M^lbg#UH
zmLI$Y=i`_cQW#7E@YhjNc4rBV2`~aVoO$1!b8{nOqe$zWf!$8icdd<1YOL6cpbGkS
zY%_(9SUXUJh-6$_$#Pc@ZpCo|=A=#NcZx+Ocua9|`t_yb{3Q%%?+3GxVG?k}=SwNY
zr;L1m&{%ip<PuXoIVK$eAmnswXsAhk!eQGqnxRIcIRM2LhUf-oW0CMeCch%l7Xzc;
zN)?e03X^_#14HKOh;~&-EZX*FCI+-oSSHAh5LygSB$hrwr3$5+kZ7Fmc-gfHCA@}s
zEaij09J%VBxX|kY)rndQkJ_+*E3EG@T^540y84bub4B3hEnD_7c-9g?a%Fg`*viT<
zM41cr?U-hY%dbR3Dwl{cXoAE|XAVYcv#oN_@(UZ(hb3SM-?CX$cI_4tU+?-i`blHP
zIcx$V+j|GU&v*9Sii?Y@ykgQYUTUN;|NdX&|3r_Q2$oq%?Y90)(S0vBDKm4V*ntz9
z(T;Yg+|l}HqXg~8#qQjH6w_l6TjT0shpMVa-@7zF5=R{nX`JCXF0dS=*ms5{S&1;o
zg%3ho8|r!oH_K#Ts(xbJt+}wPI2b6D!+*6#3(mJHLi)Tv2^H!#A365v&bOX>2Y6jv
zg;4<G=g^7v#oxdG%=FS0>@W#1)KMjzshMCy-;{M^VmVJ~4;ilfPF@8oS0+=-=!)j+
z5>KB#fCf{jYB7eO7S#&jhLa?vp&A1P68nDq;9Mg}CdBsyJ&sKKX&Oz<^HB5s!Vehi
zT6lUQC8^Yz-`Gbj-hpHS`6;7enu{AZK7|Y!xwGd7#qyPcgX!swCW}Jk5F(bSv}Kf_
z@bSFv!H`!{Rk5nZxb=JhQ%H<ajgA-dqNLi#dM!N+na3O<efZoF+h81@N=#S5@HBC5
z61hx7gJr%?5pRj5v-nz(Vw`k|C3C~|Fow-0ctO)BQnHp`d~o@@#84X4HL0P-ztIN&
z#9o8GQ&<n8$w(9s4NsCAM)PO~@P{Go<X{Fm6ytF$N^(jDDlAL!QW%bk^ybOMMba3*
zyNHASWzhR|c1}MJ=O%%S*&T9OjH?_u>a&1v_VRWlZ{gYRAxt14zf)qSu#6?_h&&Hm
zFh9;|tt1^zPEJ}R9o^eY?+(=kV3t_vBDJ}>k!^oz==*JdKUbbC`s+<hR*i}2!C%!K
zJh*^##&Q9;O)7~cXTSsyC~4>CF=ohr`U}HO8IH!KB|JOI@jdd2!GE&nO<cM(0kmRP
zohQl}iIs4pBd>gJIg$pVf(AZkG*5IQafNlFoq{tz3vUujb4-_DPBxcCGszm8SiUd6
zBotV_z+g@7hKgw;FkG}qP!fVVs+)H59{KqA%%g&n96)1?M&udsbP))j7^rzFnX;-a
z%bL#@9|f$dQ@_S?>Pyr2_b*>z`dTWT<H!4g1I1V#Eh3dNx3I_-HOS&`VJXE_N>sip
z|5S8u%Zxsb%ar;3W!V$yN2DKgID1+;w0I}tqQ^4Z6ROFsobbV}l&=Y~r{Y@~etYqp
zQ$;{0H4~qnY|RT2D-)0m|K44iT5iLg{4t4WNZ$M$JWHlL$moW^QZ(;ff|cOEljJxx
z!<8WQ>-Wd0e6}LcBk}h5TR#l$kZ^mkKEfvfsfG?4w${&;KqvhNB%Fk0GL_WS)b2|h
z0E*u^(&LaazPh^l?KRK%<eJ+Q3yQ@986CyvGE7$@8wIz`yy7n`Phns$Jr}GzaausZ
zNsp!gK-kQ0Ntq>``lLtWQ^mzmYuLxnJYX?U<4z(#_ntiu-AaEgGvNSZ_bfG5yc);S
z;zIdY&LbNKhr9M}()5foo0tYM%2dp9HRLg9d~krtQwTUp1sYA=u+utSyr$9UhoqiT
z++WT#YoP&Tk6<@N&9OGLHMXUIQQO9JiARymt*M_tn}GK_Q9e=O$W1;0$IZoPck-^{
zZGD|pxF3zKY)_`b?0R}>IvvZ4VPia@XJkaW*qotil4on|S9|r+r9B~HKWg>CmXl8n
zcp<ply@u)&ItdLj&Wm(z(VS(9hvuX%^i!wJK9=<QQM;BNDRaQMW$-80DJ7X8)SYxO
zCjIHtH<d9jUS^-lG^?YBjVOZ!t3<2ZqdLAx<Z<K(ze?r;2_ob<|9Yq2mGkF+B~i3u
z2TH((T>o6fwa)yY9*}D_%{wA>O@|>TC+9+bpWxcWaNon8qKV59tp`OATX`(KDSH1~
zT0aM;M3K}ogl50`39R~mpL&yeU*<7hikrkD5_hx%gttp^9TXuWFm@#^3eKCq8SNS%
z@S0E?S|}x?pqCVLBYK_94S5b;icg+H$dA;(ldzoV;1h;Qvs4HUq;kEZJ_~r+Sx4v6
zBt;$v6?w5oZhJ5hBgVFhJvVv@$sp7ZpzhQ7kT*S^MxyNQB1uB5?SJ2c)AfKPo2Y9p
zZT(J>8i8TIz^Aj1O@W=uf;v|(ux>mA!U6;3ucfEQs-J^z<og<2p6<EHC$4pX1eUJQ
zj`I#tR+K_#O+x&+M)yaBwFWUGpZOfDiJX`zyFsiq4G(i)0v!<65M1hede&HtWiyn2
za@hgB3A2z(?>d%R=G#Cog9kMpsOsST;NRk?jf=%4tsfNKqe4P70e@ig=E{DyJ>+>E
z_xTAN7-%ujh@0>H4&-Rm$6MI6lul8)$ZyhIcAVT`XdOYe>T<h<Ol6uNZJgybFQZAW
z=J7<3tM^!aRbjc4@BiOMgt!0~Dc|nM;9fB@0f8sVaasLjfp7Zm{h2GzNwKFZK$_ql
z;0w*yiF}1*%bE!6>PSCUHg%T&Vhfrr<u%|#GyG`rmjS7dfWE|Ek~Pai+rx8iZJ2>z
zSQ;5v_Vh&R&6!gykI!_^Cy&oGW$n6k*~p^iNUO)n%ak>T$$LSz3O&ZKOQpaD5AuzD
z^5{`CFxdgp!b{;=isxAwe~Ybfk5({xC$4twCBvbR0ohPP^2UIsXPa69#%O;M!Oh1~
z2w`C*_I*Q(FxI*O3m=X6CM7++Jp2j+uFM;uG}qiIMj1xFXa{!ZzL-|#JHu>kCEL>`
z4sm8ZAw2W?@aSd-IT>v0JNHH>3fr`RC@_;YpX?URWB~3baIJvLb~_1k{xaa8R)4e6
zg3$~Ibqq3!CV6Pt){|_D{}{vNv~*dZI9D(Jj~I;o3<pjOc#I}<M7j%xs;rk;2HT{f
z0_E7egd0um_sNZ}$>125Ny}4uY#v$!(F447gSx?K;ljp)IRXv=>d*Ie#gdt#bl%h3
zM=z;(tc&O#uA&Rz+Wh`{UAl~<=Huu^fh`lV<ml}%WHKIl62t?Re1KTX(g|2~!oP8O
z{X>2K@l5a9`9g{TGFzYWQ^ic-d2Z?wG04N5S2&3ZN89=C8E>*B|B^ICfNG;k_4Bd3
z9^57I?QYfaE=rN60x8L`9H5QV+}vDwup!=`JlG*8kp}!K-Xg|zPx};l>#7)gMW@{X
zQj!#=2=2`2L!<2V2$N}27oj}s=wrkn+pAZvS_1R4T-gzOQD3w+)Te`;eL>t4Q<tzX
zd)=;J>tl9R{`Pk~j1<?EULr~s>XpxSAWkHPI=QPy)hPnZ^_f|~kLM;MmTF(nxtV&t
zca}nI_ndshlyB~lfZ5O<J$?Ogf~pWK)IchI8CSgtZGtC|L}bIQBee=qzfBGEj@=xj
ze|oKc(%g67WBJ>UC=pnD0|ve2c}@3_lm8?W*QLjh7w1|;PMk|DK5(zT+|w?!x#FZg
z8Y0eP4tfEtt#9cZf^8cQ^1%Ax|AcaqXgq!EfU<RC)%W+me?&f&DVL}S#WYVerITJg
zfAQibtuO9)maSa1YQii0+D}lo>d|s4#<-Z2h^h;TcPdjl`xjHM@zrfvYWT~`OxSL;
zGt#QuyVogeOUuZ6ZypVhS=xP@DI)(aQg-Y%$<w&hk@Q3!2oR$Dn2zX$oRTN1{C?cK
zjJ2Dqv$L~^OC$+7b7n|^m%uPU5RW<Aq*MGivd@;+c@l^O^cb7Qvf>?R8tP-uTm8j`
zcgN)|gXIJq0mQ_{5ja9v|2<0=5Tag2@4or9<znd`rQ+)I?rHB+@Ztj3zkNKGrRdFv
z3JfTUTR6On{T+z=^?$YSm)2c++FO49!)J=_-AQbxZo4=y9&F6-ZML;C{P|)yfAMLD
zEtIq$(oGgRya|Os;_yDUJz6QPNb;B072HF2<p65G`>U@N-IiA48u;Ol<<d~LlmM-a
zgePBeT^U4xcA@ov=Q?Tl@Gn+kWwZati4)TJB$*lk@(D>X<=RF<2w?gA9iDKIaw5U>
ziGdY$N)$~7(ip6xA_%nhIZIQCKv|6an7%r+#mCf*TumgU_B@2c;&aPIY}&CyPbtnc
zw5bD)@U=_=9rru=S@%Bv{{AQnN$d5nz?UPOX(8Vj<#jeOnMUBxg#;GDl5SBmZc4Nx
z+4KBLG^qY=;3n#y>&1b<qT0*bGOjdaz{3YbiCl^0YB~706tO_Sj0WA4huY4biC`gk
znxov*JiQg#D=w1#Yp`vMG^~e&hsSap>avKbzx4X*>*$PtzmSo;GTs1ZAqj+3UoG74
zQdkwb-D7F>qlsKFXiBpd$6>%ccYU01SPum<MK~>j;BPh=0i+&~N`qrROMHqEUdsb{
zehrLYypW~;0Uf9n!ejQ+UYb&fUN`&e8yVRXopN~@4~vR+1DCtd%{qu<)h~i~)A)C&
zoKP(POgQZhCG|{|Z1?eb9{GI=05gHFCBP1j8BTNNj8@Slxyxpn-$cZSxg04Ikr=A4
z=Du+1Sdxk2nj|v|vRra^Zy%rhM+`gfPG&$~nz(w2Wgjq+NC-faM#TEUr-L_CB^e-L
zu#z5TdggfvYajYGQFQP4@zG?evmq?(5Q|J&{sa~xKhajwbt*mUJnky=3+Pa7n@R-s
z5>H!-(Hz=!4uGz^uo=XV7rVZxa~+rzhVZ6RLJI^o3Z^ZTbR04B-9Rtb4n6Fz1d!u5
z=qWust*S2yhy}Ypc~R3z3~`UyZs4>A<VjnmW^x@9%+2uxu#FM~M$S4_F+4f1MW&2O
z!?;j|tR^AEsc90t2}!o}3S%|Cj~#ezmVq;n*P7L<y(23wU%4Vvb%=zQswR<f{b%=0
z*nkgyG?PhdMItYq_<WeFz)7M}l1aXjEfYwlpJ81{f4rRHu=q~>cf>;<q3$XOFIkLs
zgHwLhFY~1U6Lml3Q$w+T!zjjNTzWwxNz3;Ci=OYkxBCXZ6S>a+UcsT~f06I-J_K)|
zs*i%PIOi|T(GEVPI#Qam(#2|s-hA+su$hBLLEN$1as3|kBRKx;oG%sDb{`+l0pHwD
zj~m5API<8PiBA`ETB-dwXs$a?x!SExoI6O~kxZ1bMy2@z6WGb~pFBgQ#h%QjVRo%6
z<M=LpQr2|4(#@Cc9>)k!8SlfcZ~f#nB7>U*H{nJKiU%kZjUTYQa@&^QG1kelnt&DZ
z2B`#|3TCTge#DxMB;bk6u-RDEMDg&`4b=XWP2R=4ylruWUY$SZ_uo=d%Y+<>4iGX<
z@wTHY?Ld|;-7NXwjq<O{r!ZYefL#|79kGH`zts$^(}l-D!Xa<fa`7hj(qSs_GN~TO
z5nYgS93cB%9r1(0vibvNPd+?Sqb{SX@10+ky(tbh>IU0ki4xM_I4i|!@HB)Jk}6Z0
zZpt@oqHt>cNgU8HA2&{V1~mWmFUx&T|H7{p(*Y&%X!lca16|7f-?<F)s!(A7MOucj
z_n})E-zvIIeNVirZ_&w&`GVX;6|wU%=%w}8vFoG!P4YUka)jL#5y2u!y_+4&n}5~&
z(4>P6FSjJ`Lp3A%0qxHf=e*W`a*lNpXJLU9Wy|r4d6(_&>?*~*h*+*^L>=xVLju4t
zG_F6F#vB^JYl}@9_d|WjNy6F$Z?T#*Y4grJMdX>I%%cJlMngf=6f2kAo$G)8I0gm$
zChgP7^<@j$bU6fcR~d3G>}o$cp#VtR0{I4{>T&u4`6my|0HI?use43*3_&6zhBMN>
zB!7s%CCCYUTo;%YQn;(QyVE9&oPQ%76VN}1Y;5QDqz=Z_t{V`MB&-6n63@0=7~Cu%
z^*qHAOt|2VC}4NfI&hV?{gTNmKY{5q7GVH?7B?CnJi3~1GNPP7`_)DmYc*Ok1~OH0
zEfkHZl!uVoH9h8Osx2W%G$c!8q|P@+O<?@az-w;WzP&r;%z(RI!+=(7K$7IsXm=P5
zR5fx`)W@MRBpUFJC+45Uw&av|X?Ga<)xqnpFGma?zK`J8v<o2g2}Wyl@73#wb-MTt
zNdgRTn8k^`CKPA-o-+SkH1o74viYsL^S*w@v|v1o2zAhI<{v+N@Zff})9l&v(5K23
zckrtmFW|lS-Bnih?V<n4%GQhm&BM%6AYjpf7GO!2NOiVXuw~dO^;L~DWBGIPiVe!{
z8(GO}-s&;#3<4=~cpjiwX}af&fHhAr^APq&256G>KSmg-ieihE%3pkx9(EA6SL`{#
zT=#^3fG5k=uv~$qPAtd5T*Pf4?8nymdU=&dr*A|<WmhZ}3t=V{=e`1L-~g3#>6=2Q
zyU-43?QW0>RpM$+8Jd2M@$N4~>?+wj+F}NX?IrD(;2@JCz#AC(R3#h<hhY|-T^RTt
zvAX^VaL}y@u^a_+xzV)_c!<a<mCQI`ykOTNGb5AQ;z}E|h0r3B=Yzgz0kd@1XQhfR
z4}dY3z9RDAAu=kJ%O>?WPN~w5R8SnH;8Gm+&x+IEbW9x?{`Zef!^zXcxs4&?b2$eS
zE@1l6uP?lcsOgv|ypz+#R7aF(B5s)UiVQWE`<GcyVi}`ptcVO0R+r>NguXz0w=bJC
zzHu|cLzpCbkLvf2;X_LeIgd#l4#EM7!vd*?j*@cxmJI%)Pk>wnaJ-!W#%=zBQT3s#
zs2`Of_|$0CEnM>8MJI7#<DoUb9s8A{o4JUkK?_SjCiYUqc2=H-m>0$vPrsq#miGAi
zlcR3c5j65r9Rn$Jv#v%Ny<S*6kml|bJ+wheHlrA6$O9BM6X0gJY8JIO#0XPIEM)SC
zkcBn75pJSangDE~y&s_qX`o>1w5;3CEZ+0+h8z;WkwLDpA5vGKVaBg$D=8?q_nEVv
z%v*%Ug7v-qW?@<(V?^I<uEL}KICYCO<B;zNYx<@w*C=%XYuu$yt3haO;W5XJ)8FWZ
zGUdac<ttf@2SF@&Fu_1(hn|i{<}X{FG)OeyfT<dSl?5)P;YtSnvBOo`B>$2~u-JP+
z;NPDS2+4>(^k~`Y^>i<aBEbXw=}ePPj;{O0;`<5sBvxPI70?b(E2dCXNEs?yktnC8
zznn&pv`P3Rtd3TZSVj}nL<w|zI&iD>xyULeh2K;;;pSnUvtaZ&6`zFI^79Nrp5On8
zbVI@SRF$2$mtQRvi8Ow)#9DUfFg6dC`F1EmD6v$BbVQLuvO<S;tWcDqrwA43QobH)
zTP4hw6btCuFD?D<<Adm&{}J0mB#JkMw50-^$74wF<17&WM2}a`XGp!oA~*;95g8-K
z;IatupH!L{{pCnRpwg1Z8L15dWhyd%wwku`tpk)^LX~KQKKN;TMjk0ukJMS;b~Lu5
z>WC|IDLfr-TwggscrbEp@#y`H{LFNU>=D7`R1JPV9pNk!y^PL&(nQ&0*!}A$r98o{
z*`|lfgpU@9=^jsF@jGjjOt5!xvnoise}B3_*sQFH0Cj4PSa_B=pj;PTzDk>VSvms+
zfJcxN*%$q>l|Oax{Kh%U9&WOkuXAqIuO9=Xn{sFg>Psv^1T>en0b$x)rva);2*>>h
zNdNN*0PZjE!)QwB;Ch&DPZ>)4IqEm_0_aUKw<_Jcqj_KlBU@fli})@dpE1^hNCL#&
zQQ#dOiKHv)-3uZngM>u9fWMODw5Hy@DGlNGl&Wy=M_N4MJNJhx6rIPg)wuJ`+Twag
zXXEmHRUhY7k(noHsUokEdBqq(b|bG*%>uNQ@3?z*DwsFHsn0+_+iv3o?ULuHdO%r7
za1+U*hD;$6vp#mTDvDF6geI^SXF7axD_(-0P#<tBB%Mx)&1Hgyu<Ih#=1auNSp6jn
z$p<U*T@L3tfc{Pe^lRGXGesM=&}{kZ+K5!93Lg6T;SnFdkCv0DV(I*&e3>9I0Vscq
zHcb%ttdBQ0H_xQ|UIcGaaR$7cC+s_FQf+bP6{WG@K2GQT8*BWZY#h~Lqc(WC9~^m$
z<DUT4)&miBi2p`5+&)-};IS%ga=m(TZX4!wN@u4`QCCO5hj~OS&VwFG5#fObh#0M#
z9_dqMs@~z-4%m3t|CaMVA+5PJxjU!q&ED#rb7sSpw*S)tC@hs2p<jI$a}oEHc~0Qb
zNYLXu8$gZ^sKr1+XP*azc1NwFRF6Huh)u1wR8dh&I8kZh)kQFFc-*i3+QA%5n<i4L
z;tNymo5cPTyD#6fbO)0TX>NC%Gp8#%L%3bYgyca`zOm$QNbz)p3FOa~xc8_nYUG^=
z4qq0JwhjHt+E8Eod5{+7I}CsiC$X!`ehsMh@cCWeXJT(8S{8vGfgpBwep{7BuF!qU
zH!z4|KUA$|elcr|(OM;&Y#bz?{goubbVL=Oke7fXj9Tiu7Cjg$$AR_Op>5XY2sC3b
z<vstxwmSdTAdN2J$bEeGY@J_*WQ?(Pr)K&ZEq~($d{7)-L1`Y#Gm*0h>g`otd<J?p
zvMgW?bMlM$b)*KvfKf8pzB2H%WUBzSsn6IZ%t!n=m*&$1g4!J4dM>HdGr6uXV>bG?
zO$F%Wn9_C#Uq0Ua7@4iX>~UFoyLZ34)%M4157{O!KT9T7UORQFscC?W0U{XitWjqu
z0@n1Pr~x|*uFSHS07A7;ErS&^Cu~-xa&_6{hy7>;rWAOX6ttJ~+4O7iyXPc*am;Ob
zdFyMO&GD|&PAwTB_ztR0DRBr`95Dx3@*_I<a*EM|phOTBqAkh{P7IDq>96Fj_Dbh9
zUu;8EFW>f4&z=?_$Yu78TJg4rWPY4~^;z!aO{i>MX=)?ik&){>Pgt}^zlG0#ixdZ>
zmP*{G>aE(6{AL7#3EkuR7?AqEy4bO@;9kR|_3Q20a;TY$4XA5@xaz7!y4y(81=HYy
zB@@QwE}Le#XLVk6-={SNDHTI1Fo?#d$)c>h>iA>(_TOADpo}L$ZP~J=jxsepJqOZ{
zMu$C#>#t{I<jVhGO-fHO`Mp3X^z`oppA}7)N*x_Jfs*DlATKai8I$lOR?Cz+F#>mG
z&fB?*X6EKrU<y(eh;c0-x$a9}{)_x{s&~J5oW@$2ztFbN>@hnpj6xo#p7n(a73E0Z
zl_+#A|CaWjf^x_`^<CQY-&_kkmoLB5LJsGU&q?lm_|7USP&1W5pFT25@m{}Y(;91$
z^r*Cz!jG|*=GU?8t5%u3Ilt8AH$=NKX=g<EvjNGk*M)|LV#d!jZri!{Sp4dtX=xXe
z2`B-LbK-4f3`{907C>OUsICPSIlbiJnwvM@-iKw=SxlfNaTH_<o&W{1Yk^3z$zGfZ
z$8f2e2xRI*+G!&ICaX*-DNh$B5~{;Y949SDnX-0&<#<cWMfa{q4j0*83(p?c!N$f7
zvGwE%8?^HZ3Z&n3L~L(S2UnMUcEuk^=noOU@J__nHQ@yqyX-|OT`1ujzn>wt1xyny
z2g(`YYRNc(QYD8W$gaaOTYdtpmNvZceuprL=OIf!Dx#r7D}Zh!xj2#iig@$BWHoVl
z)LZI|9e8?xG#JFVLI1E{w3U0uMNG1_&G+j#nnNaF$)99%34SQdhSo{{zut-=Z_7#D
zCF^8t6<sK*C;~}gg&l@rG1}4|n6OToOH*Ndc*_qTKCJ4rkR5a1<`nI8<gP-=lQGO?
z6daXB!n8<s0Y8y52O)$J?|*TFgxe7aN1J`Aq!yeA><D*6t~W=34$CQiRO1O{>}<8b
z-$DAi5?Zt$$Ws?VWMuo4>(}F5nZejHA40WvLE+(!zE6+(#{PBxek3K5naaw_>YGE}
zdvO@NhV2a4XPmvYcU?*+;G(qOlfUL^Hj`E%>1;%q#iNfPPKm#$J2NHZM{_r78JQ{R
z^YrMI<9J&Z+)L(gBd>|(dYvgpg&|PqCr&lYFTI`cIqF7rlScWQGq3vXX>mpM)#nOR
zr!CvIIr~1od=`E8cq-B4d4;U3yo9L<^QX+6Mtbd<NS$wzIF`am%@WRw7iG!SX={9a
zd=AjjDfXR`ccD+3OS*EvYR?+jSggKYAB^M4YVL&_7TSZ1_9a<7PBgor>he+#y}gfa
z(>MUdYEMmCGNDkcjqxhng#fp02%G%=f)j_c7vl&D&v$eNlDDC#KV1uWF*SbeCDSoK
z$6jFgMomPR8-3WEY{f~rXV_Y^Gs1gH(?k}G+T^+Zg0mJ}yjV0R7k>f$mnR#=RCOK%
zKrOu^A>h-Yq3t2Eq<*RiYsX8BqLiCge&>{=fgG(2V3e7m;nQN@E|`$TNWkRpR*Dle
zOKoK}>Hj!rjM~Cy%ex8#!hxc=QvlhjUGMHVanz_^u6(I@6UP_iB^-;3E3X@9>_l$Y
zL>M7zjpa+z&xXHb?^>47+J5R3iuGH#-=QNC&9(TOfEE}{EIA2YR{wYY36$LV2{1`M
z8QzeX+*B^%K-wsjnd0|z%w@Mb1HcPQK#1;~ke<uIHt*1Lv;_Mo=465wwxhNst-HQ`
z8NCEa7sKbdXd>7Sb9ef23Nc_aSn}m}M<jPp(E3S3=JhE}6eq7^ho4{}>EQ1q@MDp`
zEQ@qJG|6S-xUUg9WDzpOn?(e+o@L1u>zgoeeNF~B#hikrOO~82K7;=(sg*O0a-iiD
zHX=PNFLsUmPhry3FL*ZF5x>FPf?)|>igV~b1Q8=?Cz79iw>Wgf*in*B*q4>nqB!CO
zyd$8eTMlI-)p?3No+z%r9lK&$5onJva~tcVC?{q-jeP;S4&f7h)lrgcir1Z!`+T<V
zWN26znV2}`BiyS?3KAN@RO?o}i;dF6KE?~v3Z1T9pP+*fSb1l3aaj4A77z3bE$YZi
z6I>CGM1<AJ3gg~XQPg>@e9XTJ%mN4A-5J*A9{a`m@yZb_6TuM3RV;axUQzwkcNXv)
zxn7lMw>U;UjaC}NI)La9berzsS$p+@gHdVICGmSOwbTy`pI0_Ev_I9+?ghD?RuNG~
z#14*9WK(o8{%JSj4i|2_@ycDlD>YsTjUD~xXstGTdfT_}n(uKFTC-iZ_t5=+9+;bb
zs5~)g-#lm6dD`=|;|mj1wj=&~Gi2AVI+xeAUH7lA<Fu1a-q=;8Z202E*A5w1b_ZSY
zSiZhu$l;=3z3m=9=I)$%Y0b7YUE1pwzk$CqdC?v{l-Xi?bAYBjL`(5HCT>|2Cz{S~
z&Oky>O2McCn5jL~*>h^W<K|?<@@rR#i5wY^_M5AphGO<oX_>~Xiac;!&gvwf0qG<l
z-lc8*iC=LTfQ6dOj0#*V67)80Uj5lmq(3kWI!c5PkJ0c)Kl<v78Hl1Z1;~6k$;dhI
zLJl1|bSi7;_nK7A&%bBF(`K2O$mPn6L_)7X&|@B1$4tICFo}VDKh%i5ZPco9=RlL6
z|4=;p+GY!Is)`A-2~`_T_2}7iqO`Lk>IJY)r498Ig{b^^l}WM6RuU~~O7Lj6hpBs{
z`{LE}znCBQAFV#Db_Mg_#N=n;J!7TX+KHvj9JE_$Q-0F*F=tqYN)k<V6G=Y)!1ShB
z{7va6w(Z*D3co+00V1o<nxTR!uhwFr%<6y;QV<jIH740b-PH{aUw(S+rcxG~odiQ|
zl*|a<qGuDuiCSuR!idbST|t1w|LA8N5vCg~KQp>>+;@Z2Pl~j^p)*tKMvNHYOuK|M
z?l5CE(2FG}1MU$E$bimybcx3qQKQUhPfKmXh^YEB=p=U_ZAUvax(`vO4D%I#ST*R%
z{GWZmw_3J*?%{v{g`?emD;;Q9r@36k^Z9JZb?ZBT8w~IqBv-`)XL`vYf`TX<u}bK@
zT6_mkPTL?;3+XdwY#%3LA5ZnVP>EF46gOiii6#)NuZMkaWqK2aM}H&3oKg@+GX>%X
z^5a;-wU|SVd_yXc10Gp*|NQ5oC9OqlFlJ3>vXK2iq@z_YA6?gDm$z={5I&Me3yn25
zFeAd`FmAt2bA!N$gq2rC;d$|xD9%k0z&QKc6v2I|$L!|I3$mllSomOBGI+Z_qW?hs
z(>M(>U7BzqSjNJ0^<GVuV1Z}hei@zQ;<CVkoQA1h2QcE2BluL7Y1<yx!}znSTT#4S
zLxQN1nLL*+Uv7JpEXf(IgDQ%Rv!80!_T-|StR5+krNCs*sI)D@24&yo%RA|%>=<xZ
z(m_!$QPf4yQ1I*dx0hwwYH|4;Ly<n?L@%0u<@r$vCgSO7k$yrb5KZaH;cc~lp4E?s
zyKRW~%My{g(qIkN_a%da5S}`Wd?Q*s(PeScldtT@h^@;LS65eopQ_3W>qhLA{nVWZ
zCtxDkIX3-Y4p8qA9-4T8h;-mMJv<OBZ4OVJI#u$h2*2uW_;}8^tRW9a;ni<(Gg8-#
z)$#H1cfE^$R38F;&fLYe7JpyWrcD#KyP4qM&CV2#hMHRHB`4?CM}FG^2*=Bck}XOA
zm#SsKy`)1dmHi?Tq+Mr?rmkxKxOVN@b#BDwBWm041d`q+d+NjYvS!*9tD;!Vv%G?^
zC(-g{q@a^meNk?Wa}Adj{b7_F<Y7g1oEG-ITrdM+f|&tE&%ZiSAXiuR;N6QlM!-7U
z!!yTdNj%X%oZQ$jl|pDESrI2m6wMzkd)itHeIVBAATHa}g4V3bWc5)qnv^o<L@;K!
z?1_k0aQYgB-*a{IYxNX&!Z48Eh;#y4=PG6?GQ}F9WF_VGezNS0W#M?gaaCgw<4e&(
zQnZ^fHR1v<BYa_rdp2*rR^wSBJ?V^-c!GLC8f(O9k@vq(=4{ifi9)!-x<=7OrtZRP
zJ5y>fjeUho0APb0Wc8B(nC1HU1WAmQ9wfjL0Otv)#<n1gM>%1QA%l=%kMrXX!q(TO
zo$Bmz)C}!eU(fHoJg`PF6^XPqy)OWs#s+R9)eqIJ4qRL|gOYc?2`uYpQW>ehF;<wJ
z{+<3>X$BH3G77Q70p8f`bamVau(3hbD}rK?q+(pvdo|H01)7b0G!d5fdI+aaz-z+T
zFf`ykpjH^lEG&0-&=T_#rpn&vntn7sKAT_=e6~R*uo4P&#hGm#RAunlySRQqo|&uo
z^MmZhAusN{J+0REMq9K_mf3s<fDmyL{#)ip-ePN4-5k{s>B`TS40CdFw)dK`0zyky
zDdU(4xK;GC=x5&%GkJUyjC(iEsUl~6d=Deu7rlK|Og$Tgd4a{65r4Dkt?*H?va2Z>
zgyX{q88yKR+ws^?E`}zP=(?Skpd{s><-9!6w>W|1N}Laf)mwTj1?z?1GP>k}fH#vY
zNL)?Wxl`2#z5^A390Q_QxNav@4IelUaNYb#jz66z!ZmSUs(6po`kyj`O$^47=Lqow
z7_s+>+xW}@o4195knt0MzXcvf`-oD`RNL$Y{Vyrm<!4G7=_(F^lV#~s0QA>Yo8O7Y
zqv%EuIKGWW<D|_2G%Ar-zl#tm9h0K@gE79j^Ct-32>*st^fQWul{6uw26qh-+vG4T
ziO8%)1i^{w^mH)e=1{@vWWD?js)|PCg__GkIB^7qVLM50L<uG)*(}+0<t>rDjc19|
zMH2Ft8?kg8*@QLWR?uG7S$8J#K-JwM;hp8=q*EI4b0`8)z3WCbWNXmDZ3g6#)-h&!
zl!&1~IO367QhZ;2HMklboXq>SPr?}ZwIEHRzkwO7qSH-}&+R%c{1#%Ub)RjbU>9{e
zMzrJHlEt%!H9}osjZuWy{G;O-F3SqwOk%h+>%}D`<go9=(-0Yu_=jI)lkLiC7Op2J
zKR+rNeZUJ&3v24I#GP`NpH3B_AmpIjo53N=?<Ft%9&gPgtZ;>DP36p2$E{gL?zLC#
zWb!nM*2TkQ1Tc3RGOu{W74Bt<WFnji{EcYpf~Y|Qjt%k~vVc>x7JbmV#n_=p!=xWZ
zy(y2v;bQQ98NY1V8FhPa9E2iZf~d*!Zr|RTmh9h{B$_4OiJ3}1jeOF$cf{U*gGR6k
zpP;N#MF|YR!b@eV#GRt#m#~6{7BTAEdK0(e&5s*`rmZ@0*Jm`SHbU>~Vr{7D|6Lwb
zsxVJ@a+vwNS}bd%N6!i6tDehtlgWI`IdQ@~@n`;9vvIv20GI?Ex)J-l%!A<l!K6m0
zIg%V@i@{C<aXzMwgX|vDzlIklr8^~0V2v>iP1ZG=XK8*ee!r9=7V%%1pj(yqO=!|C
zrKiOj5<mQ;r}^uw(#~!--y!p}4Bs8ZidiB?mw7l?1IjLJJHQmm)PnGyNgA{6?eE<m
z5qIRs23YjoNwg%KUh4u^Ak!$>jwVO%U$o<5=uEz<>4=5rO&KZ^D8_Rl27@&<pY;4E
zCG3MWSy$i-mQq%>;m@6EeUdKL-S;o1r;lZAjy`&Sn2Ok{!(^ADXMd6o<{*3yEw7NI
zi~uS9kyiDFQ3OXh?|L^No@S(CoOMM<+7ez=jOEbj2Vra7xZWOvO_JK1+7YBudj>bM
z?7EYDURu$j^sjz2{%-1hmm_WbgW%1KD0sBmzBRf-IbO7i_?~tvdwZQ4{}HYFe7hNz
zUL^X`H%3perF1$#Xh*FYU?Ovf2^UNqxUL#-m9*l1)lEvk;0r6rJEKl@A#YH_YQZ!A
z8xtjdFs&91u|i%Kd=$sR?%3Zyegi{OFl!lJrE^C2S}nT6fdg;1fsC{#*Y^5*J0gbd
z?!yhCL~zfDD0lPm@`}pKcETR1%Lsy#v2Hq1WCwY%Q()npJ0T|i4PDC%>FL|_i+z5Z
zMmR%MlV;-{pZ2RuK#O~^Yx;irMi;jy+q`k(M&Fuvw4j2ehvuT&Dtbk6no(cT<k7E^
z`~LHDPOqHcAc{Cr#XVGR)4<{kW|9i$fG>-84#mJQ;M+cq(fp-z!{~&P-tXRbll}d-
z54f_BI)d6t0;gyU@^p<iaU_`Z4zH|xai#eeR#v-)e;xdS>?Gosqt@DGkuS(B_PE&d
zUMwyHc$ON^5y?pC1ZHQQWw6%ef%Ghp;rPaR6ZZRQop8?TVsD7d`qscCQiGHXOJ~_!
z;cfiSL=IO;CHCiQ(Spg^>+zqB9hP?YZa<>q;ihZ;aVn8Jbr>%aN8xNGCzl!E++Uy!
zauEDXT(E^4VTn3j9>a?cc%WMP#Oz?fD%6kK``LS`^vlfNQ#A&KQpKNMna{3eZC^-v
zWsn57nWeLN1%9HzGn+MU-kipSe)cXM^{;M-!aCgjkej=^HR(lGzX8sk7Z|3NJ*zzh
z4s}X9NAgGV`kwZ_#QGh9K@8G}^&l&8UOH6S78}qpxpzxvxVrY_$aGxBCMr%Ckk++q
z$-DFK?dv;Z&BTpE+`0%QSqr@!P8>gOEp-^OIO%W?RHuy22O(l)=(g-P6+P13xp8y0
zEg^>kOXdx+heS5nI&9u)!+`^b^lo_dZC&QYpI3Cnz^S*EiP!V{rQeS}kBoE78nbU&
z{}Mxn8|8v@)O!=Id4;FkMyUjT<P)N&+z<;Z_u?(b9WMj^jJeDKG$19JU|9Ni^G`bZ
zX#*jHPUnzK?J?|HTD;KR-Tibm*8WGNhYC@rj15=2rOlife)17!k)`?VrnDRJ(S08d
zfaj5<lq&ufh1-Jf$rOT#VbZ3ES<zPAVLYHtMLsp<u9|+sM~<8aJPBz#D$X1E@@DOR
z@MG-ZWgB&8n~fa2VaVb2IwWz<OTXP1#U_y^VHMJkMZuE^Jbm2LudFrhP_|{#%~6@@
zI^$Hw^-?*~EiNk)?L-wo9I~is2ukD(J^d&zKR?>AD*$SjqCfxqvunBu6Ay5T*jT=x
zvsPX^{brx6%9$K2x#+8;8pDS1lJ+Fb*}ahwQGZTKTJ}0kl1#<(%6$4{Ku)y=RMyqj
zcG0W-FD|iwq}oP3zCT~_!fT6$vb6l=#{4%|nwwe$-yPdtw~UI@$zbgE#$#r-is0>0
zVYcBDA;}T0t;_Itad8pV7f$AuWc#qr`YhbwemJFh#r`;L`f|ZT-v6BW&-xBV2T0me
zGMFdz?B0OYLg#>#(Ww`_Zr!?V22z@Z+k-S^6CI`fLM2r`FjWe^>su>GKy3(l6U9Rh
zme_4iZK^U~*?G{QLBpN|r)SLFyCJYVqVK(^A{bQZYs%=iDoqX`T0z==*|AltR-&~f
zeVh3x?fI@xqPI$-Hz`qH)^geHqZhSjJ2~y)j2HSDy;AtQi%aInmqB-=AF3!`{8mjr
zrdgP)Bz+z|di1deTeF?Wj;72xcU-%yRhxhRd*jY3<zU|jP#o9$DVyg<Xd4)0&U(tC
z<C*$rj_ow1kgF5A*YOUPtlt<rOn5Wx+xO2*(P{14z(Doj&ag1ylXhq><Di$PCf(-{
z>BH4kwSyrSb8GOkGme2W6Ne?Iv|=6<+PKa{2XNqRdY5d(ijzK`g@K3h$td3y$O*Mw
z(@(RTr71eQv2KM(12S=zswlR&EK<XifzAz_&|83QHk8+2@7b-})me5ZcKSGd=C->8
z+~J>}PHUx0Pl6&<tTdQOnKP3NU%)etV1H-wueS&lrL@Aph9rJn5ox7R6PXudJE}kr
z6K93}_<Vj$OblVi)iIPA%<SR4E@iSS4PptMr*ovww{WG7v~_FYc9=CQv~MGi5?w`q
zd*9KStbg%hC6T}NYMJjwYHe48vH2<5I=rosquQSt)a`Aa`R=-}&iw4N&qS@Y55Q5x
z35$;fQL#(HM@jG)-WVx^ii&*OG4)kXBQ%OVqT^GNZ<^m-SYoO%Yll96Q~5M6Yu1~R
zqOi?T6@VU_Vm**{;Dftxdj<D3+iJ4r;sgW2h}su`Qr%F(b>5k%*9-5SF|_wKlhNcl
zN-8lBFsIYGM$#kPrRd0!BYku2m^Q|TZQJyH+JAPbakO_=AJC%{{RP-Uh(b>k`0xf#
zpn}wx!9F-<fg8lao7y{t+&N94Jca&xYG9hOW(4X}@RQEyQn|p`q?{gd|26KND^XX@
z4BSGC3@2yZY8POIUBr|&MWPUqy<TJJw`^HvDB-ew@cLF1%HtM0WWH$)JI!i}KGjGC
zM6sf+1T9@V<@8nUo-<$6)Qoep7`evw`OC<UR?WFUagV`;U_WDo9@CD8406Grawgyc
zDqtS@(k8;ORv&r%X$LtW<2XYui-X<pEkS9sX3W^h(bJ7XTNQ<@V;*kP$a2RnZXkz;
z@)rXOr3!yp#<XoKuXvaeJc_@%?C<SxQ=+rKd=m906`k+tm&xQkbZZ`#8DTXhJUE%i
z);U5NbxPA4y(IvcZ|>n7*y6`Q+S8`pvo3DdteG^7=>OdB%=C|=+}87)4l+X)2(Oe<
ziq&85^d~5@IzM>k-<So8b^WFF6z-|h!4%M4nQin^#!FF2%!&?lXd(GA30(*WqO!$W
zS8++;9h7jO0Ka5OS1$*zm@T5i4%U8neTNOpfXsaI=?nw~Ix@FSmNUuIUdqK1k}_dg
z|BtOVfvY)x-?*Fq>@$ATjBP9_%Ly&E%C49(mP14f64_cLTO|sq*%$`pw2-9+Eh<FW
zl4ZuyqKFoevPG#BQKD4O`#za5^E^+l|ML6IbUNpJzn|~DT-SA9H_2T+#*1iW?DR`K
zs0+zS3CGMEPrQzJqr)-^Iw7?f7KBZlW4SOQ{u70#tfV6rUI{ghmcE!pG^UjsszPVj
z*|`En)9ekh?~MDV<V<u1lhC4xLm#6;R9hx8ATQB=;^DMe?Kk?@9seNg1{*LozZy?n
zDw?v)-QTFsv{yS>ltix;RP#K27(o1>chJj-J%Ta=FG!N6wKAVSXwV?RjN}b@-a^$$
zIs$pZP}uYRm}OdlF_la%J?@SY4(jVh2kbnj6HYAh3#bLfe5P#Y8iMZQYkTaNJkOy;
z$=TfuCW4ETo0@1@1VLD4#swapop<OFBP~<lS69?k&6Y(YSKoknvrS>u&X?I0pkkf)
z@_=7CyYFBlE=@&*)tE&nKj%NS1;`f`T<mh75)T1WTjvY-Bm*pfPdg949nEE8TWpsh
z%e!7pLINr1y?EHPEf#5e6EVM6)dT#=C>173nfiy^SMR$#>J@^>q`L|LKsSibanq&a
zV>Y#wW%ZBC%NMo1JmN5%nY@#X^9w2x`ny83wB+DP`LiPag_l(Dd{4|$EjySK_V2*%
z=j_I>Hqsx6I21eag{{RF6CJ1tTHbbSxPAvEZXkkXFn<Epqn1;#IkLX@oO5SnEE>ri
z245CT8Rt^Gd*Y@`VVDLdPCUs$1Q^Y0N7i~nd$43{Jks82Txv_ua0w4$&M7{g1UbO*
z1DwK)3m3W<HG~*UpG=W>lgl8how<6V7?T~R?j@OeSRe_~O`l|gN)}mNZSJ!N-tTr|
z#M1^Zq>2|+vf!~?E&~Fwi1a1)bIY%!q@)y<b#HUM?8B^fds^!MQBHs4sJor}jzXpi
z1}z>#P-4!l<EHEtU*iaB%nYKq`kLH#;niO2R4eOefWn0u|DTcOyT7D9>;yDJ+CEs=
zJl>x%tGTTcsklighz%k|nDiv(nEE&G*N*;go8_?_kX0m-_k5erilayCj6K6oQpL$U
za@m%_cl%qdZ+WHT|Lc{;pMCKK^Y$lc%vUrw`2^0O8I%56bR=-x#_HFl8ImqbCc_kM
z`hR8lpRu76$E!)<X0zEr-G8I)%$WjH1S0Pe5edbgv7^WiGUp>v#)#Ho%z3CeX>Te1
zg=%bA5ift!&%C{smLZxGX7AZmnSG$D8E{4h%8Z@==_vg+pTccR;*E$BsR9OVWNP^e
zLlQz@NH~rXg#LN=Avm^S*~1Q+0jn^r+x`QGz>N}XN0ld`c~tvv=wc~tL?SC@)FPnk
zK4#MaU;(j?B5uCER13Tz=nJ8e&)uP2-&r%zgNT;Ia!H#mydF5I=bEAm7rqyW0sveV
zH^%_r%6{ea-3<OOR8u3M{W1~haxN7#SPu{_BqRihiWPA+XV%LEyVEOT1n+@YbvR0m
zX6!)4`~0Dhk^p_%Fu#$auDmX~<%rnY;>IFQOyc_8x9>H@G^~`t(}Y<{IDUMf`gVD3
z1JYI~`Sw%i&+lGT<kw2P;6!3Q->`rG>yPeUAO`61ulwl_>}rLwhGhchwnltP9~AJh
zA*V>@)>WpTKiP&p*M&vXJ5ATDOqjtnq~|$=JW26;pIN7-NO=j!juzmoLp4pRi10Qh
zq3tiD2NqKeU={P??YRUl@eC6n6kJM7{|MxhJ;OzTq(;XfL(aATSO+3n?2537wAuUp
z$6_4Cb^{f~i6RD=)ae?|N%t@R#*ht<!f{wdflxr2RSfDPAJwOq^@wgcDgm*SfD&Wb
zcX$IdYSMg)C@WB2DFJx(E_HU;F_DZf%U$8JM3W|br_ibnZU8eWZe!0aodTTWGzfrS
zJmyrgeOhD!B5M+=g^TEdY#Ob+*lMT_O~R>*fi~HdqqOAB^3JYVg}_1&T#bm61C^Wy
z%m^-j<@W8Nc_X+V`(@Q-hjC-ZlnJ~mW2T_r5?W6qDWZ^f$P7i2C=1uAQWiq#LaEqo
zTEpUH;YL;3C-(0g;x<2J<dlAUNd*SjHL4Kf?`!+zGER&AU@oZW@I@ym1QRF{0e?GF
zRarUy>f<yjL`}P`S>wvOgVWwGqm-G%U=VLn+QCf^Xdya12);YT{;h0Yum9U*=2v6D
zl@je`m5JZCeL<auTub};q8^pBu?!DPeO@gJ1Nb}OPLqeM{0$RrlnArAD(N?AGwHq9
zSbhU{^35}l)yZyQv9{z$qT!gnY0U^#zd2_7lUt9D^O#8$Cm!;W>FAnR`8P>`$A}Gc
zmW#I)aloGBE_4&bQSf3`r8-LrR;f1ulPSvHes;Ey=h#_mdW_}e&7pjEhc45mO<Vo^
z{*MZgGqUYPKzrHDM)1zKdbM(6mq#rRoY6!8yOmXidNfIlcHDg3C8dYx*FbcTzBnvc
zupr8ikP~1tV%^9obis_yxuh(ns{T2;PyLb1%?=tGE+BR}Tu_CQ-PbqBY?Hn)(=Nq?
zf)8$XyMk{MsGzqmClF0;zYIc)PFhzveC5;YjC2_7jt0#ZOYhC-V4HVdV$?PsCUBw(
zQAwi1`L|zbYUVSmsB8N6|A^avwkmy|a2kx-je2~tvB*~gJP*!FvP40neoz6H&a6XE
zH)%)z?>K!?vX=4af>eEl;?m|&wWPo=e+(nH@JO;3P)bU2>Y%2QU}6AdG6Pvt;1lAI
z?CS~R@23#rGNqc*@am4wHSRkSE<P0XzY3>@KX|I5*YqLD{b?=+S^T95t8Yd2AhlZT
z4nMZ=K5&c_l!_RjV~}<Wx7qRlu?b0;O(7Q+TYRaw%P8@BZk+)HC{~FAVBqW%03E=+
zpL;2l@hloa<?3727a791Sn6v_v>i>-G)`dvvH#XxEm<h=jTk1#v3Y2Z%vwaC6yV}U
zrpyMWXD_FvxD7yW%T?jq*#I=PtsBC=E!Jhlp@9@2iW@i;cuTCVWtZit65;Z>OUT4l
z*&Lo_D=ul|6>C~z8Ho@gjZ#aF4=uOP^S?e7qHe62IgxDOb}pz?HlRm2JP5)$i>D<g
znUX$`dbd6GW)7}n%n0rS#TU15=9`T?m^<a?zKuJoG<6*|()XM+wyg~;BYu;XrQ7KB
z9W+Nt(WX6}U63f|4bb_>U4x^KggM<Yybno`rmO`O;PG}?vZu-ND-b00jasH9>cWc^
znmT5SwUSvsIxL>PD5aVUWwcgXi41kS+gciShiY<~Sf7h;8tM+Y3{pRnj3k=rDuqv=
zm<}(0+wXVK^hMwxdMTE%CB8M6hwCz$LhMvQq6?-^@mRdz%_*X_fuUh$L~oJwaHGIy
z+!EgK39}KR)Pki}KuwspABm<CSUaA~4A*qYYg3-Avw-nWzj<>S{Plp6c!nCJ>mUzV
z`Sjyj#9*-^R0Q{y$(zHpPiY9GDO#ooB=38!pZLuZlx575)HDe^a?sz`+EMH$N&haK
zMVaZV9{!mUW<%ooK%AQh|5d^3c68)j!u*NbGR&>1J>ojSL<I33(I07~?W6HtswIf?
zXxBdBtBf@KYP_xe<}Fz8(z<A{_Onw{6CBd5OWXCcFMQJ9jf$XE_&C+~qepKkrnS=j
z-EQX0MSx(u=XH_N0idV@fH5GL__EPU6j|+QaC}IiNT=Jm8OHR^Z)(u$1U<5K@><0}
zkWr~nfoXF=ML2nb1WuENLJHQlP9q*xRV{71R)&8B$3y@iLyFvM1Bx{orF?F%rD-YE
zwG4$WLbo|Kdb<oJ0o~2<sSV~aac$d~LeTU>9bE!LkaVF4FTa4(B=^ZQY~aI^aZ4tf
znwm<-B)UGDRK51PE$xK@8^?h(oJw>Nq}Dx*BY(C3eh*rC(eDu}5A)FvP++MUsIIfY
zEthHaV;zD7^n?To5E;r+L7WZ}dU$k2nPhKwbR~4mRFwP5wt>)ePK6yA?-|CaLvqc1
z?t|q<kK&CJdUok4K2;B85osI^v1Cqh_YsW?527kU_K>g>z{0a?yOOkTb8fBog0~#T
z4yn)1yu4;LyA-qog3U!#Gct40x`9L<=y~fo>QuFG8>UcpLTuTmqZDXEcDI1li)b2+
z<#Z~Pz&$5<%!HKG6%zxNJMKHm?EOm}rDk|$5o1=ehLXiR!bu7Z`_lObK=R|X;;rJj
zhVfC{*9;U#2w`NA?lG1Q$OIHrtBOs`3&o6q&!}ECOA9H&7HTAwz^sWab(3v6J5Kg-
z@a66jg^V`__nTu6Yb*X-#Phi=<+YJ;u#!0Fc19yCkYMT-4n*lMn=4LjILUK}M$LEq
zO)I$tU$ZNFg>xDU^sFi=ydAYl)3z0F^w7DiGbd1+M>Gc=DAPwhL{lpgDyuysk&w$A
zB?XKOujwiUdqjJ%bMMQCIyB&@9WScdtsSqKe94drk&uvPld734p;n|{xiSGnz(ntL
zKm;?oYA+EeVc^O?Wb@CH%onG)6`fN#Vu2KvNQ7nIU#Iwy`pcE5VkuF?534s-I>mST
z^9J*npK<2#t9jm_;&sCOOT>b>5pycyu~LO#;!$a7FN`{Qu?4qy8s#wYztQ2|m(&Hk
zIDa(hG^F30!%~(Y80a|wRG7Vz);t@8n>Jlc`K{-fG|(6)>M99%N5x9c*qNZQh}z9T
zi`icXFH@@Xxu4K9rBvsv@D0#Dv3m8R>RnA%FP>;Iq6hbGUR`pMzFMlL4)>B*S@^P`
zmnXYd*}Mm7<l%4{8#G<A0*BP<bY>8=K-h`5o00vg78#b6W>JMJlv1m+I>U076)Eg-
zR`?v)x0JUL=-?J;5reJ}Yq%+_gAK2)FaCHUZr>_}m?+X#Pb%)B3rcvRRlIi*Wy8bJ
zn=Dot&j`l4C{sa<C@al!w3$HhjjAOk<x#YRyZD2QVG1=i>)5CH#oZ34jy$E^e1G+b
zOSN8Rq-V)|&^(K{<uWEn?u-W67{B_KF8W}*yLazOuO(CQB_&Rl*U^0PV?P_E|4kkb
zcEnzA&>U?5S1JwG2OoZNfHCnBrC5W<%Y2s*{$s1FS-~l5ggl}@Mvu7e&Es!n3v2ep
zPh%K#Qe0f4?g5^^zWTy{lfKavGWw8pXMR)YB0EpnnH#oJOItf^{)SHwlnJQSjC_$d
zi8OiHX@u7ivGGPgC4OSa!6$lLeJMU$#W7h*`LDPqc|WY@5XDfx=%s*i<I#T2`PzUU
zNWk`qddYPuCkMo%uhM<x%E_K#Akm^YpotQA`|?ob@BOv6==e5P7n25+vIiw+PrtCM
zpcP>*jm5v3TF7zB+MSw6J5g_3`rITqN9ks&{c!c$pDU_052ucv<zJe?6C+*=cYW$+
zQ(IdIfYK{`?LGoUClBd;nb%rKh!QB|Ttn4xsPpX2w(5xOm-#iy8rs3MJ9ma(zURZX
zz_g1OM<dol_^_d)P&a?Z$r^5nM<}JF%Z7y)C$EypU(#wSdiIo6n<-u(Y%zCJ8x-{B
zejRV!y49OTKxjHJs%q;sx9;5OoJ!cU*v+via&T4QYq9ml%<Iy6yFZ#D+B1MPli?dj
zTq`Ij*s*NuS30yyx#S$H%?QsTlFXokE<gNM>E!Zk_P!rBuE_GK`NcK;*pc44fa)sC
z<{gU0sE0QO`n;{BxE5w(1W#_*25#Mwd*{@)IpAx7rf02G9Q<V~8pd0EV5~lCUf%4c
z9Rp@5>%9`rpJWDx=3$ZqOawV<6_<v6FHsLNXtJN~UNRJuz7N(+)$kJ>j0MQ!Rnp&q
z*^J0DlkXqO=EO<#Nmf`DgV75w-|A35&bixLn#9Oq{knI*@T4ccF|9~KR%9mQpwZK$
zWZs{sv~=-&6QBU;V$5^+<y$${)6EQq4^QR`pLscU)~v!drDo+{1>!FpJ<+h8)T#tE
z<1>9J$LZTDJ+3~s`4+^`ydrR=hj%BLQl`-e_YQYv;<>@OX@<k2d$x;CX=}QfHvLum
zy5)g=*#>AI*yhlzr0gf|uTDvi44x{kYczmYUzlpCuT+|r&>v9v^cC;yW_DdN;4sT$
ziAa+VvOP3GPD}-U72h>8dRcqhIlPQ&L&`Gof=8R0^RZvP4c7(O+j%pT&+FW}{?GG1
z(#RCrf_fla;R1+<+QxPLO^EN#sgw~DJnfe+U%tuJ-UgwJcT_D$#>?gRIV69_B})!b
zHNcHNCL6W;LRJeWm3)mRZc%hlFFor6*FCNldj|Bf(DsP#%4IBd|L{d(8iZiTsSiRc
zs=v-J)|ut+yIdZq5xXQ6^6Z4jZkj#Aw;NV5LMt?jBw4$YCEH8Ym*LEOK^Fl|7IGlc
z*9{JkVuiRtDgcS>sOiUV`Q2utLKzyCT>7%MOlpBI3ArcF%-o)0V8kC(s7q^ZQP}E!
zRX@>qyD_E#%H2bW26x{2-_2+=xb?$j9~w{1B+t9WVzp{EJHbnj2kZb<jf^FUf(|aZ
zZS#?DD2d6O#MZt~y~QU$+UW<LwhwyFpoKg>>x|b_^eES<FfijNbEp<gHJGtfE*#c3
z>snezJjka6h+7JD7+Tt0mn4({fyy3rS&q{}nKmIkIK<;gRX<bc3lz6C6g?FT$PZez
zNW<@2a5%>T>?Rd@zD;jgkXLS8IS#;chY~=Dt1^xYFd22c+9FcZ>~+`6N#sQlz>8S=
zOh}J$Ux~VxW0*&8CbD<r3jJ@N{`kxGXsT+8d0WT3oxcycj@C`&#p2~opGH@=bMZCo
z05-2I_4l1@s!3BSYxu$|!c>^u{J~)2g%0sEj((3ezGJ}x%5f?6)T3NnnzZZnxMBtV
z6oLoFZeJ^Q4>=1{N;z;yXgKt+<Y<$PcDA-L@LK}J@L_wCH;f%y5|`1$6)TQOT>&do
zL9_FpI^ovQ9z|w?Gdz~?AbYT6xPj9vDm-*NR%~p~+|x<p{>MYz9{!TvxD`c{cgd&I
zo_GElMZO>;G&<M8+8<X{9TpL(++v326-wz-_{axI!ZAd%^rhFE&o?)Wlvb3BDxXt2
zcM+<oWb2)1_Yl!!U+kQlq*s+U7^qm18KycxOn6c!><3SosG_5*r34i-NL7Tm!T{76
zEjUs3EkX|BPX{K<pR{_0Xm0*fX?$A6)6Wl1-%H)y@h-_lv5HlKV(}p92n1;J!`yN2
zXCPSHwWZf8^@)4FsUAX2*1@CAWesAXqU6bw-XwR?lEWc7U6FJ@KAcBM*1Jjsf_{x6
z>ZqMn!c3EstUlCmNB|x=;M?2;DYi`^WSO4Xnc@3|8#dOTb?S(0IACSWba6D2(Ia1m
z%B4D&B#r#PczXc_<-`-`Xw^glPlxG=Fb^5)Y|414Y5|7Oz#1q?eSuU3)!*OioGI1V
zOweE9FsO>+(0b^z`;2){8IoX2gN@3XHc4Pi4xzC}3O&E43?=~eNzc`J(BdhJ7cWj2
z%6kxQgOiho5B5+;ndh4X*;-d$*6Zr-p;lB(<~M6|5T{GQiBZksB}=Xb(V=Ms3;+ZZ
z$c)$%BO+7A{Lu>LLd89q_Nhkv5&Qsz;8pARX=KMCFJ0Gn&C9$GH8KzKU8TtowWmP7
z93hEUn@>TGbCshROo}>nj$svjyAua7$&IA^(+4X2O-3mw2&6vZjv6<f?dmGK{di!z
zNkJxI2XlWB+D;T%UnJ4g6)y$>XG4Ycjo%MMAXye&W%~Sf`qly9ztn)%ETS`ZXz63@
zf8jH`fNA0nG(dYs?QcxDGTtDr7_pFpEr-Hu5D8SC`&^m(N5(@OQ?H<8Y|BIETa7b6
zQ9VcF81sbDE>#4hKC&t^tMacxHH9$O)iq8=s^_pC@UZ>aSWc=qEJC(T1=JRP=~68j
z=LUa7y3E-GP-H*=IF{)u-SAU=>1)NMEMmjRg(DV?@xK4J7NDO5HX0M8GtGss)^-$4
zJuQDogh~87ORmavYC;~fm)50+8HtpkBI3Iq@ZGIAKwj8zr6;XoVq3)U5V||gi0~4C
zduQR=P?AmY%V=)6MX>W~rcwVKf}Mhl5cq=_NoW0TxS+AP*<K-Jwfb2c&wjT-9ij8B
zA<1p1z#g;KE$X29q>XZ#h(?Ad>QQCoA-*Qkfx8NOI%6V@1xkSFYB9T<wdZds4;NCd
z&{`hH?d>=g`(hS}3S`Aw4ZrE*-?0mXw(%<2_eYGE6?|EhT5Iz({0yU{^==$l4|?Tr
zVEmsHdc7xjg~77c6xOZn2uC`B+^{<Kcvi*waZGK9RIccy)tdQY%&a==<l+*l7d}!C
zZoSz!JdQX=`&wufpLNg|Xz0vVW^Upr!O}QZz<Uo}TJ-&AE=a79Y)y?h*7?OaYIIQw
zOKics0x+h(RS4g0RLMd%QIKAv_<b>M2WJr4kwoZ-9;&K_S&=71QgQYW`AyjapEJOG
zJvDEw)Ban3651DmjIfaek37n)(wi4rbm|VK{;O(dU@d@F*8HIyFB(ihnm-(hQb`DY
z(<jj#3EwFxk7q&-l?+ZW>jcX3S7F%)8e!R#Q9|kgb&OU$V)&4DErxTD)i{f@EI)qc
zd5O*zJ}f*E(htr|e!8{+2ABv&BOoSN)TH@vSE3=NSCuquYv++HQP^IG6p%xhMzS+p
zUIOAEwqgT`4YJ~+WU{74zz0x>;#DP@d-9gjlBbWUhIpPU=PN_s^l3?gY51K68_D>d
z`u#=cMW=_9Q!s7iK@Ne;qoQqp-9@@<dshxxK60sCCLX%5=Ycjsyg!H{aWpQyDIZEE
zf1?qwUW`Sd<1kx4Up*6Tu@ubh3}^Jux#k+`GQ0(VQopC)^lakmV~#Ve)&bxcxPd`m
zm=UUEpMnHq(Y4c8snKtZn&-;r0wDryZpjysq0oJV>y$RdA86QS5cEXvfF$rz@7G6;
z9m@qn6PrErKCc7*zVqt0)hKIA9nH|Dj2s9)jo{=PX<bF?F32gtNrF6JO-Hr#C_$U*
zj=+m>F6&8^RBzw(7<+(IS-=Dz+4t&hU5!yH&WyM-QGi$wyQd-Tj@U3}Ewz3&iW5hd
z&^NjrZ-%b>q~#;1|L^l`X}l1Q?3JQ{e#oNzDmS+q*rVV|H+GxE*MP}n@Lbf|*bw&<
zxt0pgS#hyr`S)1Se}Ag8wkyV8#?D`8#D_ClDVj&|U!$FycV_8=1^dBjtx=#)<zbjJ
zL%N>#KYk9Z7LjOzhPIs8*>eDYNsx14TzF^~t05@A?RFGAIQkfg8<NP7;U9FBqQeC+
zmmM@hCvhsRM4&|$HFj>Lp*9Br6CEwo3QvTLy<b9`DtfOcB6vrzXz#W3KOaX!Y55Ml
zq1csK$n=M>KLC<4RcrsGO&|3mXv?=ghQo@WAR_A!;VfJKh;M1w@$smuBG8iw4}ZP$
zvNL;0ox;9efue^;Tw1aUy2zh$xq@6LuAI=LBL5R6oR4>V7Tke6+<Z+-((%=R%F+M%
zjL&zj$<159p_65l8xc~7L!#tY6cUSCF4y!q)Y`(bi5ht)e1YImWUNlr+@1>1A@O?Y
znD~+3^f_Dv@sX%S9YdX4GI_&)Bxd&G06t_TgY<D6WQXc;vjO>^bmKG{JIHj?nK0Dp
ze_%T#!;|%t^v9rT#o9>5O?b4f?$Opb<1R}-iHK74uqf;hFPU2vWG-0CKR<uEKH_?l
z!EX>EL?RNn12-FaFOJ&j)$)5zPV?q<`}W&!MKlWQql(}$r^uMWsJ6(~rPrc_k#pEj
zlVenTUWUtPeWVph%30n0KZ;qe%TfdomC%^w(X#|z*72-G@{}yf?e*%(zld?<ZX2cq
zJXn6|_2Yk@;QQ^QgrN#uNG+(o6H*g0VMs+41OwR8`q1D8{`XrND6k7=w~!3hpdvY{
zzM?`cLO8a(D^HyLe{c8~??f9ReP_p_hS_<^z4Y24xE&cGWB&MWv4=)l@Be;7Wzjhw
zJK<7bM?FDa{=avROyK-q34gy(?(pC0kR3?p%wFIBP`VBfjuv$YH^S?mJf^;V_jQ+p
z<;$8pun}~iliK^{=IwtcQ5daOqWH<g_=bG1IybH6-9LQNv9(>B;U?X)f83eZy6u>l
zeXkZj{4Q@%WS_{ft^*I5jcvoO!N`F#FZDb$&D81g;)B<(|MFX~XV}PDL$#D?W|tZ~
zXCJO8tb2CO_xv-Ts&5|EojtD+P&EGE($4z>f`fu0fG35U;p9(az{fTjd4&|KQrm+j
zw%_z_T5{;G?>8axg|v-@NvE(UG}SP~u<JMrj$%;OpySBlf$m1Z@#Pp)F8uiI{_kd4
z{~sITRv#p7fPXkuQcK{IUIbzs%|mC`dxH8k8jpIa{ykA}MC?3W5hv^%uwebQFgB;J
z^tF+;KLZic_fyr4kH(*^|MUI2Zifm|qXwcKVi*bk%fwfZl%o5TE{?jQPg|+P_%Yn_
zpYP|c1XB@J4KYp+rsUw&f`XacV%f*Uv%Y`ro7cfDpYP9{Yge*)MqeC;4CvVE6RZ%U
zy@ux%>!UkGFdXvNT-k}7P#At=p*1}ZvJG7bXMQm!e9pQzx)mRgX&+4$CxzlIgO4$W
zK5>Kk+l&g4vVkso3KgQLo#@+l>aO+j@>+=4x}K3Z92k1S7;XJUv*jy{?~arz0y$@*
zvLM};!VC*?L-LV~3QEV5=ufp8`s&NiJ{#|m49&9ZL}#=mw!5OEqhk&cjEy>NiFq{u
z`Zs4K_4C!j?>@ZU6*mzbpPFq7u|G*jxKgAul<|V?d?!YTjE;q)JB5SLSBgl544c?C
zm3~P|7}DVr5JKDXFKN+S=9+X8;ftWYBO7Z1rSG6L)Cac9-m4C80rSw8l*0fq1{Q@X
zhD92eYSx$m1;{9uFf?>H15uq&)n2*qO-pfd-)(gErvb{b`=^SmAJkmDqQuz~x^Md0
zwQINf`=E<oa_dO9)YQKM9Vu9&ehDY%&m5*DoEpmAcz!Dv{m8@(f!$<ekS@g&6^Y?l
z+j?}(^~g{SlrrFt5;BtH)p5&M{s*4IB>8&^gtRtd#D<npDtmx|s9KUt@X3>dZAPkZ
zh{Z23Ok}JI63ldNrhL3a-_q6I-m(r)G)yvd4)btjZvPqm=eOL%5!X1wMt`!C!Nhhr
z{#dvP+~m0s`HwQ2E>z0gSsNVzoRW;b3g}368|AQpYOmyVKYe-#_BwO{B&gndXExh%
zfbFl7DRfTnAxJTWuKUhl$1>`ntU7bKC$eH%q$K-laK`(}<-VT&#DnpwR-cGR+CheQ
zCY)|qWfr?m{o%Byv$2BV|EW}7HjEj`{4oeZ)JhICb(a8*GTS1`7BJ8EUrBF=_ZJST
zzvjxybE${v87Io4p`ey6qo%rc)D3Sdg;IJ1&iB?WA3BTyG@~}^hJsjDFbp-ji;VA(
zWai`f_BnRT&PIaOh8|B%O;tP6)1gz4<9&5`+^gHvy;k>#XV<J})me-+MDP?PVK1#h
zZWj(^iRId&9R@L#^#WZ$yC|lkDuKMDdgF%K={u;uueo&HJa(~EC)g_-l@bsH+-i;J
zPEiWrqsBLgL5R?PbNiOtXVG0{=C^nX7f0-~R6*$TnRp$Vnj-ZP#g{}}CId@u4H-3!
zBQ{Qj=|yJ*dnK>E#N-RB;eO)P#^Y*Du58(d45U!=_GOkqSM?V0d_E8xf&<3AvQ2V#
z6suWPRTqn~Ma>iUa3>vO2?l;=t}HZ>q))1m+BYIOy_T<t3rhgqp+dcro49>!UE}H`
z*+EH*))W6w=znvb?vuAL(&8s<G6mTU9!L~z69u9`{pZxr)c^|(aV?`ac0unzyf;33
z>)k9yT5tLGsMQwK?+y$$Q^bHDsj4RbfN*@})~&UtI^u%E(c+Pcq%#%?oA9>pc1B`#
z<_e~Z(r||~yUYm6aDmG!*`XlH5%Ep?qP?NugHlxsb<79PE_?=1Zq%U_|09e_cH!Y4
zHn!%!L`D43Al6Y)W+IC4ZF(~f4kAPVxza4rX2(!x30h-Wxdw@PROA0X`I+$)mql@a
zShl<Vnj5fT!?>MpPSIcwLcOb23@+<&QgC9v6Y#5A#!YXbREgs+_4Ic_ut2avyoFSm
zfj=531+T^ZPCKcQ?kw}%OKSXF!*icz38k!ZnnMdhB<(+GCHQJoPm8dFjE2}#U<!aN
z>bQg7y94=Q7GXnxna;E00KNj+9LL*A>cuV1vWS{{HQ!bg1$1CQnSFU=WZp^&K5Xu!
z?}hyU6dKRCwNH82XqwmxRot4_JiJ6`Rh3Phlum(xR-1fyhN2`f*fhSzfrFM~Ql2`u
zj5<67WSlt=YqT>0&J*Ztkcg<sWzd0aB9yEPmCq_g!`2QhPlNXd@m)Sm*chC5Rx1`Y
zr&gCZK7({EednAz2$U?#JckauTl|Cp7%}NBOEz16-;XXl62-_u3d^WUL33d57~wV<
zqa#f;lmC+fGgaaeDl1$FQL-=F%6-h_VZgld4FI}AZ%SE*;z_K2r3Xd#Te7g#ZAbo5
zR2E6|#Q0X4Vey$^=Xo=w3Tz2Ec~7ZUGmbi@@b4d0RK#)TbB^^zb0ZihVR{9E;>1n*
zfD%)=JvbRU-L#++5`MIi{jTC&CD1lOI%-%6GA_&)=FPlIV<7i{dMNRS2$sFboq+zb
zUD>5CG3nywvHR($BdMH1o3xkq?@%K;K@Mb0&ozbplFG;NfrUAs`q>q`PKqYxqK#}m
zRA<eu!yzgb(*MY5V2;4@*z|)lrUENl3gLcwh0B*esLd4!2wNxUj2e{W>+VS}gPSF-
ziBttD6PP!xE){N`L=I7xis>Q=Lsv;fCWaO!GAPGvjlko&N|8&&y!avEQN%VvwTLBX
z!^I$W1@d=-dPdN}uklC3!C@FPNl;k&wh0mNRIw!Ra%wQ~8#{RDkTfooQoS#mp@Ry%
zvCyP36c8k)lJo1dglx`8eP(rWKm?u9M3oH5@<B?j?8q*1h+`Sv0iNBRxb~@CMzY06
zlazi1!IjtrlU1^dO4(NGFyRR+$5MNo3}=n#?Nw%s4kW(T9??sJ>vS#Qq!|P_43RFH
zXc>^^mviIioNs$4g_@DBRMw&rsu;cKfx(Wnp+XLUl<F!$QtV>eEmXmhjgcb;M^RQk
zoP*1{)V*9SJw&I>U~OA5_&dp}Q;n>Odi1ewR6i1rj!8VB{bicZ3gGZ+YcnVZNs8hR
zBsvZnJmovzpix`9m3^4@iBvZnDut4H4zaV|>VNy>vN_b-WvKo{1r%0XY`T<f4Ydz8
z=!i+pYCI%r@xvl_4~hh>O=IK$<dM4M7KGsKCh3})JW;sNuSn4)!$i=UB88Le4&0<r
zibn+rc-#I|^ql)C3ZCGqXd}cjAz({~0T+T!9bt8ou<mxLTG-RH`egL_=AhnFAMS0e
zVpv_wa{~`mn`HvrA>xRndyFGg$wgGVDv4rP0yDN?<_siTp|L01?Kr${&+gsk%2^`p
zbT@3;Fci>-SP+5drf4e2+uH!Hh4PmvTo^;AVDt8UXC0t=8-IGu9(HQl%kO}=vv+zR
z<5yM*DU|v=(TA;5#j8*tE9s3<4oB5;X00g97z2p~2cMvld{2!r@k&w!aNCF^9!^=h
ziA5_qfQhH=iY`Jjh}@kkXq30iJ<p%WyeoE%VtOMsZlSz9d`YVMK6?%OVa%A&rvKTo
z@{fvQ*Sc^8+fwYsLqsII2xCOzG1R_~c1xt}%8mXB$pY2tcRU-{5gaL$P~j$%hFnqd
zji$nDDopp}nbET#Kr+j5e{m<UDn4O85hK$$YrKeV%qyN@7Q=b4SE)A%zqn$cSKn{S
zAAZuFDpzGm{t0UQ)J=z6&n1YMaSz}&rxbYk5uuUT`W*oLHU)5-Nq$P7>4$eSc@rzn
zn^YoP@gUn|pf>qih%O|1c8ndRTC0V$<l4<0IThq;@h{Q)yzD!Id@#6#i#PR{N+Bg2
zwRTLEG4*>+>9N=qK?c*W)g+Oof#$b4tn)8wKA7CBO;JBSWKW)H_V@3S&NR$`#ph5$
z{@>*$pj*_+>C>l+s|i(T4)RNxy=D~Q)=z)81ulWlZtT(+e}dM{<larI<BM!0v`KtA
z?j>4Ku_cii_;2T|-AkSNsNhye3Co&J?4EU_$D!KvHkpbtx)?!A6s9$YF4UbA^96)<
znKQ3zP6a5m&hORh(0EG*sZVSjO1m^E)JWH^K}^^jO*-i+iLh60p3_MoJkr$o7hVHF
z=TK!$xtE^0ZFi>X4q-(W_a0sns8ZEHYLAyWg#F+!5vOPKVr%Rm2|2s;6G!wuzD94@
zu!6-tFB)}=l2hFup8mcZuz1J(*(6<=W-_cz2LElV9Dxa;`RVwJz<&vsSNPn0)-=H=
z^c#pptZN5cuBT4gX*$}toP1CJJa$tpch#CdVN?{az-{2-(^8cJ7PFkVmu7ih26ki=
zQf9`*Ey}jMP67=bo8`f01f!6iuH(><?DTrovMA9wr*T4|PWq{lRS%j@^*ULlx&Oui
zcmR<<)Qq+M`LC{=HySQIJC$kG_F1+2IXYrb3EMr%W~4|y{`lh$!`WijM4~E110@Su
zrT2z>k60Lw&>pNMm+k3^+xxu(!;<LMyLLX6lvFmd`HerL_J?M5ImyH}bD64lYa$%H
z=g%FRiJX&G)cp8Wbgi6l|EK%8LNQIfPAdd|6DH5&-lNzLHTy;%Zz9G=dCK&>59P_)
zoeP5c&Yp?5NjT~|y?R9iAMt3ezI6HWV;+_V#FVJ4+os#HELDM8biVC$biyBY%5Z^C
z1e2bWe}Znpcv9@zTgUel^3TFbj_(Ou^Ry|e?%sm5E)>#wH$iC?8G$RQCSt^=VDW4Y
zXyS+%AL0=+VCF~7;zlNmV@vre{QnybCsmz=wTf+A2O(%1mWxSRc=V|{A#WcmE5Z0~
zQmAL%q?~v5wrt>=ST**Q4l~lZk#@tf@+7?hFT2+0_HA9NX`xQtqmo9qk$5x7_EK@&
z$R6U4dH6(u#Yq+U>LQ?}ojpk<&ep$SkDsX1l3(f_c&KB17~-1vAVhs@t7|}_m(eOo
zHtY!h8<5c`IcV5L<GG@m5!D+DgKjL$rD~VW0>a=?RZZcW)l?-vi#i=1Y?HEVpsQ=e
zxTOpFcQ9|0Cwf0!U0p}7-DkQV`ob{1s@b9eMtX-?w9V(;lCsotf0yn)lhZQ?z8JzX
zm;pXpH*P#SSFx&>d#5ca*1ZNf{qVw#(aX}+7d9xE3^FXb6IEAm?!4*Q{FP&?q{B&n
z`%DvV1T@F$u)-*uL^j58I}Be8;O0;>wK?Sd>IEj9z;wToW{zVk3ta({=GXf!p`KIk
z21M?_)fP!u0DCvoXE^nXU^72+MvsD-_@cPeh47+JbfIS#rU(`7yc0{DdTxqd6|QZ%
zTJ1gH<#h@yYx@H?qd53saZU)S|4dUeD`bXIc~k2`qy*+3!+ESk)~`aRV>}Skl=)Gw
zGX5uXnFO+J4;nXp+wU-?Z)Cx=i1FWmF9B8H80a|gQtE~C%}oYM{h7xOj7edp!=mm)
z+1V4%mSsIXz9(e&slx%Msfr?JOh++hST`JkiW=tX;@NP;iG}fKo`vdqyiQvboYv>R
z?AI^eu+|O=#k%1Ty1tP5V43ibIIz{~j8`?UXE+foAoLK>P=@<?(NY7lSRTwO_qdUf
zVUYLi6kVTf`SgcP<5L>lG&zh(-oDJD2{b+oR|=ZPfp8VFB^jdhw!5LGkS|c8`@t&o
z$Whrvz9)s06Ajdh$HnJI$u+<4(i(D?>TePJ34Op1n_kfs$}BYOzVmwfKPklI=g}ai
z5gNDWQeU%JI9$B4g^ZKkdME%URn|=(`J2DLMPhivz16$!e*O#U7;8X{b|{W0HzZ?+
z4*4nici8o^m;8gA!YZoTsE92Mb=xun-BrDUOnVb#E?+u%2HD+AIWY7G1e3*O%M-@D
zaFe`uVcUqAT-nD=f^eKi)D(nN&2G@NzvJcpDEH+#Q~^V?mr;NvIDY^7XYOcNpEehC
zSY@((!M$qunB6@Zk;4oe*K;ZYgkg3)e;IjzdT;2fE*`#xm)~7ZA7AkoMuRMdxgs!>
zijzuD=51sHQKIi)$IFyExv-m*_07hIUP37FK#3!_Lx#F_^GltB6|D4La!q-$-ntFq
z7$ckur^5zv$MA3D6v;%$R~l(wax4Mbdn;G<vdki<94sR5ee&~nDlBb+O&T_C-fZG?
zE$z`!3c-ouIJcvG*QU8_FAX!cvH0xs&y^m@4R?4a$D1YW-&k$6oHG&Iz^byky~?{M
z{`l)}Er8kW%BGVxsVOOD?hP<1X$wu9UT5u(9l**BtL&nUt1M1aMOnFD3hZxtofRY|
z4~$Gqa;;-ZzN3dXvMKO>UT`gT_vynzI;5whjHTPnCr8e!Z?LAC10C6+>*6qWkx%5H
zo^oEgE^Clk$gjWdihc&vX99R^f8Dh+z`CVaX3UN20~GYbg%*{CMp$o!kH+l?`X~{3
zC|r<fAfOp=93kDPerIdg>_C%0*|>TEvyQ5&rcw_SfjOopkou5Odhvvl6`fyv7N*VO
zJE?ngl|1r!Afi@_J!rCI1~R+oBY$%7&g1B&gxydbL)vdOCZIubZ1QN|0}b~c`2eKM
z>e8?-J-><%h<N5#^V$I{(Uq_O^6caIxmc!QQ5>;*%}cM9?EMZ@lmUoExxW4mIo%*1
zhTc7v{u1m6j5cU?n@@iJE9jW&8{4yok##I;f04stT*x=JFo05*>Ig&6{x)?(yq^Cd
zx*0z|jl|duWu9zi$!SVYzZtVd4cMmb5>W^!xg;6dh#NF~S7T-V#Fc7Xx_8upy2!@i
z>;3@`yLy0!CTyhgL?qO$Z0SJ%PkY&6-eaj+LC-4w;k2Up)c2yqk`=C`dlTKokcBcd
zk#YTXdY+l|F?plC_1E&y>r)RaWRsh;>wq&WfLLS$2znCf0To4ix;|qlcAw0wj91qZ
zp!empwAqrn7y<IcO^W$Eqb=JyV|Z{_oruH-LHak6e#Y_YK+%!Zyg{0JwYECZ%Ka@x
zgoK!1_bbCWH3EC7s_aW;X9B^g9Zx8%VGNcnQ6D98W1V{cg}}b6K|);lr+KDw3h$Q*
zY>27~&}@>>XH-w{z>_LDBr?7wvL-4<8Jo~&n*8G)@7U{vtx_e)2c`s3MSwb45;lyE
zp9N+_LKurwY6YYJ@?T}08r}+FYpRCkbS`E<EQ+G6m46YfZgB6Z-R-s(yWD3`R2Bl`
znIyXu!TxN}Rf-^<GRg|?V?`O;<MJq-ttuCA{-Wlzl}&)+QciCwUJvlsGO>f;=gRQ~
z=(lfD!77vhRdRLEwwv30u8P09TOynQYN{H{)E`~hG1QSesfs{GaA3;b@7r(f=|jt0
z19ZoqX6MO-Y)VF(DD#fDZ{L>Z0bn?I#Yy*cT4qD7J`i!893#t933y_UJ+(-E6vK>!
z7Z<>cFU7f+^*A!@8FLmqNMJ<Kn+cCk;~T_W%==5KsRT$7vA^7N<tl@va;Tgt=|6Kp
z;Z5fLseu$G9lh>}6DOp{2OuDB=pUMM-pnem8&_Na9)}FK5q<2}R1Oq_C;>sp%A&q?
zb;agZbYI_rALej?`uzM?KzWm-A+9d~zO2E6`o^8q5_Ck?aSN!MF-{??0%#Fs4g-DZ
z{G)RgD(3vo!z<hO`m`QBO%~$-PzEVv3hKfQ2Xhp@$8b4h^j3DNh&CDjBWrwNL?u8~
zmqR@ws;06j?5ecV-)}RrYT2FhIFPj{6JBgZpXW-`F1R@9bU)90Bb%+&%^CT>%ojIX
z(Fl@WEeJs;=3bJMj4<e0{|;Nm?vMO=)ZGE>&(3jp2AU}zz3sPt=BN9^xN&iq@yjX<
z0+8r$CT<9tCqY{F{$dG(FT+oRf1=QO6o6uePhXDX?EsrMM~)u7@h-m!pb!7vUw6&;
zRdH+R0Ky&oqs=+4lY%S0(lI+oD?4LIclj(syD`-w-xc_>U7I!wZXy9N6BJw&8^ry-
zT<X6`OpQl7vtr=l(^j$(OO&Dt8Dh4Wxo#c`5-IYOSiFDqx!7h6?VU6~mFE-1OrJ7)
zM}K^lL}C~9izcueab_}LbT9R9Y6NVM(uej>Y}934G@DQE7(OUQ=MAy6UnCD=3V}du
zV9N_Q&Wi%%V|+uOe@>*tT$~EXz>_P6bTp!NV_T2RJ3)c*Pxvqvzmiy7`S$8<ynH{U
zcoCxW=&BUA2)RWxceir!0OoJI%S5N^&4X%&het|CZH|q}aXxt*?SRM}#GXn<RJc#=
zH>;}!4UHqMgsmNV0Y#-4d@9N?!j(l1zxJ3APCgdkMJ0BFQy=sAb^VqnRWhSf@CcQ`
zkH4&6zf=E5d3%kpSyZaY(xU_UaoMJcL`Ltyy}A7beabs|Z=Gxi6bpKqR|use%X8JQ
z)9`!ze37}vsI3%b{*>7Z%K95!0`!6%3;Bw7ydP5nhC`cW1LYb?0sfsV90lQdfWj>T
zAX+xhB82kYv{IhGszO`l*S3lR<HyhbF6zHkwpgOwKgzx!ncAPYLH)1N#1L3u8`Pq?
zTx&YPWjB|yT8dD91-J;UnDxt+PpOeT>ffvNy4s-_|3O|RItNr1Z7Zp?0%4d%1`TNc
zH+-ux!uCJJw!=)3MMFjbCldzxjY*~=rF!3o;wKIwxBYLT@a4zLecQj~BWVGUm~0ZZ
z!GB~X>y@aruNG9Y<(0;dNkbbzuI&W7U%R6zm7lQ16f^ICvL8zt6b=%@QiLH<>fT(u
zHMxp12l&tXrrG`H^F{z-%oM3SuRTUh-nJ6BVzU_JNgv2<K0fM~mbV#ocg%mT(66xB
zlD~v+pdhu~MUE2sou^BxaXfWSObPn?rSDY+T{B4edtqN^pI<i{#3IXDr{9_9KM{S6
zx{T&|1>G<@dtM==79lLdQis0I3IIK-ZWUrwvQ1*!(X?dj3ro!Xw6wVIwN%sH{%eNl
zta3A}O3x@QhV*p$TnA0sqPu63B#r~8jaN0jeQ9AeoL{e!->q|~ZSm*!i*EDo=zwn+
zHr)SN6J1o8VeL^E2d2lJRyTw5$-sy0rg_vsB=p%K4R|_St7+mmSTQ~jGJnR2#i88r
zZINazAL7B<!i#TC{r=L;!rn{=_&S98h!J@Pw7-#rOl;1Vnz8cdm+F3>51+95Z@I$8
ze}oRBLd2*mLq6U8YAgODqqeiC^|JTAyXh9X??o84_|5bY$eaA{zI=I_wGL}cZ8z-&
z#w+LCh15T<smaW4QjrEf1;a4NX-G~=avZvz?oq61Y|G`dloo5NuVpgVXSrDwU*qzd
zQycgi)3RC$Xs=-%0bOrof7kWd4tyS3Kz_8lxzQFnUyl1o{kdNJ#QV8Yp{gPXesM4a
z<c@crnE&UeRhJYT+2j2`An6o{>JJj~-7~9U==aY>kl*u5oq;q%oSYe&DI<a%RmV;V
zRqr35>lB%WT|}GVXT5u*uRRZLkOI(VqKpqiQ5kBrQ@>|gpMRd3KdlVuSZT!;-8*--
zr4N)}8S$ssvoHb$@NTFS*C}9483qRMupKf?{Z|5V{&|G_qI1a_CWuu<J|waQ%tuGF
zcx>h{L-}3J=cu!Ur$vgOQ~|Qc(o$x}xeFN=TGnCv?f&7P=W4{SvRN7ip^=GKl>#6`
zQXVyG`>oWS8lx6r{{Qy|c7jr%Lds*>6e}K_U-zraCyjrpfxNEF!cSPp7m|XmlI)1z
z^EB$0K3X#T?KiHB<^p?`ZUi)Cw&+s1l>igAL-2azC!o#0z9|0Uo5jCE#%@#-f_m?z
zI6&=s=fXCLwd3PKI;Il5^qJ~c$WrUoB9q4;<jZj_C2vCKe;&2xL8m+`Ld0(f`c|m|
zHy1j9NB(BQkMhUTj&R8YE)d5(MH%8x(Fbx)4Aq~TUzq%_`}pTV+@}sq9Y%+v5JUc*
zW<IBB&zC+(lkeaVLI4IP>j^Tg%!ctWVO`so<66RK!u!YndD|I`P~+#^UB(x0!a2&A
z=7R(3zaeuqA`g~UQ6=2Gu-EOkw&;ExzWwJ(?CQ_s9WL*tWc9bGRw-iVz565IqF^95
zPLRf1c9j9{&lsXFe_tb@%livG;wj}_C)3r(fG%~FlslFjmOknO?Dw<;?i0LEsS-p9
zPqDwByM)nZ6W_JDT%DoZBWo1hiWo{}Q~-&~e~@=^zd~@dVR%%09OzF%zG``i!-xO+
z0>T2x<fM9Gp!9VH24ASn8N7qZKQRR@FG=hCy7C=8f<g+79-Ew0ige_o0+6)4oxCMh
zi9h|b_cLG|D+Yw8gLK^k0t)of@;mZ>9lrkOcm#|S!5Wy7^d6C;)u~+livgbtobmtt
z(R&{W-v7(z9nf68|F7@<)BOSOpSJh^^3Mgo{U_+O{2Ra2_n+zg?`rsW`{#1~>z`_m
z{^v;k>z|HT|Idy6*FW`Y_5c5YuQ-?e>$f--9<pXb_VU1VLrzb@9imT{7S-SQUMjPI
z1u_Q<T74;dp8AD;Bkq%1LcUU8GQVCS+%Z7>9uminE;mvCO4lCa>ql)(@@W{<rsW|g
z=Ging*3FRx#4#sWY-cBovd7Tw4%-QB<tz}GaC`}tHiCRsee#h*B4wVmO?fBYJ@H9e
z-x%(tP(#u%KX!@8(;hF>aOg!@Dl_H?31taF<i(cT*pV_TAs8W}h@;E;t3T2xZ70;D
zWnWVZ4uZ2$?_G^x1{eotd(y{k!s%Fqb~#jS3MEBVK8o#tvL;amFf4}^-7<=@9fXVh
z)U#qSA-fyC{B|15m^Eu21c;y*x$W7zBY43eF^{h)c}dGoWiUe#LK%Z9YbxO2FyE##
z_ylzqO$OPPoN|*+<1l!;onnxN#W|VM5giq6ozTe;M{-}JnWQoyLI@ejvmeVKmw#mk
z+j%jiPZ_zAy{S3wbdo4PA{OhE&O`mDKp8h+dTL7$vR%;sMZEXB8=uxjR+2&*$m}}V
zDdeltqah#+JPiU*O+%YL5UYP;<)AyMWlU($HW}*q1e;0w^v-osj0NjoRqm=B;F#CA
z>_-TbB$)($@SD3JhjcUxW?uU;*o>1B4M$f_P91P4V{vvW75Z@o4H^+N(dA9bHzQ%J
z#t$hME5}y;+uNZznu;YWn;=|H+_hBdh%lWAffZm3CMS<3B*-RX5g4w0<6hOc6jhUY
z##P^E#37j4tJj7Hn}yACb8{2vja10Qzlb&Gr`vmvfph@@f3je}*p)|eJqDpLW{<!L
z&c4e`=Zz8mNP1*3iEOq*mA5-?f%>ubGjG`auh62!7(9HZtniH-?fq0rX>pfRZv+xA
zNzxpFSZpyw(V~g~-iZU#lIG4(E6)_;fT&;mxz!_sKiBb|gn&k*A5gveEEBh@*Fr=|
z$vG(lh)1U4WC)r*=n7CWJca1SYxv)C6hPoFH_-{mngW^3L;rKTb;iZrcou?X9xV$?
zJO(7cqxCD8MyawT6f=S@ui|i;&~}62PNMOZf$DSh<p#DC0Ylt1{F6ZoT$wzNa{T$P
zKlp4hP^hxa`!U9fpw&+RS_C(*L9vxVLZoVz=@E=WWjUbj+RX<O9(k<f^%Tb~N3G*L
z$M?}DZ3*)(nzK&?ecG#NWM4PkJ4g`AJbwP<e})}CPZ&)qk!nv?f1)zfpLJ^a34hqu
zCGw=oVdHd($rH|j@?Bv3PKrU1=-;9$d4>;ht)16sX<{}ND*94(Qpv9}2sqTN<RT*S
z(PGHr;!P44=74b>S^q0-Cg9snN=>!H6bcvD!{@tck1Ch8hHfs7OoBUDc;rRz-r`q;
z=^3jpHv{46lIyu{V&@2+DNZmZ>?XFgE!s0dl->~3&3l6f7!DXa*7n8FWMY|=Teej#
zscppB_aS?PC$Mt_`8ZA-98sT%7e!fsLMe;NLX8~7Gzy4DHd-EAG2W@lqTY*7(dXOG
z5*G8o%Rymexi@!EIB~wr)tdv5$sM4}H(=JsWvOf~77-(VJ<5}tZfp~jzLb##v33#F
z8SN{Ol?#A08pX72)!D^7wDrhG%$0M%No|0YEQ?MsA13vYi#OeobjpV>ecpB16Y);F
zd8py~lC?fQdFU>4us1u#d3ZBp%eHSq)?ORS_yC_!6(Pv}>6SzBd_|9ca^~|~9_ei6
zN5$Djj9Fmn_>!s!k;2IY03SuBv?gv?P(qVD3@=J``Nag%iRC|}Z6dVj6CMRjCVmK`
z!7XvF=s=^o`n3MSw))m-i(f1eDq~4O(4nJ>1{5-4Mu)eUIH<|khhTX<7DS3IxF_cO
zISJw6_eP&&{y&K`?Kl%`G~UB-kcRi~<T;^R4=-V5&ivR*-HIl)Hy=OV%P=La_e3*i
z@gO70%d8U7%s#52B^6l=exJwx>|r|N@kP=4c5;vfqf+SdfIXg(eUS$$ylM$}-O>}g
z1M>`^vG+j)2p8xSxTViu3r)&_cn0@D`Y5oAe_=dzaa0z_jP;BeGqzs(wCgfG^cp~@
z*~gbKOCv~Gjyn<`%DCDYo7J}hZ|KPkMj*QV=p$@J^)9|^?Z>|_Dg0L4V$`(dk>q$?
z=90P_md&H~$E7)5c2HIMpb1%YVr_Hh+lf}pj0qBus7wi2!76&zirhZ-7K}C0_57zN
z?|><sNcp(yG8cv$1!clfqwnYj?r-MeH}kSe3On>LHmu#fg~dsv_-<uYSz$zJ-O&v#
z<y2w6aM`}s^7E(<5kUY8u^mJ0CHqauy|VvArrLP@jqGktN~%M8_+uEuBTx|$7FHHQ
zdU62w%Mf|R|FUnFA%T#ocwp%LXcVNM#VST}D{ES0-?v0PZhUBYGm$-yc10npi-5`s
zdenW6uN+`;?VV-kNs$wYr_y)J!v=sTeP}EuK<JK)+gVL!=4$;q7Td`D6CD2yA}es}
zKIbrRaibR8fY<FPYiiTFH7kc=3CLx#$_Z0SpbL>bi*0Ppo->GehT8wQk90q@z*pqw
ziH@B+@6zfR4fhIFC}Tu9i-(@A6OoR%H-PXcv*|`f%(mFS-#~GGp#_&wEFhb5CeaD4
zM<|RAN);iSI$}QxvdZcQ^(6Qh6^gij)-b#Lgx{snpMr?T1`Ul2J0r$NTD4?-{Z=dr
z2vhPX34{&KhDsucic9<g=?7xK`t*7xmbT;H%w$lLmS_IzbKe@ym_iLFEAT)z46p{V
z8<x4^jSNJQ&V{@|MNeZC!bAe;R8}<Go#_-1`8XwUXml!BdnF8q%%d|qbl11JvF1~0
zSSew@c>j=<k>c$34E5-lcLF9z9Mh2fivR7^0*rYN$k3xiP1ZdjV%{;onfp=!gzWv~
zHID8U1H3I%gfhUe<JNU9*d#$Z1RZ{joJ?m}6B$OVcg0+7u@5nYHd;0@$wF=!9)Muf
zE|{M1Pd$5xtfaYTu9vK0!$n5K?L45HUSWwq`&%Yh8bdbG$R?z-1Z<pgdj7-{fD}0q
zncL^Avwia-vavz{Fi>vUrvTGo<odh6e?EUrX6dCZmtL3wdC}?%CD_yc{M#29{KaXT
z&dpg}ri-?x$K`r4h=1eb4@1B!eIE^2zka=#5I8OSaC-!^U>u7@SWL+vs(2EW!U1h-
zEb7OV6Ok0>+>_d?Z6%Z1!ema065|z~f@lbK=Y_j_-TrymN*o~e{n|Z!HbHQ(-Q;B-
zx27%^jzVTd6haGZ^7qd=>d>*Q>e5rljD<k7dR3K=oZs@HYaagIPd*OQ?5{^RWTcnM
zn6O#X>jhQ`1&yj_CjrNvU@RTL1jAj{X^E@wn9HHL$au!coVf^L+Ae|C;)8ClAxc&t
z2^Drc7g%iZM6Dy2605&MhlMuHVN-Y#3bz{wR*xZFl=}wPIoUYfc<|O@*-@f7@@+u5
z87roy@Qp=<AQGm7eTXFd0rBo9{);)dK6y3fRSpr35q9KNM2gwMX?li}XJy5>ss`>R
z2Ie+qX<c~p+i}diPCmL0J+ml#Ji|G2&TIDF_&Li-Q`3KO<;cM)M=p7+MA;d2rK`aD
zlp@9*Sn0=)-_Z9*`*!yM1Ez|R4or>7f|ORcn2({)r>aEMC1T2cN^$(+B>lnL7hU+q
zIg=wRYtkls0Y5SoU{MxApb>(UvJ<hzxyDy^oUW29cm857$BMScLS_BOs1Jl^BuWW?
zMNmarhz-+Z@weqj=YhSd2cr=V!5S*-hVVTH`YgrHWak}ZuS%i9sPc7-4t<dwH@)9k
z=rn;?=%iM#$W??L1W)le;lqi0VFvXrilS?e25%b(Y-EL{HNUG+GS|HgESyI*4#}@T
z$gI)Zbu~ZP8eC4jBS5Ue;q?|F1MOJuE0>o|V+sgv9GNWnlcM$)j+3-5U+v5GhSyR2
z%BK+Vv+P$xnv&4yr8ACe=#-(EeC>q_uqd=obg#0mwrq<u58T$KO&d|R%fPH%aq>N(
zAc$=-qzUovKXc}cl#x)5q&ZLVDMOSm4I>no-jE@$_ioe~TKiViejNGzJZwtQA?7hC
zj@<4FY>7yZr6pQfIa7Qqh4=y_Ux6E`)lYM0;x_0S5_FwKwjBGZw3@P1PkK<=#zi34
zautO;V9W{g(ry>?e$XB#rY87z0W3w4y<$<6aF^6)o)Sx@kvPOMQOmG$oG`+oN{GBT
znUj$NHotUBm*LaE1A}<nMAsxMF+pBbiprwo>7?Fu4&N`b_k=TtZ{8Kck^`gNCiO!$
zZ@ZIqo9bm}m@a2K-!=Yh$X~y;6hM~m0g`~Fb?3iX!^d|DyvGBo&y*_C$;*Zh9#Gom
z(lF(r@yoe%4!2Q>R5BZ}Bp<o%z-4!5Aqyvy%6329qCLw`a``t*dvHrV+p=tzFOzdY
z^sfkRAb^dFsz;hL$|y=zs|y>5-4vuQ)2jcX9pGTwppO@LPG%Jtw(x(N{0;<ic@w$a
zenNHo#&Kk!8(a=c#b-fy{Q)q@GBV+9pUAEp_9#l61Tl$@dq#McE0bNc;2<eH_Kxsp
zbTme~Yui!@;y81I{m=^vQh0B|&NPzrW6(7aLdnE>^{3D8oXBH|<MNFSDKpqgV6~a%
z-wJ&}w&!A%90Rq~`uPJaYUmSpP8!yS@fZ@UF8)%SF<CWQR@MBS<RAXd)teG^$t<VE
zNs(|i1*nj$vs*^8h)@~VuJz3+Hr1mGbgF;);6e4R$!)VH%NS-}cV=z^oW}f^+m^ax
z@f$j=p&>PHoP6<&mrhx_lA9&a<-+J9^pQg*7abV<2|Rr+N+_LBjqOEQ@hCj3E{p?I
ztRYtJY&@uk<U$OPm|&+8DgRm1tiO%X>K&Z29lC2@x6JD=?#u);&jf|mOc@XXmlWOI
zBs|oKc<UZ@@K^21O37V|1v|C>O%BV|!+FxZF@_{n7c<a%sR*VEd_Vw!$PIfq^Xisj
zSx#wgb4--xXcKo7u>4>jXG{2gr*<7ARKkAFpac|`=7{9dvqU)NZ!89+@KG&Iw;t0E
z8IcB|x?&a@JZmapog@nqy}6BrjC!CrHFXcq6n}{_=lu$Z!#5i{Uu>OqTeNBfEGkC^
zbJAw~sgC6uE@^M_H@7?%kZWIn?c}qS*J{swR~SVi%bjCu$?Mb?ioA8S?$oxfGq<vO
zh$bkf-9_>(G7!bGWzX7vu48+Y(KWm;vyT-5rzig6-$2+C<a5T+`Ak}0InhBoie6Cj
z&}UQOb<jU&A3MY0Z@cW=<P6lnc(?Xb3c<QFP*-~1#Ah;8O53g}%UsT%_dPv7O(|P?
ziJOx$@Fh^b>{CoR-D%{yGkR}&?H8SG_Xox6z@=zjgU~MMswWk>n6J~;A<x-I=Me+E
zCY~2i>wz(^4Ifien&^J1%x0N^^C0EOc!BJ0zi7UbTz4S0{EUCGPT$u1?tiRfc}C$q
zv2#DYm<L5BIbZbYv&Q^tv|Fb6?|CM-AC2yl^ZlM|x8Pcb0xxB>-qp+j+eS^=Vp847
z`>z*=+kVk|@W(+f-~Q#E@xisB4fls?Ioypu?sTYec<ITyXKSu4d-?P3d#il+-2Tl_
z$ppsA_|ge7+0$iJ;oj}`tJ9sr0UhH!o4~I{0itDpC$D09`bYby9q*joPTwHLH$yg8
zbE9%N>|0hctx+32vN7GMnQdh=eZHk%U9scVa)Y$Y%v{v)OYX*j#LJ$F%`2x(3n7-<
z***w8L8W_^qJG-vI=ZHC`zc2+&mYiwVsp;hiaE_jTZgkL9fjOI_XXCkhT)Pk4X9D6
zLZ^hE#KT;#mUDDM6aXzh%QpfI+%6oYrzG_rU4&KNb*Fpdw7oCIp8kuz=)v;0SOWSq
z*5<1sP?{Ic^G)RLM{y?#7dE_kW4olJj|Crcb1%2)pBV2_Al+@JrM37NEc_a){Ju#)
zp2pk#@hhc;h3vcEyb|%d_!)1=*QOUllx=?b!B;Y~Lcg(DE=T23v+WTCwp|4Zdu`*F
z?`9<TpQWfxZe%7Ymg}qp>$;MWk;AXm)}3!Y`+c?dPnYM5{&34H;>hB}o#viar>|^D
zY&UyC-74qo4c2czS~t#SIFO}lu)|S2ayI09*4En?=^6CumHOcL)q^&&>A=SQ7A==n
z&z^jpE3eNPw*Or=%x@^R_WbaZR!4Sa_7V^GX}}*_R+1a0dPl|{n4Nq=cKp0-q!EmJ
zG;)n^y!CX^-EcP*?cxUlk@U4liQ7Ct-(%0!Yu9el_4QGnCi;nUT6BtfNL%B`hJP@9
zzllQ%hTNgRio3d&j_>I0!M%Hrmt0UF<|3qvO8b$xA$9E7QOp|YJ{B4+N~K&lIOrSd
z6s9}>eBjMzTxr|tgdwFCEpPorou;|5zVz9fi(xv4Kb<pW${*6;$##mXS;7%vSlwZX
zU3#}j8mft8$botJUD!WysKBW-^LEl6#!ciM!nBw98@~+Nxihw=W<^{+!+v`<Wd+Kf
z;{~HV!2pqWmp}MT-y&V{e@evQp5<?Os&AbQi=z(Cm=_L-_5_vopaT3VjwH>yJG&yk
zcEV6qO_py{&OD<`;2TK<c=81`G*Ovs{^k3aQ<i?Amm;`ivnjLt7l_e#-Y{<iBM;5-
z2@^JB%b!gPQPj@z*BznfX+%4v(x<L;9%3_@Cxzthc#I$$%W5aPrLAJz^KX;;WV1KI
zx0A7xt@Cin1Ts}3DR{o3xB5Z&^YiyG#;k{=>SGP(&_Xfgy7|QZUPrbczWLxlK`IeX
zp(JB?y43!L%H8hxv(puQtxCyeX^URE*-Uw{F)uVU^s5HuXmY$JyfUpT+)pzVa;Kl3
zUV3b$dG}O6B4Y5PZN26{yK^!tcFCYB@>Nm~t<mY;(z<i%uAbunCams$RuMO;&gM2u
z^>JqUm9pbmz>+-nab28h6P9feaGt##R;Amko;`c!n={~cYb~u=C1#0{HJ!ZCO>Vz5
zcV(k~Xb_eZpUd=$v^p`dI**6K3)md@qj<YhlTxj69rN9HKgKvD#~viON7W})XfJO!
z&NqI1LabqNmHsdr<<zO$Xj5zHizezHe%_{INAHX+ZnGT^C60Xi{Aap=Udq*UP+g{7
zY6&RILp=Nw-h~#8Ki9)wZ19<|^qfbHbv-qMpcE-q4~4pMMXvpL>8armdD3|B^FJN4
z!t}Jt26Fzv0^RUI&*bdupSf|XBezn<%|xXrU;*Nv(`(-(6ghlgZ1VxLnTxgV#NC~`
zx`zivme!}UKek@6w7KGZQ{4B((oG1K59%gEmX1zN`hy2gr+FTfl|)JX#g|_$>1p()
zzP=pYp$*KjRK_Hi>3|OHHpLr>H8d4&f~Za@nMC6=xpA@0rw~gCi%LsNhkLFo3xANE
z?-9uX-%(|k@cmELXUy&tm)FX-lca8GWyRDZU59iU;sn94Aifvq9g=C6mzKXQ%&ebg
z<TgqrrjZP5>@YhYJ7A5@kTsgtgN$9h($5bpoINI5UlwTA&;?IaeWI!TG~`~d-~lO~
z!AGx^Jbqku=;U3MLKJ^66)TM1nO~&-gAV&G+Q$h)Xe^t4#-$8?8RYG3uzW<*>tbP`
zBOYGqis#*R__LW&_iFXKf|Z9;`aR9<uoZjC8=TqNuU2J^nLK%`ka*;k<`c=^ogT5Z
z{@Zr#T!?jJ?@Y_yaCv@G%(FBr!}b$Hf6+>G8*aDd!u+G+*d%`6s1}~)COOg52p+rQ
zjc3tglAkad0Fk;>98XZtTM^+#beR@uaMo-{^3gLZnzQo_diS0}BY&}?5RgL%o~N(e
zocb7?)f;AD5LdXaYUX+H=RfG#<>ra9%T-}RFzZc=3rnC<?}bbY=!U57htNlxRTHMS
zd&7*abl=0;J@YIpE1OnH^`$p-=<TYZus~xo!<<%D8n4mx+7z0FGU$KBw{ADQoF=;f
zD*#uws~wr)1enOk?=yI;vdZ3Bw+!)nGW)g%OS49`#^my~Ph05r`x^fc#ZxA|E!Js%
z{^T{o^nK5lQ1!}lyx`&!g2@$y4}F#((3Wnh-{OMrM$Ac=xv@u8;p^?YcEve|`%GgC
zzySLS90tYs7%E7XYB`oTun(-hI#p-VY+u;c6sOYUs?!sR8~P{j(Is55j$K|d{6!G;
z;r5Mp7F((k>*`qzD-QRaD{;RTU!U^Aq(m;@6sJOP;tKNSibb34#w=PCD_&y9j+t;{
z7BXfax|8UOFMcDO_r{{q#l=On$v3JW6A7yqGy<U>ET}pd^e{8(Y4*rw)*m|7pIXx>
z<+QLyrYzH_#|m6_Ljob$)edF3rT{a7)BD!vgwsWxav$kzZyy#j;A-zZU01~oXc&Gq
z)AYWw>h+q((@x?*DS{PM&7U7?in(jw=FU+@DZQ^2RXhblIPP?{lTxI+3{sc;bf6{v
zTzTIuPH~jmVq|0~*<Ddap*)NcTX@mAHMSs%)Tx4-GC*@`#QJOt&udv(tKM{Ica&fo
z_rDxra<M3!y|NJjZ--Rs_Qcy+2nV=_(u`NnT(|CA%$0s&LfvB6VyC-~TIJM;D%1{f
zzxdPmR`9s4I$b`=Gaw`+3P@~nOiYYe+Yopgbap*=n+-O$<!YL`MWd0)%uUkGk}(R3
z$p<f9>+h@SWP7^&=SNMo6{ksOE*J~3mS*4NaI-^vAW+Ghi=O`U)BlN*QO3y`B@A^9
z+UiRiNb&COeK2oh=o$^1R2rOPTt%g&zqYBkvY#H&alcW@k;{83o)V5XPw20+_W6&}
z=SI(`@Cd};BI8UL`6<^^9m7Y}j{8UT^T&kFm@hzy&RZO7hrgvwc1t$&H+{9kZqU~^
zW2{nR_XGwms4vX6`F<9Cs(}b7ZX9N(Ul`Ns+qX_ND%(#rIH*Ca{y4=~($f0dXFw#p
zJZlpoy?=QrW4^ep6SdR5`V1d#$1;nrvOJyanLqJ}d&Tt1Tb<D7FqwPhdh)fmf!{O5
zw1VKUdYGZVD?|Zp-<DT^Xy!}Cazm$co^Qa=CIOJn%S*-9&9U=_J<62^XUB##g~o2K
zjT-?$l_%62b1i}&i&xD#ir3;sK}cv(WzBe=H)TxXqU3}ZskO162LJW6V4;QF{|S?~
zR{gl^!KSRKvrj!to{{7p38?I&ysX?v-@YwA-u2|y)+rrco$}bz!l55941W2#1##x`
z>K~xVlT2`9E0`=#p{|mdiW|%jSQCC&#FX2o_~fFzE3rP{O*y3@=Dn9k0yoao)kf8u
z9@+UDptTjUiQ#hRNXDt=oot@Hs>m_V3~Pyu`PDC87;R-ySeI9;dg?{*^Qg!?(}WH<
z#SL2&?#q(;!t#gIBI+|0HbRf|d$u~Ib2;kQ=!>zf#m6EKY-5n;-3J#p9Ez=|LG&Hz
z7F?2P>RVsq61TD}+;~;|94FxW$ruadTUnSo)~*Je4m}r{f3m`Le{_<^l4F;j`rLn_
z-8;Qa_<nu;H>Tr*U(R03@S)62C}Joz1u%|@2#X!GzDLz8XN)c=0v+moQ4b#-cHXzS
z>G7);0;l(TXQffWbxZ`XZ{p*R)&2IvtG}p_%=;NxELasx6EX8Rt0NuC&m25xh<(jp
zUApNR7-YmQd8JpbJGfs#BCcu0rw~eh)v*2Wt+}s;t0L&l49LkC9?wmB*3i(<Ih82(
zRs8hTiAPP_?YV>6_p<eQ^45Vr{`hlY^6IdqAEWvark*$JsjSbK)3gtt&s(Ow_#@0A
zpe2~&_{)>2lquF!FPmS{&n_%K!{o=z<CxJs%RV&$XO`V%Ukr})-8x~Kld3Rzoujj}
zd<04(8;Y%qP4rf0Y##4<(R!G0Q`9b<D@pA;PQUU5S{zvA;;j0li(6^I!ryNne+J_-
ztuniAlrrqKmd%YNCyShdU(YK^J{9^=1|ZH%Ife?jtTIWl&?<{x&B5q_r(9GX`UBT@
zJb7gp{!0AJWh_DcA3%)t<B^y0#B964y!~-z(w8ZE1(|>0b{w;H(AN&&RjYOtY(|~4
z^5kVSVhi)G@hE%V1$kZ?J10HDA9QNi)8^}=HEoyd?(2Qdl!5nb$Pu+pO#h?>K<Ad?
z++6Do`Gtwj{xL^&gOz5tpz>p0W?nnQxS-9)KWAo|TkX1h^_rE9<U!FE>FSPAnXo*-
zwE9vT?X9tkIN(D2ZeHo`o)9ysk|L(ROccZu)=ul5s~`#rBfOz8J3ITJ%8a8X3tDY!
z-RkOf+oifXjXnc1ocAn5_4#M{#BbS-x6s+iX{Wt2k~mCvdMlC4ALmBTFCSrCm^7%s
zy)khJI+^yxIOp4OYZGqgw+X-4<;>Y-Rn05wV3X@__qUqrgigL)CszofokKmMsOE9_
zl=3~!d-<<dI$S(x=lCuZ{HyI76<i9iHBQGTcVRf|sX_5`K-y%jS+d19t1jQNFnbem
zqi^rt*C_{_>UQtj*S+y&c=HN7a}vR^2?@6-Goc@!hIud+Yh@8C8?38M>cG(OS-JaQ
zD0u40f+^lOlUQ}W1ZuiS>&}Eij2nX8kZB%*9JzJIo}LLQcbBbeiUi9bC3cW`lpzQE
zZX<bt-QBF|pgVYDNlA$;_gmQ@rSG2pBI1y>&0+vS|2H#A^;0j58eACWP^%nWs4`ZU
zggVa}mV%U^7L$Pkj3~+8`fZ~gxrk~VphPzPh`9f&hLN?oR>1Fr3M2#<H3_($xBf8`
zde$Rm^g1}>=!JRVP=D$3k9Me}JPR%>zE&G2%J~gGl<|kEKs*bA?kpn#0+{tL;Ln@1
zyGVrF@`awvqj_$%t(w(&i6nkmuX!>H^x#nIrLyiSc!=%JUjI*X=O0(|y~pu$@9dgw
zGSVL9M>aa7n8{UsMD31LvXYLpagxX=$BeL)@?(AtJ+$IlB;0<SlrH91KUSMtR1!;y
z@@s9@4<(X`RrmQmC*|J9{=9$O9_JyQI^Xj>-|y$g`~7;o-mmw>&Eq4J=PqO0vU)&m
zU>_sz0Hra`BR9sd44!)CWw8Ww)F>&a7d?x_dYc@Alf4Gie*co|WLEi-z!!J6)8{UG
zxKG(XeP*f*kJ{Cx&p*B{eLK*?;!*cFt-8Tohp~4vGt#PugqHWh3LXD!<GPFtTO2OS
zk_X~aA${Vb(^lqL^hX>^54SzqM&uR&P5l8iZ`RK!hpL<E+gMN4O7f-QQUNn_$tT@x
zII|#_l@x2g=#hv)C13m$gs#@wvIilGIbkYcQGBW$7{rk$*bM~?UeKFZ9zNUI)zv1X
zg`Kzt9NXR5c^e^;#BNO8=!v)c6TKMGqLG2&Xwhn*4t_X!Pk3W#R;x<9T8ZJr#z?#X
ze(@dS&ma~{8LLsFq8LF4-lPa+Wf2a)@vZnl_o<ysK8FXox8^pHp+AMX4$o-|s{*^8
z^V^>S8bD}SM8sw_Iz+bP5OPe1bQ13c^`Uv0wVqCdwl)zv1Dem<wI1Y}IpwJAoITsl
zzDJD8dyl#yJ1N9QohIL#C}h{~rL>Hi@bu1xOKZENS7ci69zq@{FgIhTAjEF;Xcg?F
z!+7u(S_2)ZQy0tJrUByOew1~ey7-(3<is*MhZlUFdIclT$}tlr-1J*tK=wf=+{zUL
zbA*T$S|>5lk*GyjG*!h>Hxplv(m=hjSGK9QYlrqV%!FC#31@l|twE)zLi}m*rWsHU
z^(T-&-guE*Hv&8mgyo_aBJWEE@UXPA^A?*Zl*i%>*41>iy}<pffuGEinFUtE3kU0r
zqa-nbOa4gqBG&%Mn3$tdeJ7><pdlhvpeW*criKL@oOSO`!XsDUt>QOEWW)-S5BFM1
z3DIM_wnn>}^3x*UNMvwWuoCm_8gRauk=^y*M}#(iRanbZ`B(}klf1=Zh(w6IYok!`
zd<KcD<Xxq_4Lcy6;;PaBHdI{LL|G@KAg3hld8LIO$MFE|h>VJgV^b7ek(73-g^u1o
zzVvDnpTbAlOM*v6dSvW<tWAjdaX{PCDfA@~BT6=K5tWILCq5#3ze%Vi3okI+dZspJ
zRAo@F{HA{la}s>^F%>~)SM1{i4-YI1+-co1b5lg$bR{c{IZ2^U&ORYoAeR1W-G{j~
zCoeBAHza27UM<Tm{U9eh#C-`8TEjsCi@FYtV{ewuyKODE+o}=c2T@=iSUP7_^{H}A
zvG8-0*a{i!Sn*o-?gBLw{>x@ninkKYi<k5PLqVXGwjE#uU{szfSDq50fy7{zs&4DH
zZO7Bch7gPjbtJwP>;W9iU91qr27XQb*(pLyF$qOr$?jo^z{P!;RW|rp-6J#mRjE6E
zIlDGoomydtX%QiITtY%EnMg5^7`;Q4PB~eI1k%Y#mL5?>cMSj$rKy(EXh=XXpCjo0
zvGX!q2-!?Z%t@ePFJHd=d1_A+6D`zM-;#wwtM8|v0RxjPM3hjBaYQ^dB$YDo9#cgw
zGWqzQ%8&VOR);jHDi@kdShj7iB@3r})0~C+-0l9u<7I2Qv^&8m6v_;=z7Ukl%a$@M
zxww%Yvvck~%T<5U+JM13Y^_0RI!x(~0d}`Og)#V1^Mm~&sq`aU=>H^|d0U^=)@qe{
zVC-@SpjZQ&JyY~zG7%&$(PWc6h^eBK&z7p!6#BTVy7fgu$&ANJ>J4TDp^%G;h?w`=
z?9k9q8L^`I*%j>OjvbgdODvc_zY2?E4<a_|ZBk^9OiJ5kg5ilowcuh48}nL9g(a3Y
zTL_4JPka0M#mlOh?pg;TRNiztt7RJ6*&rCHg1X|Y(9r0<^Jx)ykd7n8^_ewzc8TFH
z1wD|h=BHi1XpG#mCjtMeSxep@5zcPF1eC7a)eN1Xr;C8slda=v1yOTwv+41qk9u);
zgv76xs?Dot{Og_!ErLoE7liA<cEg7k`km~uP$Ue{na7)34P9*y!uOceJ+Du9C83Hr
zSBv|DN#Be6k3rpI!CO}MHG2W{_s5w}@`fA``@Z;GAy9=J4cIp#97x$A+%$#b=arTQ
z7Oj#%7d2`@h_KrtloL#Vd3z)ClHj7^#D!_u%I9Z;TGHyJHH##x=z-Fs#vEmViBYd@
zUO@Ta1s`winL~NwA7mT`ei?tCAWx?P7nDKkKfa*9f90TX+j%FPMQMN>Bp%lCvfs%b
zP7=Y=Y)+FlZvFU)=NZo_Nju9Pf$U1ZSzYtCLA?j3c_|ccUUZuDfycX7VnD{(8@)39
zj?H@ft6iHHx`dNosL7ZFw7}}|2fqWzkfkUv|Ki1OuP4ao!rDR%%J4|pUH)s7r}_#P
z(&yWSbQ30lypa*_#S&J66x^+TP1MLuEBsSV7&V79ve#)5Y8#mv>Vj^MsUtl)jCCul
zkP?fWvv2gqDcg;wYb$A*HHiguCJldq<^!p?X84JZItl7JfHc$YrnY7E4uPKHbuTW|
zRJLM<SW6{!uEP}(CJ!6t!cZ?}VNy{eN{{!P`X-T7nJp$@I#<21hxW`fXOx#B>t@f8
z=3rv}La1a;IXyvtB4Z<{>fmOVrLRdwFTSH}VV2Ml!p7w1yCb=l<sLPK1&|A$-TGwD
zbXzioQeZAIdy(>XjM+ShH38M}9{P~=x(Yd^b&TzU?UX}G=K{P_pCKQU!vDkbTsS82
zvyl2oj4b#JTm<WoI<Tm!rJ>KGxd3ta$I-=BBcR8V%G(k06t|)VZL>&zL9QqCk6;;C
z%Oas>Q0)fNZ+&{+?wea%h8cO5qQ?_*o7k5IJd5O_k|ZofwIu>zG(+NczPvKQ+?(&>
zKa1;;3o_=b><qG<N?yp;$pWz7G`QgdE{LmItu{#gwFcCN%^`^m_8B-3#j>%m1=4^9
z&_iM+DzV|ol0jy*<XU2Wi#MW6Xb<V_Nr){kfH<`rJJt)fN5F36uW?ep&TZ&pHGhjR
zyU)^$rknWtF5)Q_X~#*(98rKcPn|mIT0V@_P3Y{PpLb0GqAIj#89Td>s$Qu|6~iRa
zn6WybU)aesIfywL`^Pk~k7s?mSzCc-8o)cMwJ9}pD1fQ}@SHVkd&!~6Z%lY}VURQg
zIM?Odn&fe*C-iyaDYp($uEH@wz54E}R75yi+;X#{q`gOb<sE<8FknZAAYNfFL>^bl
z%n1mVb*@2Il=E2}sEmYCERQ0zOK30482nN}iWK7k{5eO6&YIDqe+SQJx7%90fq@e_
zg4F*0(MttRtXkLAv|&Yi@iAv*{(X7z3rbY?d9o#~P`SCi<K(eI2SeO>`@^_)pJYYi
zV1s^Rq{fUQ+bJY_QgWN{p%8j*uhI&__&nqP$y_u2fxJ41CHv7xf?ijenK?#?nupE#
z=DD8ou8FVHA!0`*8*>i_JuSHJm%2{f`_buEd6l4%rrW#l$M!1sM_E0H#;U$|)V)L4
ziI>rSlBC>+RKZLz7W3?|Br~NlXUu!exApsKY)z+jI%zB8&*Z$)CQ#1dY0*rpq5Im&
zNKI!6qW$PA=+toPof^U9P=1y3)7>rdwyQs8?~`YoV0BK_pQ$u>ag+Cb&;c5H7>dvX
z^uf}Jad}u@mnEHyNkk*eSO6|yz^lIo3dO=sR=UJdnm<&6m0sGgOWrN<b&;I(`;FB~
z|AZoJ8<uzSE54F;fmGy`r$0Tgwv>K`gh%>w$(|w94f7K2>P^S?vpQ|sp@@s^gLGxO
z)SwL|1%Ng%)zyK9B?Qpc6N(^0SKtFdu76&k81q#JTbt;ICPvdQlwd-CU&4k+ubte7
zzTp$@wci>0$(<v%#03oY>X~J}I(4n+2leL|dtBNlkgx1Ay9v)DqOi?_^n<|Yr3*Pq
zVrs+_SJdPdT@TA0yVH#|^$LaZ!&##xaAxs|F_62iP<Vvx$(eU-^|;q`KQRvc$=#&E
zxvn*|wA_^$y!o(h%w89v7J+2@$!4bv-PAEf-MRyh2%Y7?DPCu#e>8WJ9Mcx-gUF5g
z38C?MX^58~Cy}b`i2286iT%KfQ6VqNHu)~5qcrZ$L}F$YP>Rl**xdX=@m={F{nuyf
z|MUD`JkR%8TOU4IArs-hzRWxCP3LdIEdH=&?tlJln8hVM<wd~S*iI?O_G%8&*W$i7
zpR$z*?WBJc^=B&hIBHF_1UALRK5{=5Vb1m)ON_mVNK42#BM%&SQ0*qaZJQ-t-b6~t
zZLJU5uE7i|okWg`+`zWI<TP=Z`fX-t^*eGn`BD{Fre+*sF{GQ(BoYnjbQByi)Kpdj
zk4;EBq<f+iXgD}*dKkn`A+1$N334{H*Zn&8#v@&KC~jXq>k2T9D{J$G)x$X=YoGLu
z6>Z7)Rs2<0jU^Ka1Xo(JN5#ZwGU`bjc$HYoq}81&w>5rOcd*3y9YaQuFNJYJLP({K
zMRU;wNEVp}pwMECEmCmP?NU#8rb!rCwtMk2I>`(=^0U7X$M|}JW63^eYpm2AzTvHo
zIj%UR95O_@rHElUTg}XSErN|9@MQMLcK7F3Ucoptky(E8s^OO9%a=C+Y~6OLlDba9
zkEJzzKyE$RpS~<gRUO$~Pu*$swz~+(oTp98)cRxTv#a^7#_7Epm%Y+BeS7nnb@DLJ
z4X(%zuK4Uk<1vKu5@v(e&@tI6KjQ)VJ)(w1lsRF>T3EPI&lqZ!bY78ubh9m%W^@D`
zQV3@7Je3n?(>~g;TEJ8Y3ya|I%%)-H1W}c)^?$SfI1d7el!j)Kv=WL5mRf8g8}r8C
z_^j6d{Q)_D5AUmc%)5BLyF8|hyjS^`dzG7__lHZWhKhDXV-RVU;Bin79_Yet3<v6O
zmFc|T3A)Ze@?&hx8{N7IKq19S{pk)Qf)~~glBGkSwyv9HuvEB<!q7=V^cQ9e<oFVu
zZXTpxvsK+Y@McBAMyf4{?g(U}%*1sbxax>NM_+$^7+X#B#kZ~+b3#Fsj&U~#jIy+i
z%0*;GNG9CBEb7s%+u3Skk+4S(zfl<6+jpX7t~|Dl9ANXlJla3KX5~aGu~NYd@%V3x
zKdL{IB)bs>g{@QD9glRMxc#5m!*k1vd%paP{{OMVItN$hZ<F5nKX^8_t!qk{<pv{7
S-(op4r^z2pI{LS{>;Db4M#d%p

literal 0
HcmV?d00001

diff --git a/docs/source/dev/dockerfile/dockerfile.rst b/docs/source/dev/dockerfile/dockerfile.rst
new file mode 100644
index 0000000000000..a07463392dbe8
--- /dev/null
+++ b/docs/source/dev/dockerfile/dockerfile.rst
@@ -0,0 +1,50 @@
+Dockerfile
+====================
+
+See `here <https://github.com/vllm-project/vllm/blob/main/Dockerfile>`_ for the main Dockerfile to construct 
+the image for running an OpenAI compatible server with vLLM.
+
+-  Below is a visual representation of the multi-stage Dockerfile. The build graph contains the following nodes:
+
+   - All build stages
+   - The default build target (highlighted in grey)
+   - External images (with dashed borders)
+   
+   The edges of the build graph represent:
+   
+   - FROM ... dependencies (with a solid line and a full arrow head)
+   - COPY --from=... dependencies (with a dashed line and an empty arrow head)
+   - RUN --mount=(.*)from=... dependencies (with a dotted line and an empty diamond arrow head)
+
+   .. figure:: ../../assets/dev/dockerfile-stages-dependency.png
+      :alt: Build graph of the multi-stage vLLM Dockerfile
+      :width: 100%
+      :align: center
+
+   Made using: https://github.com/patrickhoefler/dockerfilegraph
+
+   Commands to regenerate the build graph (make sure to run them **from the root directory of the vLLM repository**, where the Dockerfile is located):
+
+   .. code:: bash
+
+      dockerfilegraph -o png --legend --dpi 200 --max-label-length 50 --filename Dockerfile
+
+   or in case you want to run it directly with the docker image:
+   
+   .. code:: bash
+
+      docker run \
+         --rm \
+         --user "$(id -u):$(id -g)" \
+         --workdir /workspace \
+         --volume "$(pwd)":/workspace \
+         ghcr.io/patrickhoefler/dockerfilegraph:alpine \
+         --output png \
+         --dpi 200 \
+         --max-label-length 50 \
+         --filename Dockerfile \
+         --legend
+
+   (To run it for a different file, pass a different argument to the ``--filename`` flag.)
+
+   
\ No newline at end of file
diff --git a/docs/source/index.rst b/docs/source/index.rst
index e8daa5f052754..e0269987ec5d8 100644
--- a/docs/source/index.rst
+++ b/docs/source/index.rst
@@ -102,6 +102,7 @@ Documentation
    dev/sampling_params
    dev/engine/engine_index
    dev/kernel/paged_attention
+   dev/dockerfile/dockerfile
 
 Indices and tables
 ==================

From 111815d482ba2b724541994da12736615101ef5e Mon Sep 17 00:00:00 2001
From: Robert Shaw
 <114415538+robertgshaw2-neuralmagic@users.noreply.github.com>
Date: Tue, 30 Apr 2024 17:46:12 -0400
Subject: [PATCH 03/65] [Kernel] Support Fp8 Checkpoints (Dynamic + Static)
 (#4332)

Co-authored-by: Philipp Moritz <pcmoritz@gmail.com>
Co-authored-by: Woosuk Kwon <woosuk.kwon@berkeley.edu>
Co-authored-by: mgoin <michael@neuralmagic.com>
Co-authored-by: Tyler Michael Smith <tyler@neuralmagic.com>
Co-authored-by: Cody Yu <hao.yu.cody@gmail.com>
---
 tests/models/test_fp8.py                      |  90 ++++++++
 vllm/model_executor/layers/linear.py          |  58 ++++-
 .../model_executor/layers/quantization/fp8.py | 199 +++++++++++++++---
 3 files changed, 307 insertions(+), 40 deletions(-)
 create mode 100644 tests/models/test_fp8.py

diff --git a/tests/models/test_fp8.py b/tests/models/test_fp8.py
new file mode 100644
index 0000000000000..e87a1783a83f1
--- /dev/null
+++ b/tests/models/test_fp8.py
@@ -0,0 +1,90 @@
+# flake8: noqa
+"""Tests fp8 models against ground truth generation
+Note: these tests will only pass on L4 GPU.
+"""
+import os
+
+import pytest
+import torch
+from transformers import AutoTokenizer
+
+from vllm import LLM, SamplingParams
+from vllm.model_executor.layers.quantization import QUANTIZATION_METHODS
+
+os.environ["TOKENIZERS_PARALLELISM"] = "true"
+
+MAX_MODEL_LEN = 1024
+
+MODELS = [
+    "nm-testing/Meta-Llama-3-8B-Instruct-FP8",
+    "meta-llama/Meta-Llama-3-8B-Instruct",
+]
+
+EXPECTED_STRS_MAP = {
+    "nm-testing/Meta-Llama-3-8B-Instruct-FP8": [
+        'LLaMA is a high-throughput and memory-efficient inference and serving engine for Large Language Models (',
+        'Here are the major milestones in the development of artificial intelligence (AI) from 1950 to ',
+        'Artificial intelligence (AI) and human intelligence (HI) differ significantly in how they process information.',
+        'A neural network is a complex system modeled after the human brain, composed of interconnected nodes or "ne',
+        'Zeta-5, a highly advanced robot designed for menial labor, whirred and beep',
+        'The COVID-19 pandemic has had a profound impact on global economic structures and future business models. Here',
+        'The Mona Lisa, painted by Leonardo da Vinci in the early 16th century, is one of',
+        'Here are the translations:\n\n**Japanese:** (Haya tori, nemuri nemuri)\n\n**'
+    ],
+    "meta-llama/Meta-Llama-3-8B-Instruct": [
+        'LLM (Large Language Model) is a type of artificial intelligence (AI) model that is trained',
+        'Here are the major milestones in the development of artificial intelligence (AI) from 1950 to ',
+        'Artificial intelligence (AI) and human intelligence (HI) differ significantly in how they process information.',
+        'A neural network is a complex system modeled after the human brain, composed of interconnected nodes or "ne',
+        'In the year 2154, the robotics lab at NeuroSpark Industries was on the cusp of',
+        'The COVID-19 pandemic has had a profound impact on global economic structures and future business models. The',
+        'The Mona Lisa, painted by Leonardo da Vinci in the early 16th century, is one of',
+        'Here are the translations:\n\n**Japanese:** (Haya aki wa mushi o tsukamu'
+    ],
+}
+
+capability = torch.cuda.get_device_capability()
+capability = capability[0] * 10 + capability[1]
+fp8_not_supported = (capability <
+                     QUANTIZATION_METHODS["fp8"].get_min_capability())
+
+
+@pytest.mark.skipif(fp8_not_supported,
+                    reason="fp8 is not supported on this GPU type.")
+@pytest.mark.parametrize("model_name", MODELS)
+def test_models(
+    example_prompts,
+    model_name,
+) -> None:
+    model = LLM(model=model_name,
+                max_model_len=MAX_MODEL_LEN,
+                enforce_eager=True,
+                quantization="fp8")
+
+    tokenizer = AutoTokenizer.from_pretrained(model_name)
+    formatted_prompts = [
+        tokenizer.apply_chat_template([{
+            "role": "user",
+            "content": prompt
+        }],
+                                      tokenize=False,
+                                      add_generation_prompt=True)
+        for prompt in example_prompts
+    ]
+
+    params = SamplingParams(max_tokens=20, temperature=0)
+    generations = []
+    # Note: these need to be run 1 at a time due to numerical precision,
+    # since the expected strs were generated this way.
+    for prompt in formatted_prompts:
+        outputs = model.generate(prompt, params)
+        generations.append(outputs[0].outputs[0].text)
+    del model
+
+    print(generations)
+    expected_strs = EXPECTED_STRS_MAP[model_name]
+    for i in range(len(example_prompts)):
+        generated_str = generations[i]
+        expected_str = expected_strs[i]
+        assert expected_str == generated_str, (
+            f"Test{i}:\nExpected: {expected_str!r}\nvLLM: {generated_str!r}")
diff --git a/vllm/model_executor/layers/linear.py b/vllm/model_executor/layers/linear.py
index 4d43ed4c5f14a..289b317cc991e 100644
--- a/vllm/model_executor/layers/linear.py
+++ b/vllm/model_executor/layers/linear.py
@@ -246,6 +246,10 @@ def __init__(
             self.register_parameter("bias", None)
 
     def weight_loader(self, param: Parameter, loaded_weight: torch.Tensor):
+        # Special case for Fp8 scales.
+        fp8_scales_shard_indexer = getattr(param, "fp8_scales_shard_indexer",
+                                           None)
+
         tp_rank = get_tensor_model_parallel_rank()
         output_dim = getattr(param, "output_dim", None)
         param_data = param.data
@@ -254,6 +258,12 @@ def weight_loader(self, param: Parameter, loaded_weight: torch.Tensor):
             start_idx = tp_rank * shard_size
             loaded_weight = loaded_weight.narrow(output_dim, start_idx,
                                                  shard_size)
+        # Special case for Fp8 scales.
+        elif fp8_scales_shard_indexer is not None:
+            param_data, loaded_weight = fp8_scales_shard_indexer(param_data,
+                                                                 loaded_weight,
+                                                                 shard_id=0)
+
         assert param_data.shape == loaded_weight.shape
         param_data.copy_(loaded_weight)
 
@@ -317,7 +327,12 @@ def weight_loader(self,
 
         param_data = param.data
         output_dim = getattr(param, "output_dim", None)
+        # Special case for AQLM codebooks.
         is_metadata = getattr(param, "is_metadata", False)
+        # Special case for Fp8 scales.
+        fp8_scales_shard_indexer = getattr(param, "fp8_scales_shard_indexer",
+                                           None)
+
         if loaded_shard_id is None:
             # Loaded weight is already packed.
             if output_dim is None:
@@ -331,14 +346,13 @@ def weight_loader(self,
                 current_shard_offset += output_size
             packed_dim = getattr(param, "packed_dim", None)
             for shard_id, shard_offset, shard_size in shard_offsets:
+                # Special case for Quantization.
                 # If quantized, we need to adjust the offset and size to account
                 # for the packing.
                 if packed_dim == output_dim:
                     shard_size = shard_size // param.pack_factor
                     shard_offset = shard_offset // param.pack_factor
-
-                    # If marlin, we need to adjust the offset and size to
-                    # account for the tiling.
+                    # Special case for Marlin.
                     shard_size, shard_offset = adjust_marlin_shard(
                         param, shard_size, shard_offset)
 
@@ -353,15 +367,14 @@ def weight_loader(self,
         if output_dim is not None:
             shard_offset = sum(self.output_sizes[:loaded_shard_id]) // tp_size
             shard_size = self.output_sizes[loaded_shard_id] // tp_size
+            # Special case for quantization.
             # If quantized, we need to adjust the offset and size to account
             # for the packing.
             packed_dim = getattr(param, "packed_dim", None)
             if packed_dim == output_dim:
                 shard_size = shard_size // param.pack_factor
                 shard_offset = shard_offset // param.pack_factor
-
-                # If marlin, we need to adjust the offset and size to
-                # account for the tiling.
+                # Special case for Marlin.
                 shard_size, shard_offset = adjust_marlin_shard(
                     param, shard_size, shard_offset)
 
@@ -370,11 +383,17 @@ def weight_loader(self,
             start_idx = tp_rank * shard_size
             loaded_weight = loaded_weight.narrow(output_dim, start_idx,
                                                  shard_size)
+        # Special case for AQLM codebooks.
         elif is_metadata:
             # metadata indicates fixed size concatenated along dim 0
             shard_size = loaded_weight.shape[0]
             shard_offset = loaded_shard_id * shard_size
             param_data = param_data.narrow(0, shard_offset, shard_size)
+        # Special case for Fp8 scales.
+        elif fp8_scales_shard_indexer is not None:
+            param_data, loaded_weight = fp8_scales_shard_indexer(
+                param_data, loaded_weight, loaded_shard_id)
+
         else:
             ignore_warning = getattr(param, "ignore_warning", False)
             if not ignore_warning:
@@ -455,7 +474,11 @@ def weight_loader(self,
                       loaded_shard_id: Optional[str] = None):
         param_data = param.data
         output_dim = getattr(param, "output_dim", None)
+        # Special case for AQLM codebooks.
         is_metadata = getattr(param, "is_metadata", False)
+        # Special case for Fp8 scales.
+        fp8_scales_shard_indexer = getattr(param, "fp8_scales_shard_indexer",
+                                           None)
 
         if loaded_shard_id is None:
             # Loaded weight is already packed.
@@ -473,14 +496,14 @@ def weight_loader(self,
             ]
             packed_dim = getattr(param, "packed_dim", None)
             for shard_id, shard_offset, shard_size in shard_offsets:
+                # Special case for Quantized Weights.
                 # If quantized, we need to adjust the offset and size to account
                 # for the packing.
                 if packed_dim == output_dim:
                     shard_size = shard_size // param.pack_factor
                     shard_offset = shard_offset // param.pack_factor
 
-                    # If marlin, we need to adjust the offset and size to
-                    # account for the tiling.
+                    # Special case for Marlin.
                     shard_size, shard_offset = adjust_marlin_shard(
                         param, shard_size, shard_offset)
 
@@ -502,6 +525,7 @@ def weight_loader(self,
                 shard_offset = (self.num_heads +
                                 self.num_kv_heads) * self.head_size
                 shard_size = self.num_kv_heads * self.head_size
+            # Special case for Quantized Weights.
             # If quantized, we need to adjust the offset and size to account
             # for the packing.
             packed_dim = getattr(param, "packed_dim", None)
@@ -509,8 +533,7 @@ def weight_loader(self,
                 shard_size = shard_size // param.pack_factor
                 shard_offset = shard_offset // param.pack_factor
 
-                # If marlin, we need to adjust the offset and size to
-                # account for the tiling.
+                # Special case for Marlin.
                 shard_size, shard_offset = adjust_marlin_shard(
                     param, shard_size, shard_offset)
 
@@ -523,12 +546,17 @@ def weight_loader(self,
             start_idx = shard_id * shard_size
             loaded_weight = loaded_weight.narrow(output_dim, start_idx,
                                                  shard_size)
+        # Special case for AQLM codebooks.
         elif is_metadata:
             # metadata indicates fixed size concatenated along dim 0
             shard_size = loaded_weight.shape[0]
             shard_index = ["q", "k", "v"].index(loaded_shard_id)
             param_data = param_data.narrow(0, shard_index * shard_size,
                                            shard_size)
+        # Special case for Fp8 scales.
+        elif fp8_scales_shard_indexer is not None:
+            param_data, loaded_weight = fp8_scales_shard_indexer(
+                param_data, loaded_weight, loaded_shard_id)
         else:
             ignore_warning = getattr(param, "ignore_warning", False)
             if not ignore_warning:
@@ -611,6 +639,10 @@ def __init__(
             self.register_parameter("bias", None)
 
     def weight_loader(self, param: Parameter, loaded_weight: torch.Tensor):
+        # Special case for Fp8 scales.
+        fp8_scales_shard_indexer = getattr(param, "fp8_scales_shard_indexer",
+                                           None)
+
         tp_rank = get_tensor_model_parallel_rank()
         input_dim = getattr(param, "input_dim", None)
         param_data = param.data
@@ -619,6 +651,12 @@ def weight_loader(self, param: Parameter, loaded_weight: torch.Tensor):
             start_idx = tp_rank * shard_size
             loaded_weight = loaded_weight.narrow(input_dim, start_idx,
                                                  shard_size)
+        # Special case for Fp8 scales.
+        elif fp8_scales_shard_indexer is not None:
+            param_data, loaded_weight = fp8_scales_shard_indexer(param_data,
+                                                                 loaded_weight,
+                                                                 shard_id=0)
+
         assert param_data.shape == loaded_weight.shape
         param_data.copy_(loaded_weight)
 
diff --git a/vllm/model_executor/layers/quantization/fp8.py b/vllm/model_executor/layers/quantization/fp8.py
index ba9f3149649c1..b57e1dde81a5f 100644
--- a/vllm/model_executor/layers/quantization/fp8.py
+++ b/vllm/model_executor/layers/quantization/fp8.py
@@ -1,23 +1,36 @@
-from typing import Any, Dict, List, Optional
+from typing import Any, Dict, List, Optional, Tuple, Union
 
 import torch
 from torch.nn import Module
 from torch.nn.parameter import Parameter
 
 from vllm import _custom_ops as ops
+from vllm.logger import init_logger
 from vllm.model_executor.layers.linear import LinearBase, LinearMethodBase
 from vllm.model_executor.layers.quantization.base_config import (
-    QuantizationConfig, QuantizeMethodBase)
+    QuantizationConfig)
 from vllm.model_executor.utils import set_weight_attrs
 
+ACTIVATION_SCHEMES = ["static", "dynamic"]
+
+logger = init_logger(__name__)
+
 
 class Fp8Config(QuantizationConfig):
     """Config class for FP8."""
 
     def __init__(
         self,
+        is_checkpoint_fp8_serialized: bool = False,
         activation_scheme: str = "dynamic",
     ) -> None:
+        self.is_checkpoint_fp8_serialized = is_checkpoint_fp8_serialized
+        if is_checkpoint_fp8_serialized:
+            logger.warning("Detected fp8 checkpoint. Please note that the "
+                           "format is experimental and subject to change.")
+        if activation_scheme not in ACTIVATION_SCHEMES:
+            raise ValueError(
+                f"Unsupported activation scheme {activation_scheme}")
         self.activation_scheme = activation_scheme
 
     @classmethod
@@ -30,10 +43,7 @@ def get_supported_act_dtypes(cls) -> List[torch.dtype]:
 
     @classmethod
     def get_min_capability(cls) -> int:
-        # TODO: PyTorch 2.3.0+ is required to run FP8 on
-        # SM 89 (e.g. Ada) GPUs. Specifically, this PR has to
-        # be included: https://github.com/pytorch/pytorch/pull/118881
-        return 90
+        return 89
 
     @classmethod
     def get_config_filenames(cls) -> List[str]:
@@ -41,11 +51,14 @@ def get_config_filenames(cls) -> List[str]:
 
     @classmethod
     def from_config(cls, config: Dict[str, Any]) -> "Fp8Config":
+        quant_method = cls.get_from_keys(config, ["quant_method"])
+        is_checkpoint_fp8_serialized = ("fp8" in quant_method)
         activation_scheme = cls.get_from_keys(config, ["activation_scheme"])
-        return cls(activation_scheme)
+        return cls(is_checkpoint_fp8_serialized=is_checkpoint_fp8_serialized,
+                   activation_scheme=activation_scheme)
 
     def get_quant_method(
-            self, layer: torch.nn.Module) -> Optional["QuantizeMethodBase"]:
+            self, layer: torch.nn.Module) -> Optional["Fp8LinearMethod"]:
         if isinstance(layer, LinearBase):
             return Fp8LinearMethod(self)
         return None
@@ -56,8 +69,12 @@ def get_scaled_act_names(self) -> List[str]:
 
 class Fp8LinearMethod(LinearMethodBase):
     """Linear method for FP8.
-    We now support common FP16/BF16 model checkpoints ONLY. The weight
-    scaling factor will be initialized after the model weights are loaded.
+    Supports loading FP8 checkpoints with static weight scale and
+    dynamic/static activation scale.
+
+    Also supports loading unquantized FP16/BF16 model checkpoints with dynamic
+    activation scaling. The weight scaling factor will be initialized after
+    the model weights are loaded.
 
     Limitations:
     1. Only support per-tensor quantization due to torch._scaled_mm support.
@@ -71,6 +88,24 @@ class Fp8LinearMethod(LinearMethodBase):
     def __init__(self, quant_config: Fp8Config):
         self.quant_config = quant_config
 
+    def _create_scale_param(
+        self,
+        scale_name: str,
+        layer: torch.nn.Module,
+        output_partition_sizes: List[int],
+        **extra_weight_attrs,
+    ) -> None:
+        scale = Parameter(torch.empty(len(output_partition_sizes),
+                                      dtype=torch.float32),
+                          requires_grad=False)
+        layer.register_parameter(scale_name, scale)
+        set_weight_attrs(
+            scale, {
+                **extra_weight_attrs,
+                "fp8_scales_shard_indexer":
+                self.scales_shard_indexer,
+            })
+
     def create_weights(
         self,
         layer: torch.nn.Module,
@@ -81,46 +116,150 @@ def create_weights(
         params_dtype: torch.dtype,
         **extra_weight_attrs,
     ):
+        del input_size, output_size
         output_size_per_partition = sum(output_partition_sizes)
+
+        layer.process_after_load = True
+        layer.logical_widths = output_partition_sizes
+
+        # WEIGHT
+        weight_dtype = (torch.float8_e4m3fn
+                        if self.quant_config.is_checkpoint_fp8_serialized else
+                        params_dtype)
         weight = Parameter(torch.empty(output_size_per_partition,
                                        input_size_per_partition,
-                                       dtype=params_dtype),
+                                       dtype=weight_dtype),
                            requires_grad=False)
         layer.register_parameter("weight", weight)
-        set_weight_attrs(weight, {"input_dim": 1, "output_dim": 0})
-        set_weight_attrs(weight, extra_weight_attrs)
+        set_weight_attrs(weight, {
+            **extra_weight_attrs,
+            "input_dim": 1,
+            "output_dim": 0,
+        })
 
-        w_scale = Parameter(
-            torch.empty(1, dtype=torch.float32),
-            requires_grad=False,
-        )
-        layer.register_parameter("weight_scaling_factor", w_scale)
+        # If checkpoint is serialized fp8, load them.
+        # Otherwise, wait until process_weights_after_loading.
+        if self.quant_config.is_checkpoint_fp8_serialized:
+            # WEIGHT SCALE
+            self._create_scale_param(
+                scale_name="weight_scale",
+                layer=layer,
+                output_partition_sizes=output_partition_sizes,
+                **extra_weight_attrs)
+
+            # ACTIVATION SCALE
+            if self.quant_config.activation_scheme == "static":
+                self._create_scale_param(
+                    scale_name="act_scale",
+                    layer=layer,
+                    output_partition_sizes=output_partition_sizes,
+                    **extra_weight_attrs)
+
+    def scales_shard_indexer(
+            self, param: torch.Tensor, loaded_weight: torch.Tensor,
+            shard_id: Union[str, int]) -> Tuple[torch.Tensor, torch.Tensor]:
+        """Map `shard_id` to a slot of `param`; return it with the weight."""
+        qkv_idxs = {"q": 0, "k": 1, "v": 2}
+
+        if isinstance(shard_id, str):
+            if shard_id not in qkv_idxs:
+                raise ValueError(f"Unknown shard_id: {shard_id}")
+            shard_id = qkv_idxs[shard_id]
+        elif not isinstance(shard_id, int):
+            raise ValueError(
+                f"Shard id must be int or str but got {type(shard_id)}")
+
+        return param[shard_id], loaded_weight
 
     def process_weights_after_loading(self, layer: Module) -> None:
-        # Although the quant_method is propagated to all layers,
-        # only linear layers invoke "create_weights". So we check
-        # whether "weight_scaling_facor" is registered to determine
-        # whether the layer is a linear layer that requires quantization.
-        if not hasattr(layer, "weight_scaling_factor"):
+        if (not hasattr(layer, "process_after_load")
+                or not layer.process_after_load):
+            return
+
+        # If checkpoint is fp/bf16 (not serialized fp8), quantize the weights.
+        if not self.quant_config.is_checkpoint_fp8_serialized:
+            qweight, weight_scale = ops.scaled_fp8_quant(layer.weight,
+                                                         scale=None)
+            layer.weight = Parameter(qweight.t(), requires_grad=False)
+            layer.weight_scale = Parameter(weight_scale, requires_grad=False)
+            layer.logical_widths = None
+            layer.act_scale = None
             return
 
-        qweight, weight_scale = ops.scaled_fp8_quant(layer.weight)
-        # torch._scaled_mm requires column-major in the second
-        # input (weight), so we transpose the quantized weight.
-        layer.weight = Parameter(qweight.t(), requires_grad=False)
-        layer.weight_scaling_factor.data.copy_(weight_scale)
+        # If checkpoint is fp8, requantize the separately quantized logical
+        # weights into a single fp8 weight with a single weight scale.
+        else:
+            # WEIGHT_SCALE / WEIGHT
+            #   Loop over logical weights, requantizing with single scale.
+            max_w_scale = layer.weight_scale.max()
+            start = 0
+            for idx, logical_width in enumerate(layer.logical_widths):
+                end = start + logical_width
+                weight_dq = per_tensor_dequantize(layer.weight[start:end, :],
+                                                  layer.weight_scale[idx])
+
+                layer.weight[start:end, :] = per_tensor_quantize(
+                    weight_dq, layer.weight_scale.max())
+                start = end
+            layer.weight_scale = Parameter(max_w_scale, requires_grad=False)
+
+            # WEIGHT
+            #   Transpose weight for passing to torch._scaled_mm
+            weight = layer.weight
+            layer.weight = Parameter(weight.t(), requires_grad=False)
+
+            # ACT_SCALE
+            #   Dynamic: set to None (required input to ops.scaled_fp8_quant).
+            #   Static:  set to max of the act_scales (since they are equal).
+            if self.quant_config.activation_scheme == "dynamic":
+                layer.act_scale = None
+            elif self.quant_config.activation_scheme == "static":
+                if not all_close_1d(layer.act_scale):
+                    raise ValueError(
+                        "All the act_scales for the logical weights of a layer "
+                        f"must be equal. But got {layer.act_scale}")
+                layer.act_scale = Parameter(layer.act_scale.max(),
+                                            requires_grad=False)
+            else:
+                raise ValueError(
+                    f"Unknown scheme {self.quant_config.activation_scheme}")
 
     def apply(self,
               layer: torch.nn.Module,
               x: torch.Tensor,
               bias: Optional[torch.Tensor] = None) -> torch.Tensor:
-        qinput, x_scale = ops.scaled_fp8_quant(x)
+        # ops.scaled_fp8_quant supports both dynamic and static quant.
+        #   If dynamic, layer.act_scale is None and x_scale computed from x.
+        #   If static,  layer.act_scale is scalar and x_scale set to act_scale.
+        qinput, x_scale = ops.scaled_fp8_quant(x, layer.act_scale)
+
+        # Fused GEMM_DQ
         output, _ = torch._scaled_mm(
             qinput,
             layer.weight,
             out_dtype=x.dtype,
             scale_a=x_scale,
-            scale_b=layer.weight_scaling_factor,
+            scale_b=layer.weight_scale,
             bias=bias,
         )
+
         return output
+
+
+def all_close_1d(x: torch.Tensor) -> bool:
+    assert x.dim() == 1  # 1-D input only
+    return all(torch.allclose(x[0], elem) for elem in x)
+
+
+def per_tensor_quantize(tensor: torch.Tensor,
+                        inv_scale: float) -> torch.Tensor:
+    # Divide by the scale, saturate to the e4m3fn range, then cast to fp8.
+    fp8 = torch.finfo(torch.float8_e4m3fn)
+    return (tensor / inv_scale).clamp(fp8.min, fp8.max).to(torch.float8_e4m3fn)
+
+
+def per_tensor_dequantize(tensor: torch.Tensor,
+                          inv_scale: float) -> torch.Tensor:
+    # Widen to fp16 first, then multiply the scale back in.
+    widened = tensor.to(torch.float16)
+    return widened * inv_scale

From a494140433be496a0321999955acf7e6387986b3 Mon Sep 17 00:00:00 2001
From: Florian Greinacher <florian@greinacher.de>
Date: Wed, 1 May 2024 01:28:46 +0200
Subject: [PATCH 04/65] [Frontend] Support complex message content for chat
 completions endpoint (#3467)

Co-authored-by: Lily Liu <lilyliupku@gmail.com>
Co-authored-by: Cyrus Leung <tlleungac@connect.ust.hk>
---
 tests/entrypoints/test_openai_server.py | 19 ++++++++++
 vllm/entrypoints/openai/serving_chat.py | 48 ++++++++++++++-----------
 2 files changed, 46 insertions(+), 21 deletions(-)

diff --git a/tests/entrypoints/test_openai_server.py b/tests/entrypoints/test_openai_server.py
index 68332228ace08..a2a98abe7031c 100644
--- a/tests/entrypoints/test_openai_server.py
+++ b/tests/entrypoints/test_openai_server.py
@@ -786,6 +786,25 @@ async def test_extra_fields(server, client: openai.AsyncOpenAI):
     assert "extra_forbidden" in exc_info.value.message
 
 
+async def test_complex_message_content(server, client: openai.AsyncOpenAI):
+    # Content is sent as a list of typed parts rather than a plain string.
+    question = "what is 1+1? please provide the result without any other text."
+    user_message = {
+        "role": "user",
+        "content": [{
+            "type": "text",
+            "text": question,
+        }],
+    }
+    resp = await client.chat.completions.create(
+        model=MODEL_NAME,
+        messages=[user_message],
+        temperature=0,
+        seed=0,
+    )
+    assert resp.choices[0].message.content == "2"
+
+
 async def test_guided_grammar(server, client: openai.AsyncOpenAI):
     simple_sql_grammar = """
 start: select_statement
diff --git a/vllm/entrypoints/openai/serving_chat.py b/vllm/entrypoints/openai/serving_chat.py
index 5ed042ef386ea..599f99e56a726 100644
--- a/vllm/entrypoints/openai/serving_chat.py
+++ b/vllm/entrypoints/openai/serving_chat.py
@@ -55,9 +55,16 @@ def _parse_chat_message_content(
         if isinstance(content, str):
             return [ConversationMessage(role=role, content=content)], []
 
-        # To be implemented: https://github.com/vllm-project/vllm/pull/3467
-        # To be implemented: https://github.com/vllm-project/vllm/pull/4200
-        raise NotImplementedError("Complex input not supported yet")
+        texts: List[str] = []
+        for _, part in enumerate(content):
+            if part["type"] == "text":
+                text = part["text"]
+
+                texts.append(text)
+            else:
+                raise NotImplementedError(f"Unknown part type: {part['type']}")
+
+        return [ConversationMessage(role=role, content="\n".join(texts))], []
 
     async def create_chat_completion(
         self, request: ChatCompletionRequest, raw_request: Request
@@ -122,11 +129,12 @@ async def create_chat_completion(
         # Streaming response
         if request.stream:
             return self.chat_completion_stream_generator(
-                request, result_generator, request_id)
+                request, result_generator, request_id, conversation)
         else:
             try:
                 return await self.chat_completion_full_generator(
-                    request, raw_request, result_generator, request_id)
+                    request, raw_request, result_generator, request_id,
+                    conversation)
             except ValueError as e:
                 # TODO: Use a vllm-specific Validation Error
                 return self.create_error_response(str(e))
@@ -139,8 +147,9 @@ def get_chat_request_role(self, request: ChatCompletionRequest) -> str:
 
     async def chat_completion_stream_generator(
             self, request: ChatCompletionRequest,
-            result_generator: AsyncIterator[RequestOutput],
-            request_id: str) -> AsyncGenerator[str, None]:
+            result_generator: AsyncIterator[RequestOutput], request_id: str,
+            conversation: List[ConversationMessage]
+    ) -> AsyncGenerator[str, None]:
         model_name = self.served_model_names[0]
         created_time = int(time.time())
         chunk_object_type = "chat.completion.chunk"
@@ -179,12 +188,10 @@ async def chat_completion_stream_generator(
                     # last message
                     if request.echo:
                         last_msg_content = ""
-                        if request.messages and isinstance(
-                                request.messages,
-                                list) and request.messages[-1].get(
-                                    "content") and request.messages[-1].get(
-                                        "role") == role:
-                            last_msg_content = request.messages[-1]["content"]
+                        if conversation and conversation[-1].get(
+                                "content") and conversation[-1].get(
+                                    "role") == role:
+                            last_msg_content = conversation[-1]["content"]
 
                         if last_msg_content:
                             for i in range(request.n):
@@ -279,9 +286,10 @@ async def chat_completion_stream_generator(
         yield "data: [DONE]\n\n"
 
     async def chat_completion_full_generator(
-            self, request: ChatCompletionRequest, raw_request: Request,
-            result_generator: AsyncIterator[RequestOutput],
-            request_id: str) -> Union[ErrorResponse, ChatCompletionResponse]:
+        self, request: ChatCompletionRequest, raw_request: Request,
+        result_generator: AsyncIterator[RequestOutput], request_id: str,
+        conversation: List[ConversationMessage]
+    ) -> Union[ErrorResponse, ChatCompletionResponse]:
 
         model_name = self.served_model_names[0]
         created_time = int(time.time())
@@ -322,11 +330,9 @@ async def chat_completion_full_generator(
 
         if request.echo:
             last_msg_content = ""
-            if request.messages and isinstance(
-                    request.messages, list) and request.messages[-1].get(
-                        "content") and request.messages[-1].get(
-                            "role") == role:
-                last_msg_content = request.messages[-1]["content"]
+            if conversation and conversation[-1].get(
+                    "content") and conversation[-1].get("role") == role:
+                last_msg_content = conversation[-1]["content"]
 
             for choice in choices:
                 full_message = last_msg_content + choice.message.content

From 715c2d854d56f2026c31f126a90e6e7859434a50 Mon Sep 17 00:00:00 2001
From: Alpay Ariyak <98838263+alpayariyak@users.noreply.github.com>
Date: Tue, 30 Apr 2024 19:32:13 -0400
Subject: [PATCH 05/65] [Frontend] [Core] Tensorizer: support dynamic
 `num_readers`, update version (#4467)

---
 requirements-dev.txt                           |  2 +-
 setup.py                                       |  2 +-
 vllm/model_executor/model_loader/tensorizer.py | 17 ++++++++++-------
 3 files changed, 12 insertions(+), 9 deletions(-)

diff --git a/requirements-dev.txt b/requirements-dev.txt
index 324039186142b..e6d375cbafa39 100644
--- a/requirements-dev.txt
+++ b/requirements-dev.txt
@@ -14,7 +14,7 @@ types-setuptools
 
 # testing
 pytest
-tensorizer==2.9.0a0
+tensorizer==2.9.0
 pytest-forked
 pytest-asyncio
 pytest-rerunfailures
diff --git a/setup.py b/setup.py
index 6ba36b85ea318..a47b14ffcfc6e 100644
--- a/setup.py
+++ b/setup.py
@@ -408,7 +408,7 @@ def _read_requirements(filename: str) -> List[str]:
     install_requires=get_requirements(),
     ext_modules=ext_modules,
     extras_require={
-        "tensorizer": ["tensorizer==2.9.0a1"],
+        "tensorizer": ["tensorizer==2.9.0"],
     },
     cmdclass={"build_ext": cmake_build_ext} if not _is_neuron() else {},
     package_data=package_data,
diff --git a/vllm/model_executor/model_loader/tensorizer.py b/vllm/model_executor/model_loader/tensorizer.py
index 2d654b2fefb8d..0ce9fa95aa7e5 100644
--- a/vllm/model_executor/model_loader/tensorizer.py
+++ b/vllm/model_executor/model_loader/tensorizer.py
@@ -44,7 +44,7 @@ class TensorizerConfig:
                           str, bytes, os.PathLike, int]
     vllm_tensorized: bool
     verify_hash: Optional[bool] = False
-    num_readers: Optional[int] = 1
+    num_readers: Optional[int] = None
     encryption_keyfile: Optional[str] = None
     s3_access_key_id: Optional[str] = None
     s3_secret_access_key: Optional[str] = None
@@ -104,7 +104,7 @@ class TensorizerArgs:
                           str, bytes, os.PathLike, int]
     vllm_tensorized: bool
     verify_hash: Optional[bool] = False
-    num_readers: Optional[int] = 1
+    num_readers: Optional[int] = None
     encryption_keyfile: Optional[str] = None
     s3_access_key_id: Optional[str] = None
     s3_secret_access_key: Optional[str] = None
@@ -125,8 +125,9 @@ class TensorizerArgs:
           the hashes stored in the metadata. A `HashMismatchError` will be 
           raised if any of the hashes do not match.
       num_readers: Controls how many threads are allowed to read concurrently
-          from the source file. Default is 1. This greatly increases
-          performance.
+          from the source file. Default is `None`, which will dynamically set
+          the number of readers based on the number of available 
+          resources and model size. This greatly increases performance.
       encryption_keyfile: File path to a binary file containing a  
           binary key to use for decryption. `None` (the default) means 
           no decryption. See the example script in 
@@ -199,10 +200,12 @@ def add_cli_args(
             "use for decryption. Can be a file path or S3 network URI.")
         group.add_argument(
             "--num-readers",
-            default=1,
+            default=None,
             type=int,
             help="Controls how many threads are allowed to read concurrently "
-            "from the source file.")
+            "from the source file. Default is `None`, which will dynamically "
+            "set the number of readers based on the available resources "
+            "and model size. This greatly increases performance.")
         group.add_argument(
             "--s3-access-key-id",
             default=None,
@@ -337,7 +340,7 @@ def deserialize(self):
         per_second = convert_bytes(deserializer.total_tensor_bytes / duration)
         after_mem = get_mem_usage()
         deserializer.close()
-        logger.info("Deserialized %s in %0.2fs, %f/s", total_bytes_str,
+        logger.info("Deserialized %s in %0.2fs, %s/s", total_bytes_str,
                     end - start, per_second)
         logger.info("Memory usage before: %s", before_mem)
         logger.info("Memory usage after: %s", after_mem)

From dd1a50a8bc520b0e52ce7914f0263ebd576c197f Mon Sep 17 00:00:00 2001
From: "Li, Jiang" <jiang1.li@intel.com>
Date: Wed, 1 May 2024 07:33:33 +0800
Subject: [PATCH 06/65] [Bugfix][Minor] Make ignore_eos effective  (#4468)

---
 vllm/sampling_params.py | 3 ++-
 1 file changed, 2 insertions(+), 1 deletion(-)

diff --git a/vllm/sampling_params.py b/vllm/sampling_params.py
index 0ed6a01a62212..f6e7a3ca792e4 100644
--- a/vllm/sampling_params.py
+++ b/vllm/sampling_params.py
@@ -275,7 +275,8 @@ def update_from_generation_config(
             self, generation_config: Dict[str, Any]) -> None:
         """Update if there are non-default values from generation_config"""
         # Update eos_token_id for generation
-        if eos_ids := generation_config.get("eos_token_id"):
+        if (not self.ignore_eos) and (eos_ids :=
+                                      generation_config.get("eos_token_id")):
             # it can be either int or list of int
             if isinstance(eos_ids, int):
                 eos_ids = [eos_ids]

From 6ad58f42c59eaee0a57c89f1feb08757524b93cf Mon Sep 17 00:00:00 2001
From: "fuchen.ljl" <yjqqqqdx_01@163.com>
Date: Wed, 1 May 2024 07:38:50 +0800
Subject: [PATCH 07/65] fix_tokenizer_snapshot_download_bug (#4493)

---
 vllm/transformers_utils/tokenizer.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/vllm/transformers_utils/tokenizer.py b/vllm/transformers_utils/tokenizer.py
index fa4693cb7dac1..9066db5a9e7f1 100644
--- a/vllm/transformers_utils/tokenizer.py
+++ b/vllm/transformers_utils/tokenizer.py
@@ -79,7 +79,7 @@ def get_tokenizer(
                 revision=revision,
                 local_files_only=huggingface_hub.constants.HF_HUB_OFFLINE,
                 # Ignore weights - we only need the tokenizer.
-                ignore_file_pattern=["*.pt", "*.safetensors", "*.bin"])
+                ignore_file_pattern=[".*.pt", ".*.safetensors", ".*.bin"])
             tokenizer_name = tokenizer_path
 
     if tokenizer_mode == "slow":

From ee37328da085af14f89ad1af8eb2c359ae2f46a1 Mon Sep 17 00:00:00 2001
From: "fuchen.ljl" <yjqqqqdx_01@163.com>
Date: Wed, 1 May 2024 08:42:09 +0800
Subject: [PATCH 08/65] Unable to find Punica extension issue during source
 code installation (#4494)

Co-authored-by: Simon Mo <simon.mo@hey.com>
---
 docs/source/getting_started/installation.rst | 1 +
 1 file changed, 1 insertion(+)

diff --git a/docs/source/getting_started/installation.rst b/docs/source/getting_started/installation.rst
index e7826114ffa9d..0c81f7ec6d2a9 100644
--- a/docs/source/getting_started/installation.rst
+++ b/docs/source/getting_started/installation.rst
@@ -53,6 +53,7 @@ You can also build and install vLLM from source:
 
     $ git clone https://github.com/vllm-project/vllm.git
     $ cd vllm
+    $ # export VLLM_INSTALL_PUNICA_KERNELS=1 # optionally build for multi-LoRA capability
     $ pip install -e .  # This may take 5-10 minutes.
 
 .. tip::

From 2e240c69a9874743abc8b0b681e8c13d675beda3 Mon Sep 17 00:00:00 2001
From: Nick Hill <nickhill@us.ibm.com>
Date: Tue, 30 Apr 2024 18:06:34 -0700
Subject: [PATCH 09/65] [Core] Centralize GPU Worker construction (#4419)

---
 vllm/executor/gpu_executor.py     | 83 +++++++++++++++----------------
 vllm/executor/ray_gpu_executor.py | 32 +++---------
 2 files changed, 47 insertions(+), 68 deletions(-)

diff --git a/vllm/executor/gpu_executor.py b/vllm/executor/gpu_executor.py
index 489e66d586028..527a14ff6c67a 100644
--- a/vllm/executor/gpu_executor.py
+++ b/vllm/executor/gpu_executor.py
@@ -1,4 +1,4 @@
-from typing import Dict, List, Set, Tuple
+from typing import Any, Dict, List, Optional, Set, Tuple
 
 from vllm.executor.executor_base import ExecutorAsyncBase, ExecutorBase
 from vllm.logger import init_logger
@@ -6,6 +6,7 @@
 from vllm.sequence import SamplerOutput, SequenceGroupMetadata
 from vllm.utils import (get_distributed_init_method, get_ip, get_open_port,
                         make_async)
+from vllm.worker.worker_base import WorkerWrapperBase
 
 logger = init_logger(__name__)
 
@@ -23,30 +24,47 @@ def _init_executor(self) -> None:
         else:
             self._init_spec_worker()
 
-    def _init_non_spec_worker(self):
-        # Lazy import the Worker to avoid importing torch.cuda/xformers
-        # before CUDA_VISIBLE_DEVICES is set in the Worker
-        from vllm.worker.worker import Worker
-
-        assert self.parallel_config.world_size == 1, (
-            "GPUExecutor only supports single GPU.")
-
-        distributed_init_method = get_distributed_init_method(
-            get_ip(), get_open_port())
-        self.driver_worker = Worker(
+    def _get_worker_kwargs(
+            self,
+            local_rank: int = 0,
+            rank: int = 0,
+            distributed_init_method: Optional[str] = None) -> Dict[str, Any]:
+        """Return worker init args for a given rank."""
+        if distributed_init_method is None:
+            distributed_init_method = get_distributed_init_method(
+                get_ip(), get_open_port())
+        return dict(
             model_config=self.model_config,
             parallel_config=self.parallel_config,
             scheduler_config=self.scheduler_config,
             device_config=self.device_config,
             cache_config=self.cache_config,
             load_config=self.load_config,
-            local_rank=0,
-            rank=0,
+            local_rank=local_rank,
+            rank=rank,
             distributed_init_method=distributed_init_method,
             lora_config=self.lora_config,
             vision_language_config=self.vision_language_config,
-            is_driver_worker=True,
+            is_driver_worker=rank == 0,
+        )
+
+    def _create_worker(self,
+                       local_rank: int = 0,
+                       rank: int = 0,
+                       distributed_init_method: Optional[str] = None):
+        wrapper = WorkerWrapperBase(
+            worker_module_name="vllm.worker.worker",
+            worker_class_name="Worker",
         )
+        wrapper.init_worker(**self._get_worker_kwargs(local_rank, rank,
+                                                      distributed_init_method))
+        return wrapper.worker
+
+    def _init_non_spec_worker(self):
+        assert self.parallel_config.world_size == 1, (
+            "GPUExecutor only supports single GPU.")
+
+        self.driver_worker = self._create_worker()
         self.driver_worker.init_device()
         self.driver_worker.load_model()
 
@@ -57,41 +75,18 @@ def _init_spec_worker(self):
 
         from vllm.spec_decode.multi_step_worker import MultiStepWorker
         from vllm.spec_decode.spec_decode_worker import SpecDecodeWorker
-        from vllm.worker.worker import Worker
 
-        distributed_init_method = get_distributed_init_method(
-            get_ip(), get_open_port())
-
-        target_worker = Worker(
-            model_config=self.model_config,
-            parallel_config=self.parallel_config,
-            scheduler_config=self.scheduler_config,
-            device_config=self.device_config,
-            cache_config=self.cache_config,
-            load_config=self.load_config,
-            local_rank=0,
-            rank=0,
-            distributed_init_method=distributed_init_method,
-            lora_config=self.lora_config,
-            vision_language_config=self.vision_language_config,
-            is_driver_worker=True,
-        )
+        target_worker = self._create_worker()
 
-        draft_worker = MultiStepWorker(
+        draft_worker_kwargs = self._get_worker_kwargs()
+        # Override draft-model specific worker args.
+        draft_worker_kwargs.update(
             model_config=self.speculative_config.draft_model_config,
             parallel_config=self.speculative_config.draft_parallel_config,
-            scheduler_config=self.scheduler_config,
-            device_config=self.device_config,
-            cache_config=self.cache_config,
             # TODO allow draft-model specific load config.
-            load_config=self.load_config,
-            local_rank=0,
-            rank=0,
-            distributed_init_method=distributed_init_method,
-            lora_config=self.lora_config,
-            vision_language_config=self.vision_language_config,
-            is_driver_worker=True,
+            #load_config=self.load_config,
         )
+        draft_worker = MultiStepWorker(**draft_worker_kwargs)
 
         spec_decode_worker = SpecDecodeWorker.from_workers(
             proposer_worker=draft_worker, scorer_worker=target_worker)
diff --git a/vllm/executor/ray_gpu_executor.py b/vllm/executor/ray_gpu_executor.py
index 3eb3726bd5a6d..16d239b9ab580 100644
--- a/vllm/executor/ray_gpu_executor.py
+++ b/vllm/executor/ray_gpu_executor.py
@@ -153,29 +153,14 @@ def _init_workers_ray(self, placement_group: "PlacementGroup",
         distributed_init_method = get_distributed_init_method(
             driver_ip, get_open_port())
 
-        def collect_arg_helper_func(**kwargs):
-            # avoid writing `{"name": value}` manually
-            return kwargs
-
         # Initialize the actual workers inside worker wrapper.
-        init_worker_all_kwargs = []
-        for rank, (node_id, _) in enumerate(worker_node_and_gpu_ids):
-            local_rank = node_workers[node_id].index(rank)
-            init_worker_all_kwargs.append(
-                collect_arg_helper_func(
-                    model_config=self.model_config,
-                    parallel_config=self.parallel_config,
-                    scheduler_config=self.scheduler_config,
-                    device_config=self.device_config,
-                    cache_config=self.cache_config,
-                    load_config=self.load_config,
-                    local_rank=local_rank,
-                    rank=rank,
-                    distributed_init_method=distributed_init_method,
-                    lora_config=self.lora_config,
-                    vision_language_config=self.vision_language_config,
-                    is_driver_worker=rank == 0,
-                ))
+        init_worker_all_kwargs = [
+            self._get_worker_kwargs(
+                local_rank=node_workers[node_id].index(rank),
+                rank=rank,
+                distributed_init_method=distributed_init_method,
+            ) for rank, (node_id, _) in enumerate(worker_node_and_gpu_ids)
+        ]
         self._run_workers("init_worker", all_kwargs=init_worker_all_kwargs)
 
         self._run_workers("init_device")
@@ -201,8 +186,7 @@ def execute_model(self,
             use_ray_compiled_dag=USE_RAY_COMPILED_DAG)
 
         # Only the driver worker returns the sampling results.
-        output = all_outputs[0]
-        return output
+        return all_outputs[0]
 
     def _run_workers(
         self,

From f458112e8afdb01bd3cb2e435db314c6bc227973 Mon Sep 17 00:00:00 2001
From: harrywu <63134210+HarryWu99@users.noreply.github.com>
Date: Wed, 1 May 2024 11:21:39 +0800
Subject: [PATCH 10/65] [Misc][Typo] type annotation fix (#4495)

---
 vllm/engine/llm_engine.py | 5 +++--
 1 file changed, 3 insertions(+), 2 deletions(-)

diff --git a/vllm/engine/llm_engine.py b/vllm/engine/llm_engine.py
index 835803fd4e75d..4caecb8a51598 100644
--- a/vllm/engine/llm_engine.py
+++ b/vllm/engine/llm_engine.py
@@ -8,7 +8,8 @@
                          LoRAConfig, ModelConfig, ParallelConfig,
                          SchedulerConfig, SpeculativeConfig,
                          VisionLanguageConfig)
-from vllm.core.scheduler import Scheduler, SchedulerOutputs
+from vllm.core.scheduler import (ScheduledSequenceGroup, Scheduler,
+                                 SchedulerOutputs)
 from vllm.engine.arg_utils import EngineArgs
 from vllm.engine.metrics import StatLogger, Stats
 from vllm.engine.output_processor.interfaces import (
@@ -485,7 +486,7 @@ def has_unfinished_requests(self) -> bool:
     def _process_model_outputs(
         self,
         output: List[SamplerOutput],
-        scheduled_seq_groups: List[SequenceGroup],
+        scheduled_seq_groups: List[ScheduledSequenceGroup],
         ignored_seq_groups: List[SequenceGroup],
         seq_group_metadata_list: List[SequenceGroupMetadata],
     ) -> List[RequestOutput]:

From a822eb3413087062a38cea495564ec4a7093c3e5 Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Pastel=EF=BC=81?= <1627301104@qq.com>
Date: Wed, 1 May 2024 11:41:32 +0800
Subject: [PATCH 11/65] [Misc] fix typo in block manager (#4453)

---
 vllm/core/block_manager_v1.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/vllm/core/block_manager_v1.py b/vllm/core/block_manager_v1.py
index 1fac2636e86fa..73e7dafb72c7f 100644
--- a/vllm/core/block_manager_v1.py
+++ b/vllm/core/block_manager_v1.py
@@ -391,7 +391,7 @@ def append_slots(
                 block_table.append(block_table[len(block_table) %
                                                self.block_sliding_window])
             else:
-                # The sequence has a new logical block.
+                # The sequence has a new logical block.
                 # Allocate a new physical block.
                 new_block = self._allocate_last_physical_block(seq)
                 block_table.append(new_block)

From c3845d82dc3d1831714898114f87d9c103e2dd41 Mon Sep 17 00:00:00 2001
From: Robert Caulk <rob.caulk@gmail.com>
Date: Wed, 1 May 2024 05:48:39 +0200
Subject: [PATCH 12/65] Allow user to define whitespace pattern for outlines
 (#4305)

---
 tests/entrypoints/test_guided_processors.py            |  4 +++-
 vllm/entrypoints/openai/protocol.py                    | 10 ++++++++++
 .../guided_decoding/outlines_decoding.py               |  8 +++++---
 .../guided_decoding/outlines_logits_processors.py      |  7 +++----
 4 files changed, 21 insertions(+), 8 deletions(-)

diff --git a/tests/entrypoints/test_guided_processors.py b/tests/entrypoints/test_guided_processors.py
index 30f0ad5d8272f..41c871ca40bc8 100644
--- a/tests/entrypoints/test_guided_processors.py
+++ b/tests/entrypoints/test_guided_processors.py
@@ -57,7 +57,9 @@ def test_guided_logits_processors():
     """Basic unit test for RegexLogitsProcessor and JSONLogitsProcessor."""
     tokenizer = AutoTokenizer.from_pretrained('HuggingFaceH4/zephyr-7b-beta')
     regex_LP = RegexLogitsProcessor(TEST_REGEX, tokenizer)
-    json_LP = JSONLogitsProcessor(TEST_SCHEMA, tokenizer)
+    json_LP = JSONLogitsProcessor(TEST_SCHEMA,
+                                  tokenizer,
+                                  whitespace_pattern=None)
 
     regex_LP.init_state()
     token_ids = tokenizer.encode(
diff --git a/vllm/entrypoints/openai/protocol.py b/vllm/entrypoints/openai/protocol.py
index 0a949f9867754..731596e80bd71 100644
--- a/vllm/entrypoints/openai/protocol.py
+++ b/vllm/entrypoints/openai/protocol.py
@@ -146,6 +146,11 @@ class ChatCompletionRequest(OpenAIBaseModel):
             "If specified, will override the default guided decoding backend "
             "of the server for this specific request. If set, must be either "
             "'outlines' / 'lm-format-enforcer'"))
+    guided_whitespace_pattern: Optional[str] = Field(
+        default=None,
+        description=(
+            "If specified, will override the default whitespace pattern "
+            "for guided json decoding."))
 
     # doc: end-chat-completion-extra-params
 
@@ -285,6 +290,11 @@ class CompletionRequest(OpenAIBaseModel):
             "If specified, will override the default guided decoding backend "
             "of the server for this specific request. If set, must be one of "
             "'outlines' / 'lm-format-enforcer'"))
+    guided_whitespace_pattern: Optional[str] = Field(
+        default=None,
+        description=(
+            "If specified, will override the default whitespace pattern "
+            "for guided json decoding."))
 
     # doc: end-completion-extra-params
 
diff --git a/vllm/model_executor/guided_decoding/outlines_decoding.py b/vllm/model_executor/guided_decoding/outlines_decoding.py
index 53efebb604048..8403604286903 100644
--- a/vllm/model_executor/guided_decoding/outlines_decoding.py
+++ b/vllm/model_executor/guided_decoding/outlines_decoding.py
@@ -74,7 +74,8 @@ async def get_outlines_guided_decoding_logits_processor(
 
     result = await loop.run_in_executor(global_thread_pool,
                                         _get_cached_logits_processor, guide,
-                                        tokenizer, mode)
+                                        tokenizer, mode,
+                                        request.guided_whitespace_pattern)
 
     logits_processor = copy(result)
     # reset logits processor's internal state
@@ -117,9 +118,10 @@ def _get_guide_and_mode(
 @lru_cache(maxsize=32)
 def _get_cached_logits_processor(guide: str,
                                  tokenizer: PreTrainedTokenizerBase,
-                                 mode: GuidedDecodingMode):
+                                 mode: GuidedDecodingMode,
+                                 whitespace_pattern: Union[str, None]):
     if mode == GuidedDecodingMode.JSON:
-        return JSONLogitsProcessor(guide, tokenizer)
+        return JSONLogitsProcessor(guide, tokenizer, whitespace_pattern)
     elif mode == GuidedDecodingMode.REGEX or mode == GuidedDecodingMode.CHOICE:
         return RegexLogitsProcessor(guide, tokenizer)
     elif mode == GuidedDecodingMode.GRAMMAR:
diff --git a/vllm/model_executor/guided_decoding/outlines_logits_processors.py b/vllm/model_executor/guided_decoding/outlines_logits_processors.py
index 25ab5bf8b6a9c..a131c6a1b92b4 100644
--- a/vllm/model_executor/guided_decoding/outlines_logits_processors.py
+++ b/vllm/model_executor/guided_decoding/outlines_logits_processors.py
@@ -18,7 +18,7 @@
 import math
 from collections import defaultdict
 from functools import lru_cache
-from typing import Callable, DefaultDict, Dict, List, Optional, Union
+from typing import Callable, DefaultDict, Dict, List, Union
 
 import torch
 from outlines.fsm.fsm import CFGFSM, FSM, RegexFSM
@@ -80,10 +80,9 @@ def __init__(self, regex_string: str, tokenizer: PreTrainedTokenizerBase):
 
 class JSONLogitsProcessor(RegexLogitsProcessor):
 
-    def __init__(self,
-                 schema: Union[str, Dict, BaseModel],
+    def __init__(self, schema: Union[str, Dict, BaseModel],
                  tokenizer: PreTrainedTokenizerBase,
-                 whitespace_pattern: Optional[str] = None):
+                 whitespace_pattern: Union[str, None]):
         """Compile the FSM that drives the JSON-guided generation.
 
         Parameters

From d6f4bd7cddc9546c38568c92c3772d22940a09f2 Mon Sep 17 00:00:00 2001
From: Jee Li <pandaleefree@163.com>
Date: Wed, 1 May 2024 12:18:14 +0800
Subject: [PATCH 13/65] [Misc]Add customized information for models (#4132)

---
 tests/models/test_big_models.py               | 15 +++++++++++++
 tests/models/test_models.py                   | 15 +++++++++++++
 vllm/attention/layer.py                       |  7 ++++++
 vllm/model_executor/layers/activation.py      |  3 +++
 vllm/model_executor/layers/layernorm.py       |  5 +++++
 vllm/model_executor/layers/linear.py          | 22 +++++++++++++++++++
 .../model_executor/layers/logits_processor.py |  6 +++++
 .../model_executor/layers/rotary_embedding.py |  6 +++++
 .../layers/vocab_parallel_embedding.py        |  8 +++++++
 9 files changed, 87 insertions(+)

diff --git a/tests/models/test_big_models.py b/tests/models/test_big_models.py
index 504eaad43c8d7..3dde498bcd639 100644
--- a/tests/models/test_big_models.py
+++ b/tests/models/test_big_models.py
@@ -43,3 +43,18 @@ def test_models(
             f"Test{i}:\nHF: {hf_output_str!r}\nvLLM: {vllm_output_str!r}")
         assert hf_output_ids == vllm_output_ids, (
             f"Test{i}:\nHF: {hf_output_ids}\nvLLM: {vllm_output_ids}")
+
+
+@pytest.mark.parametrize("model", MODELS)
+@pytest.mark.parametrize("dtype", ["half"])
+def test_model_print(
+    vllm_runner,
+    model: str,
+    dtype: str,
+) -> None:
+    vllm_model = vllm_runner(model, dtype=dtype)
+    # This test is for verifying whether the model's extra_repr
+    # can be printed correctly.
+    print(vllm_model.model.llm_engine.model_executor.driver_worker.
+          model_runner.model)
+    del vllm_model
diff --git a/tests/models/test_models.py b/tests/models/test_models.py
index cfe2539e3a052..e4609620387fa 100644
--- a/tests/models/test_models.py
+++ b/tests/models/test_models.py
@@ -49,3 +49,18 @@ def test_models(
             f"Test{i}:\nHF: {hf_output_str!r}\nvLLM: {vllm_output_str!r}")
         assert hf_output_ids == vllm_output_ids, (
             f"Test{i}:\nHF: {hf_output_ids}\nvLLM: {vllm_output_ids}")
+
+
+@pytest.mark.parametrize("model", MODELS)
+@pytest.mark.parametrize("dtype", ["float"])
+def test_model_print(
+    vllm_runner,
+    model: str,
+    dtype: str,
+) -> None:
+    vllm_model = vllm_runner(model, dtype=dtype)
+    # This test is for verifying whether the model's extra_repr
+    # can be printed correctly.
+    print(vllm_model.model.llm_engine.model_executor.driver_worker.
+          model_runner.model)
+    del vllm_model
diff --git a/vllm/attention/layer.py b/vllm/attention/layer.py
index fc65ae108dbb1..ee7be26c0876c 100644
--- a/vllm/attention/layer.py
+++ b/vllm/attention/layer.py
@@ -47,3 +47,10 @@ def forward(
     ) -> torch.Tensor:
         return self.impl.forward(query, key, value, kv_cache, attn_metadata,
                                  kv_scale)
+
+    def extra_repr(self) -> str:
+        s = f"head_size={self.impl.head_size}"  # type: ignore
+        s += f", num_heads={self.impl.num_heads}"  # type: ignore
+        s += f", num_kv_heads={self.impl.num_kv_heads}"  # type: ignore
+        s += f", scale={self.impl.scale}"  # type: ignore
+        return s
diff --git a/vllm/model_executor/layers/activation.py b/vllm/model_executor/layers/activation.py
index baf1d4f266181..d101aa323b0e1 100644
--- a/vllm/model_executor/layers/activation.py
+++ b/vllm/model_executor/layers/activation.py
@@ -67,6 +67,9 @@ def forward(self, x: torch.Tensor) -> torch.Tensor:
             ops.gelu_tanh_and_mul(out, x)
         return out
 
+    def extra_repr(self) -> str:
+        return f'approximate={repr(self.approximate)}'
+
 
 class NewGELU(nn.Module):
 
diff --git a/vllm/model_executor/layers/layernorm.py b/vllm/model_executor/layers/layernorm.py
index a6619714b8aab..8de0794158986 100644
--- a/vllm/model_executor/layers/layernorm.py
+++ b/vllm/model_executor/layers/layernorm.py
@@ -64,3 +64,8 @@ def forward(
             self.variance_epsilon,
         )
         return out
+
+    def extra_repr(self) -> str:
+        s = f"hidden_size={self.weight.data.size(0)}"
+        s += f", eps={self.variance_epsilon}"
+        return s
diff --git a/vllm/model_executor/layers/linear.py b/vllm/model_executor/layers/linear.py
index 289b317cc991e..7726dcb9a5fbd 100644
--- a/vllm/model_executor/layers/linear.py
+++ b/vllm/model_executor/layers/linear.py
@@ -181,6 +181,12 @@ def forward(self, x: torch.Tensor) -> torch.Tensor:
         output_bias = self.bias if self.skip_bias_add else None
         return output, output_bias
 
+    def extra_repr(self) -> str:
+        s = f"in_features={self.input_size}"
+        s += f", output_features={self.output_size}"
+        s += f", bias={self.bias is not None}"
+        return s
+
 
 class ColumnParallelLinear(LinearBase):
     """Linear layer with column parallelism.
@@ -281,6 +287,14 @@ def forward(self, input_):
         output_bias = self.bias if self.skip_bias_add else None
         return output, output_bias
 
+    def extra_repr(self) -> str:
+        s = f"in_features={self.input_size}"
+        s += f", output_features={self.output_size_per_partition}"
+        s += f", bias={self.bias is not None}"
+        s += f", tp_size={get_tensor_model_parallel_world_size()}"
+        s += f", gather_output={self.gather_output}"
+        return s
+
 
 class MergedColumnParallelLinear(ColumnParallelLinear):
     """Packed linear layers with column parallelism.
@@ -685,3 +699,11 @@ def forward(self, input_):
             output = output_
             output_bias = self.bias
         return output, output_bias
+
+    def extra_repr(self) -> str:
+        s = f"input_features={self.input_size_per_partition}"
+        s += f", output_features={self.output_size}"
+        s += f", bias={self.bias is not None}"
+        s += f", tp_size={self.tp_size}"
+        s += f", reduce_results={self.reduce_results}"
+        return s
diff --git a/vllm/model_executor/layers/logits_processor.py b/vllm/model_executor/layers/logits_processor.py
index 22620d9fc86d9..91eb96998c3cf 100644
--- a/vllm/model_executor/layers/logits_processor.py
+++ b/vllm/model_executor/layers/logits_processor.py
@@ -70,6 +70,12 @@ def _get_logits(self, hidden_states: torch.Tensor, embedding: torch.Tensor,
             logits = logits[:, :self.org_vocab_size]
         return logits
 
+    def extra_repr(self) -> str:
+        s = f"vocab_size={self.vocab_size}"
+        s += f", org_vocab_size={self.org_vocab_size}"
+        s += f", scale={self.scale}, logits_as_input={self.logits_as_input}"
+        return s
+
 
 def _prune_hidden_states(
     hidden_states: torch.Tensor,
diff --git a/vllm/model_executor/layers/rotary_embedding.py b/vllm/model_executor/layers/rotary_embedding.py
index 25365a9b50a1f..857d70fadcb57 100644
--- a/vllm/model_executor/layers/rotary_embedding.py
+++ b/vllm/model_executor/layers/rotary_embedding.py
@@ -156,6 +156,12 @@ def forward(
                                  self.cos_sin_cache, self.is_neox_style)
         return query, key
 
+    def extra_repr(self) -> str:
+        s = f"head_size={self.head_size}, rotary_dim={self.rotary_dim}"
+        s += f", max_position_embeddings={self.max_position_embeddings}"
+        s += f", base={self.base}, is_neox_style={self.is_neox_style}"
+        return s
+
 
 class LinearScalingRotaryEmbedding(RotaryEmbedding):
     """RotaryEmbedding extended with linear scaling.
diff --git a/vllm/model_executor/layers/vocab_parallel_embedding.py b/vllm/model_executor/layers/vocab_parallel_embedding.py
index 088c0849243c0..4585b1679cb5c 100644
--- a/vllm/model_executor/layers/vocab_parallel_embedding.py
+++ b/vllm/model_executor/layers/vocab_parallel_embedding.py
@@ -105,6 +105,14 @@ def forward(self, input_):
         output = tensor_model_parallel_all_reduce(output_parallel)
         return output
 
+    def extra_repr(self) -> str:
+        s = f"num_embeddings={self.num_embeddings_per_partition}"
+        s += f", embedding_dim={self.embedding_dim}"
+        s += f", org_vocab_size={self.org_vocab_size}"
+        s += f', num_embeddings_padded={self.num_embeddings_padded}'
+        s += f', tp_size={self.tp_size}'
+        return s
+
 
 class ParallelLMHead(VocabParallelEmbedding):
     """Parallelized LM head.

From 6f1df80436c46175e09f660a99075a5eba3a2273 Mon Sep 17 00:00:00 2001
From: SangBin Cho <rkooo567@gmail.com>
Date: Wed, 1 May 2024 21:45:42 +0900
Subject: [PATCH 14/65] [Test] Add ignore_eos test  (#4519)

---
 tests/samplers/test_ignore_eos.py | 31 +++++++++++++++++++++++++++++++
 1 file changed, 31 insertions(+)
 create mode 100644 tests/samplers/test_ignore_eos.py

diff --git a/tests/samplers/test_ignore_eos.py b/tests/samplers/test_ignore_eos.py
new file mode 100644
index 0000000000000..864657a3c2b28
--- /dev/null
+++ b/tests/samplers/test_ignore_eos.py
@@ -0,0 +1,31 @@
+"""Make sure ignore_eos works.
+
+Run `pytest tests/samplers/test_ignore_eos.py`.
+"""
+
+import pytest
+
+from vllm import SamplingParams
+
+MODELS = ["facebook/opt-125m"]
+
+
+@pytest.mark.parametrize("model", MODELS)
+@pytest.mark.parametrize("dtype", ["half"])
+@pytest.mark.parametrize("max_tokens", [1024])
+def test_beam_search_single_input(
+    vllm_runner,
+    example_prompts,
+    model: str,
+    dtype: str,
+    max_tokens: int,
+) -> None:
+    example_prompts = "1 + 1 is"
+
+    vllm_model = vllm_runner(model, dtype=dtype)
+    sampling_params = SamplingParams(max_tokens=max_tokens, ignore_eos=True)
+    ignore_eos_output = vllm_model.model.generate(
+        example_prompts, sampling_params=sampling_params)
+    print(len(ignore_eos_output[0].outputs[0].token_ids))
+    assert max_tokens - len(ignore_eos_output[0].outputs[0].token_ids) < 10
+    assert max_tokens - len(ignore_eos_output[0].outputs[0].token_ids) >= 0

From a88bb9b032d75aad74b2e1bd3d97b8e8a24e8b9d Mon Sep 17 00:00:00 2001
From: AnyISalIn <anyisalin@gmail.com>
Date: Thu, 2 May 2024 00:11:03 +0800
Subject: [PATCH 15/65] [Bugfix] Fix the fp8 kv_cache check error that occurs
 when failing to obtain the CUDA version. (#4173)

Signed-off-by: AnyISalIn <anyisalin@gmail.com>
---
 vllm/config.py | 3 ++-
 1 file changed, 2 insertions(+), 1 deletion(-)

diff --git a/vllm/config.py b/vllm/config.py
index a5512c657e038..db4398addae3c 100644
--- a/vllm/config.py
+++ b/vllm/config.py
@@ -353,7 +353,8 @@ def _verify_cache_dtype(self) -> None:
         elif self.cache_dtype == "fp8":
             if not is_hip():
                 nvcc_cuda_version = get_nvcc_cuda_version()
-                if nvcc_cuda_version < Version("11.8"):
+                if nvcc_cuda_version is not None \
+                        and nvcc_cuda_version < Version("11.8"):
                     raise ValueError(
                         "FP8 is not supported when cuda version is"
                         "lower than 11.8.")

From 4dc8026d8614185ece28dd3fcd82aa0dabb4f79c Mon Sep 17 00:00:00 2001
From: Robert Shaw
 <114415538+robertgshaw2-neuralmagic@users.noreply.github.com>
Date: Wed, 1 May 2024 12:14:13 -0400
Subject: [PATCH 16/65] [Bugfix] Fix 307 Redirect for `/metrics` (#4523)

---
 vllm/engine/metrics.py                | 2 +-
 vllm/entrypoints/openai/api_server.py | 8 ++++++--
 2 files changed, 7 insertions(+), 3 deletions(-)

diff --git a/vllm/engine/metrics.py b/vllm/engine/metrics.py
index 45bfad03ec867..3c4aac91549a9 100644
--- a/vllm/engine/metrics.py
+++ b/vllm/engine/metrics.py
@@ -119,7 +119,7 @@ def __init__(self, labelnames: List[str], max_model_len: int):
             buckets=[1, 2, 5, 10, 20],
         )
         self.counter_request_success = Counter(
-            name="vllm:request_success",
+            name="vllm:request_success_total",
             documentation="Count of successfully processed requests.",
             labelnames=labelnames + [Metrics.labelname_finish_reason])
 
diff --git a/vllm/entrypoints/openai/api_server.py b/vllm/entrypoints/openai/api_server.py
index af9ba7a3bc825..40103f70a31a3 100644
--- a/vllm/entrypoints/openai/api_server.py
+++ b/vllm/entrypoints/openai/api_server.py
@@ -2,6 +2,7 @@
 import importlib
 import inspect
 import os
+import re
 from contextlib import asynccontextmanager
 from http import HTTPStatus
 
@@ -12,6 +13,7 @@
 from fastapi.middleware.cors import CORSMiddleware
 from fastapi.responses import JSONResponse, Response, StreamingResponse
 from prometheus_client import make_asgi_app
+from starlette.routing import Mount
 
 import vllm
 from vllm.engine.arg_utils import AsyncEngineArgs
@@ -55,8 +57,10 @@ def parse_args():
 
 
 # Add prometheus asgi middleware to route /metrics requests
-metrics_app = make_asgi_app()
-app.mount("/metrics", metrics_app)
+route = Mount("/metrics", make_asgi_app())
+# Workaround for 307 Redirect for /metrics
+route.path_regex = re.compile('^/metrics(?P<path>.*)$')
+app.routes.append(route)
 
 
 @app.exception_handler(RequestValidationError)

From e491c7e053e5d774f321612b3a400ca2fb424d32 Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Fr=CE=B1n=C3=A7ois?= <francois.paupier@gmail.com>
Date: Wed, 1 May 2024 19:14:16 +0200
Subject: [PATCH 17/65] [Doc] update(example model): for OpenAI compatible
 serving (#4503)

---
 docs/source/serving/openai_compatible_server.md | 8 ++++----
 1 file changed, 4 insertions(+), 4 deletions(-)

diff --git a/docs/source/serving/openai_compatible_server.md b/docs/source/serving/openai_compatible_server.md
index 388b5daa79a92..c157d8ba998da 100644
--- a/docs/source/serving/openai_compatible_server.md
+++ b/docs/source/serving/openai_compatible_server.md
@@ -4,7 +4,7 @@ vLLM provides an HTTP server that implements OpenAI's [Completions](https://plat
 
 You can start the server using Python, or using [Docker](deploying_with_docker.rst):
 ```bash
-python -m vllm.entrypoints.openai.api_server --model mistralai/Mistral-7B-Instruct-v0.2 --dtype auto --api-key token-abc123
+python -m vllm.entrypoints.openai.api_server --model NousResearch/Meta-Llama-3-8B-Instruct --dtype auto --api-key token-abc123
 ```
 
 To call the server, you can use the official OpenAI Python client library, or any other HTTP client.
@@ -16,7 +16,7 @@ client = OpenAI(
 )
 
 completion = client.chat.completions.create(
-  model="mistralai/Mistral-7B-Instruct-v0.2",
+  model="NousResearch/Meta-Llama-3-8B-Instruct",
   messages=[
     {"role": "user", "content": "Hello!"}
   ]
@@ -37,7 +37,7 @@ Or directly merge them into the JSON payload if you are using HTTP call directly
 
 ```python
 completion = client.chat.completions.create(
-  model="mistralai/Mistral-7B-Instruct-v0.2",
+  model="NousResearch/Meta-Llama-3-8B-Instruct",
   messages=[
     {"role": "user", "content": "Classify this sentiment: vLLM is wonderful!"}
   ],
@@ -87,7 +87,7 @@ In order for the language model to support chat protocol, vLLM requires the mode
 a chat template in its tokenizer configuration. The chat template is a Jinja2 template that
 specifies how are roles, messages, and other chat-specific tokens are encoded in the input.
 
-An example chat template for `mistralai/Mistral-7B-Instruct-v0.2` can be found [here](https://huggingface.co/mistralai/Mistral-7B-Instruct-v0.2#instruction-format)
+An example chat template for `NousResearch/Meta-Llama-3-8B-Instruct` can be found [here](https://github.com/meta-llama/llama3?tab=readme-ov-file#instruction-tuned-models)
 
 Some models do not provide a chat template even though they are instruction/chat fine-tuned. For those model,
 you can manually specify their chat template in the `--chat-template` parameter with the file path to the chat

From 69909126a7f6fb1e3254dc0dec87dc6e78e1a0e2 Mon Sep 17 00:00:00 2001
From: sasha0552 <admin@sasha0552.org>
Date: Wed, 1 May 2024 17:41:17 +0000
Subject: [PATCH 18/65] [Bugfix] Use random seed if seed is -1 (#4531)

---
 vllm/sampling_params.py | 5 ++++-
 1 file changed, 4 insertions(+), 1 deletion(-)

diff --git a/vllm/sampling_params.py b/vllm/sampling_params.py
index f6e7a3ca792e4..5fa94eb149ffb 100644
--- a/vllm/sampling_params.py
+++ b/vllm/sampling_params.py
@@ -139,7 +139,10 @@ def __init__(
         self.top_p = top_p
         self.top_k = top_k
         self.min_p = min_p
-        self.seed = seed
+        if seed == -1:
+            self.seed = None
+        else:
+            self.seed = seed
         self.use_beam_search = use_beam_search
         self.length_penalty = length_penalty
         self.early_stopping = early_stopping

From 8b798eec75cde6eb6fe65b5d673dd9bd4eaef799 Mon Sep 17 00:00:00 2001
From: Travis Johnson <tsjohnso@us.ibm.com>
Date: Wed, 1 May 2024 12:01:50 -0600
Subject: [PATCH 19/65] [CI/Build][Bugfix] VLLM_USE_PRECOMPILED should skip
 compilation (#4534)

Signed-off-by: Travis Johnson <tsjohnso@us.ibm.com>
---
 setup.py | 1 +
 1 file changed, 1 insertion(+)

diff --git a/setup.py b/setup.py
index a47b14ffcfc6e..d534cec437261 100644
--- a/setup.py
+++ b/setup.py
@@ -378,6 +378,7 @@ def _read_requirements(filename: str) -> List[str]:
     "vllm": ["py.typed", "model_executor/layers/fused_moe/configs/*.json"]
 }
 if os.environ.get("VLLM_USE_PRECOMPILED"):
+    ext_modules = []
     package_data["vllm"].append("*.so")
 
 setup(

From b38e42fbca978d62cc8330bdcf8da91c72cb2ebc Mon Sep 17 00:00:00 2001
From: leiwen83 <leiwen83@users.noreply.github.com>
Date: Thu, 2 May 2024 02:13:03 +0800
Subject: [PATCH 20/65] [Speculative decoding] Add ngram prompt lookup decoding
 (#4237)

Co-authored-by: Lei Wen <wenlei03@qiyi.com>
---
 tests/spec_decode/e2e/conftest.py             |  58 +++++
 ...tness.py => test_multistep_correctness.py} |  60 +----
 .../spec_decode/e2e/test_ngram_correctness.py | 172 ++++++++++++++
 tests/spec_decode/test_multi_step_worker.py   |  50 ++---
 tests/spec_decode/test_ngram_worker.py        | 206 +++++++++++++++++
 vllm/config.py                                |  87 +++++---
 vllm/engine/arg_utils.py                      |  18 ++
 vllm/executor/gpu_executor.py                 |   8 +-
 vllm/spec_decode/batch_expansion.py           |   4 +-
 vllm/spec_decode/multi_step_worker.py         | 209 ++----------------
 vllm/spec_decode/ngram_worker.py              | 190 ++++++++++++++++
 vllm/spec_decode/spec_decode_worker.py        |  45 ++--
 vllm/spec_decode/top1_proposer.py             | 200 +++++++++++++++++
 vllm/spec_decode/util.py                      |  16 +-
 14 files changed, 1004 insertions(+), 319 deletions(-)
 rename tests/spec_decode/e2e/{test_correctness.py => test_multistep_correctness.py} (88%)
 create mode 100644 tests/spec_decode/e2e/test_ngram_correctness.py
 create mode 100644 tests/spec_decode/test_ngram_worker.py
 create mode 100644 vllm/spec_decode/ngram_worker.py
 create mode 100644 vllm/spec_decode/top1_proposer.py

diff --git a/tests/spec_decode/e2e/conftest.py b/tests/spec_decode/e2e/conftest.py
index 5d3469c4210ee..0eb784a9c5ac5 100644
--- a/tests/spec_decode/e2e/conftest.py
+++ b/tests/spec_decode/e2e/conftest.py
@@ -1,4 +1,5 @@
 import asyncio
+from itertools import cycle
 from typing import List, Optional, Tuple, Union
 
 import pytest
@@ -185,3 +186,60 @@ def get_output_from_llm_generator(
         del llm
 
     return tokens, token_ids
+
+
+def run_greedy_equality_correctness_test(baseline_llm_generator,
+                                         test_llm_generator,
+                                         batch_size,
+                                         max_output_len,
+                                         force_output_len: bool,
+                                         print_tokens: bool = False):
+    """Helper method that compares the outputs of both the baseline LLM and
+    the test LLM. It asserts greedy equality, e.g. that the outputs are exactly
+    the same when temperature is zero.
+    """
+    temperature = 0.0
+
+    prompts = [
+        "Hello, my name is",
+        "The president of the United States is",
+        "The capital of France is",
+        "The future of AI is",
+        "San Francisco is know for its",
+        "Facebook was created in 2004 by",
+        "Curious George is a",
+        "Python 3.11 brings improvements to its",
+    ]
+
+    prompts = [prompt for prompt, _ in zip(cycle(prompts), range(batch_size))]
+
+    # If the test requires that we generated max_output_len tokens, then set the
+    # sampling params to ignore eos token.
+    ignore_eos = force_output_len
+
+    sampling_params = SamplingParams(
+        max_tokens=max_output_len,
+        ignore_eos=ignore_eos,
+        temperature=temperature,
+    )
+
+    spec_batch_tokens, spec_batch_token_ids = get_output_from_llm_generator(
+        test_llm_generator, prompts, sampling_params)
+
+    (baseline_batch_tokens,
+     baseline_batch_token_ids) = get_output_from_llm_generator(
+         baseline_llm_generator, prompts, sampling_params)
+
+    assert len(baseline_batch_token_ids) == len(prompts)
+    assert len(spec_batch_token_ids) == len(prompts)
+
+    for i, (baseline_token_ids, baseline_tokens, spec_token_ids,
+            spec_tokens) in enumerate(
+                zip(baseline_batch_token_ids, baseline_batch_tokens,
+                    spec_batch_token_ids, spec_batch_tokens)):
+        if print_tokens:
+            print(f'{i=} {baseline_tokens=}')
+            print(f'{i=}     {spec_tokens=}')
+        print(f'{i=} {baseline_token_ids=}')
+        print(f'{i=}     {spec_token_ids=}')
+        assert baseline_token_ids == spec_token_ids
diff --git a/tests/spec_decode/e2e/test_correctness.py b/tests/spec_decode/e2e/test_multistep_correctness.py
similarity index 88%
rename from tests/spec_decode/e2e/test_correctness.py
rename to tests/spec_decode/e2e/test_multistep_correctness.py
index ab8d913fb894a..f99e0f6778e59 100644
--- a/tests/spec_decode/e2e/test_correctness.py
+++ b/tests/spec_decode/e2e/test_multistep_correctness.py
@@ -35,7 +35,8 @@
 
 from vllm import SamplingParams
 
-from .conftest import get_output_from_llm_generator
+from .conftest import (get_output_from_llm_generator,
+                       run_greedy_equality_correctness_test)
 
 
 @pytest.mark.parametrize(
@@ -545,60 +546,3 @@ def test_many_k(baseline_llm_generator, test_llm_generator, batch_size: int,
                                          batch_size,
                                          max_output_len=output_len,
                                          force_output_len=True)
-
-
-def run_greedy_equality_correctness_test(baseline_llm_generator,
-                                         test_llm_generator,
-                                         batch_size,
-                                         max_output_len,
-                                         force_output_len: bool,
-                                         print_tokens: bool = False):
-    """Helper method that compares the outputs of both the baseline LLM and
-    the test LLM. It asserts greedy equality, e.g. that the outputs are exactly
-    the same when temperature is zero.
-    """
-    temperature = 0.0
-
-    prompts = [
-        "Hello, my name is",
-        "The president of the United States is",
-        "The capital of France is",
-        "The future of AI is",
-        "San Francisco is know for its",
-        "Facebook was created in 2004 by",
-        "Curious George is a",
-        "Python 3.11 brings improvements to its",
-    ]
-
-    prompts = [prompt for prompt, _ in zip(cycle(prompts), range(batch_size))]
-
-    # If the test requires that we generated max_output_len tokens, then set the
-    # sampling params to ignore eos token.
-    ignore_eos = force_output_len
-
-    sampling_params = SamplingParams(
-        max_tokens=max_output_len,
-        ignore_eos=ignore_eos,
-        temperature=temperature,
-    )
-
-    spec_batch_tokens, spec_batch_token_ids = get_output_from_llm_generator(
-        test_llm_generator, prompts, sampling_params)
-
-    (baseline_batch_tokens,
-     baseline_batch_token_ids) = get_output_from_llm_generator(
-         baseline_llm_generator, prompts, sampling_params)
-
-    assert len(baseline_batch_token_ids) == len(prompts)
-    assert len(spec_batch_token_ids) == len(prompts)
-
-    for i, (baseline_token_ids, baseline_tokens, spec_token_ids,
-            spec_tokens) in enumerate(
-                zip(baseline_batch_token_ids, baseline_batch_tokens,
-                    spec_batch_token_ids, spec_batch_tokens)):
-        if print_tokens:
-            print(f'{i=} {baseline_tokens=}')
-            print(f'{i=}     {spec_tokens=}')
-        print(f'{i=} {baseline_token_ids=}')
-        print(f'{i=}     {spec_token_ids=}')
-        assert baseline_token_ids == spec_token_ids
diff --git a/tests/spec_decode/e2e/test_ngram_correctness.py b/tests/spec_decode/e2e/test_ngram_correctness.py
new file mode 100644
index 0000000000000..44ef400c91d34
--- /dev/null
+++ b/tests/spec_decode/e2e/test_ngram_correctness.py
@@ -0,0 +1,172 @@
+"""This docstring details important information on the testing methodology.
+
+Most of the tests rely on "greedy equality", where we expect the output of
+speculative decoding on a sequence to exactly match the output of normal non-
+speculative decoding.
+
+Since speculative decoding with rejection sampling guarantees that the output
+distribution matches the target model's output distribution (up to hardware
+numerics, see https://arxiv.org/pdf/2302.01318.pdf), we can expect greedy
+equality.
+
+For ngram lookup, the idea comes from https://github.com/apoorvumang/prompt-lookup-decoding,
+and was merged into the transformers code base: https://github.com/huggingface/transformers/pull/27775.
+Since no model is needed to generate the proposals, we can make
+the test cases much simpler than the draft-model multi-step ones.
+
+However, we still need to verify that the following scenarios pass:
+    * Batch size 1 greedy equality
+    * Batch size >1 greedy equality
+    * Test greedy equality under preemption
+    * Test greedy equality under various ngram sizes / speculative sizes
+
+With those tests, we can at least say that ngram spec decode does not break the
+correctness of the target model outputs.
+"""
+
+import pytest
+
+from .conftest import run_greedy_equality_correctness_test
+
+
+@pytest.mark.parametrize(
+    "common_llm_kwargs",
+    [{
+        # Skip cuda graph recording for fast test.
+        "enforce_eager": True,
+
+        # Required for spec decode.
+        "use_v2_block_manager": True,
+
+        # Print spec metrics.
+        "disable_log_stats": False,
+    }])
+@pytest.mark.parametrize("per_test_common_llm_kwargs", [
+    {
+        "model": "JackFram/llama-68m",
+    },
+])
+@pytest.mark.parametrize("baseline_llm_kwargs", [{}])
+@pytest.mark.parametrize("test_llm_kwargs", [
+    {
+        "speculative_model": "[ngram]",
+        "num_speculative_tokens": 5,
+        "ngram_prompt_lookup_max": 3,
+    },
+])
+@pytest.mark.parametrize("output_len", [
+    256,
+])
+@pytest.mark.parametrize("batch_size", [1, 64])
+@pytest.mark.parametrize("seed", [1])
+def test_ngram_e2e_greedy_correctness(baseline_llm_generator,
+                                      test_llm_generator, batch_size: int,
+                                      output_len: int):
+    """Verify greedy equality on a tiny model with different batch size."""
+    run_greedy_equality_correctness_test(baseline_llm_generator,
+                                         test_llm_generator,
+                                         batch_size,
+                                         max_output_len=output_len,
+                                         force_output_len=True)
+
+
+@pytest.mark.parametrize(
+    "common_llm_kwargs",
+    [{
+        "block_size": 8,
+        # 2 for small prompt, 256//8 for generated.
+        "num_gpu_blocks_override": 2 + 256 // 8,
+        "max_model_len": (2 + 256 // 8) * 8,
+
+        # Skip cuda graph recording for fast test.
+        "enforce_eager": True,
+
+        # Required for spec decode.
+        "use_v2_block_manager": True
+    }])
+@pytest.mark.parametrize("per_test_common_llm_kwargs", [
+    {
+        "model": "JackFram/llama-160m",
+    },
+])
+@pytest.mark.parametrize("baseline_llm_kwargs", [{}])
+@pytest.mark.parametrize("test_llm_kwargs", [
+    {
+        "speculative_model": "[ngram]",
+        "num_speculative_tokens": 5,
+        "ngram_prompt_lookup_max": 3,
+    },
+])
+@pytest.mark.parametrize(
+    "output_len",
+    [
+        # Use small output len for fast test.
+        256,
+    ])
+@pytest.mark.parametrize("batch_size", [4])
+@pytest.mark.parametrize("seed", [1])
+def test_ngram_e2e_greedy_correctness_with_preemption(baseline_llm_generator,
+                                                      test_llm_generator,
+                                                      batch_size: int,
+                                                      output_len: int):
+    """Verify greedy equality, even when some sequences are preempted mid-
+    generation.
+    """
+    run_greedy_equality_correctness_test(baseline_llm_generator,
+                                         test_llm_generator,
+                                         batch_size,
+                                         max_output_len=output_len,
+                                         force_output_len=True)
+
+
+@pytest.mark.parametrize(
+    "common_llm_kwargs",
+    [{
+        "model": "JackFram/llama-68m",
+
+        # Skip cuda graph recording for fast test.
+        "enforce_eager": True,
+
+        # Required for spec decode.
+        "use_v2_block_manager": True
+    }])
+@pytest.mark.parametrize("per_test_common_llm_kwargs", [{}])
+@pytest.mark.parametrize("baseline_llm_kwargs", [{}])
+@pytest.mark.parametrize(
+    "test_llm_kwargs",
+    [
+        {
+            "speculative_model": "[ngram]",
+            "num_speculative_tokens": k,
+            "ngram_prompt_lookup_max": 3,
+        }
+        # Try a range of common k values.
+        for k in [1, 3, 5]
+    ] + [
+        {
+            "speculative_model": "[ngram]",
+            "num_speculative_tokens": k,
+            "ngram_prompt_lookup_max": 1,
+        }
+        # Try a range of common k values.
+        for k in [1, 3, 5]
+    ])
+@pytest.mark.parametrize("batch_size", [2])
+@pytest.mark.parametrize(
+    "output_len",
+    [
+        # Use smaller output len for fast test.
+        32,
+    ])
+@pytest.mark.parametrize("seed", [1])
+def test_ngram_different_k(baseline_llm_generator, test_llm_generator,
+                           batch_size: int, output_len: int):
+    """Verify that ngram speculative decoding produces exactly the same
+    output as decoding without spec decode, for many different values of k
+    and different ngram_prompt_lookup_max.
+    """
+    run_greedy_equality_correctness_test(baseline_llm_generator,
+                                         test_llm_generator,
+                                         batch_size,
+                                         max_output_len=output_len,
+                                         force_output_len=True)
diff --git a/tests/spec_decode/test_multi_step_worker.py b/tests/spec_decode/test_multi_step_worker.py
index e7aaa1ff4eff8..98f2731de9aa3 100644
--- a/tests/spec_decode/test_multi_step_worker.py
+++ b/tests/spec_decode/test_multi_step_worker.py
@@ -6,8 +6,8 @@
 
 from vllm.model_executor.utils import set_random_seed
 from vllm.sequence import SamplerOutput
-from vllm.spec_decode.multi_step_worker import (DraftModelTop1Proposer,
-                                                MultiStepWorker)
+from vllm.spec_decode.multi_step_worker import MultiStepWorker
+from vllm.spec_decode.top1_proposer import Top1Proposer
 from vllm.worker.worker import Worker
 
 from .utils import (assert_logprobs_dict_allclose, create_batch,
@@ -117,8 +117,8 @@ def test_same_output_for_single_step():
 
     zero_kv_cache(multi_step_worker.cache_engine)
     set_random_seed(seed)
-    actual_output = multi_step_worker.execute_model_multi_step(
-        **multi_step_execute_model_data.to_dict(), num_steps=num_steps)
+    actual_output, _ = multi_step_worker.sampler_output(
+        **multi_step_execute_model_data.to_dict(), sample_len=num_steps)
     assert len(actual_output) == num_steps
     actual_output = actual_output[0]
 
@@ -200,8 +200,8 @@ def test_same_output_for_multi_step():
     # Run multi-step.
     zero_kv_cache(multi_step_worker.cache_engine)
     set_random_seed(seed)
-    multi_step_output = multi_step_worker.execute_model_multi_step(
-        **execute_model_data.to_dict(), num_steps=num_steps)
+    multi_step_output, _ = multi_step_worker.sampler_output(
+        **execute_model_data.to_dict(), sample_len=num_steps)
 
     # Run single-step repeatedly.
     zero_kv_cache(worker.cache_engine)
@@ -266,7 +266,7 @@ def test_same_output_for_multi_step():
 
 @torch.inference_mode()
 def test_draft_proposals_full_speculation_len():
-    """Verify DraftModelTop1Proposer correctly handles case where all sequences
+    """Verify Top1Proposer correctly handles case where all sequences
     can speculate.
     """
     k = 10
@@ -275,13 +275,13 @@ def test_draft_proposals_full_speculation_len():
     device = 'cuda:0'
 
     draft_worker = MagicMock()
-    proposer = DraftModelTop1Proposer(
-        draft_worker=draft_worker,
+    proposer = Top1Proposer(
+        worker=draft_worker,
         device=device,
-        max_model_len=2048,
         vocab_size=vocab_size,
+        max_proposal_len=2048,
     )
-    draft_worker.execute_model_multi_step.return_value = [
+    draft_worker.sampler_output.return_value = [
         SamplerOutput(
             outputs=[],
             sampled_token_probs=torch.rand(batch_size,
@@ -294,13 +294,13 @@ def test_draft_proposals_full_speculation_len():
                                             device=device,
                                             dtype=torch.long),
         ) for _ in range(k)
-    ]
+    ], True
 
     execute_model_data, _, _ = create_batch(batch_size, k)
 
     proposals = proposer.get_proposals(
         **execute_model_data.to_dict(),
-        max_proposal_len=k,
+        proposal_len=k,
     )
 
     assert torch.is_tensor(proposals.proposal_token_ids)
@@ -315,7 +315,7 @@ def test_draft_proposals_full_speculation_len():
 
 @torch.inference_mode()
 def test_draft_proposals_no_speculations():
-    """Verify DraftModelTop1Proposer correctly handles case where no sequences
+    """Verify Top1Proposer correctly handles case where no sequences
     can speculate.
     """
     k = 10
@@ -325,11 +325,11 @@ def test_draft_proposals_no_speculations():
     prompt_len = 10
 
     draft_worker = MagicMock()
-    proposer = DraftModelTop1Proposer(
-        draft_worker=draft_worker,
+    proposer = Top1Proposer(
+        worker=draft_worker,
         device=device,
-        max_model_len=prompt_len + k - 1,
         vocab_size=vocab_size,
+        max_proposal_len=prompt_len + k - 1,
     )
 
     execute_model_data, _, _ = create_batch(batch_size,
@@ -338,7 +338,7 @@ def test_draft_proposals_no_speculations():
 
     proposals = proposer.get_proposals(
         **execute_model_data.to_dict(),
-        max_proposal_len=k,
+        proposal_len=k,
     )
 
     assert torch.is_tensor(proposals.proposal_token_ids)
@@ -353,7 +353,7 @@ def test_draft_proposals_no_speculations():
 
 @torch.inference_mode()
 def test_draft_proposals_mixed_k():
-    """Verify DraftModelTop1Proposer correctly handles case some sequences can
+    """Verify Top1Proposer correctly handles case some sequences can
     speculate and some can't.
     """
     k = 10
@@ -374,14 +374,14 @@ def test_draft_proposals_mixed_k():
          for _ in range(expected_num_no_proposal_seqs)] + [small_prompt_len]
 
     draft_worker = MagicMock()
-    proposer = DraftModelTop1Proposer(
-        draft_worker=draft_worker,
+    proposer = Top1Proposer(
+        worker=draft_worker,
         device=device,
-        max_model_len=long_prompt_len + prev_output_token_len + k - 1,
         vocab_size=vocab_size,
+        max_proposal_len=long_prompt_len + prev_output_token_len + k - 1,
     )
 
-    draft_worker.execute_model_multi_step.return_value = [
+    draft_worker.sampler_output.return_value = [
         SamplerOutput(
             outputs=[],
             sampled_token_probs=torch.rand(expected_num_proposal_seqs,
@@ -395,7 +395,7 @@ def test_draft_proposals_mixed_k():
                 device=device,
                 dtype=torch.long),
         ) for _ in range(k)
-    ]
+    ], True
 
     execute_model_data, _, _ = create_batch(
         batch_size,
@@ -406,7 +406,7 @@ def test_draft_proposals_mixed_k():
 
     proposals = proposer.get_proposals(
         **execute_model_data.to_dict(),
-        max_proposal_len=k,
+        proposal_len=k,
     )
 
     assert torch.is_tensor(proposals.proposal_token_ids)
diff --git a/tests/spec_decode/test_ngram_worker.py b/tests/spec_decode/test_ngram_worker.py
new file mode 100644
index 0000000000000..ee4135015713d
--- /dev/null
+++ b/tests/spec_decode/test_ngram_worker.py
@@ -0,0 +1,206 @@
+import torch
+
+from vllm.spec_decode.ngram_worker import NGramWorker
+from vllm.spec_decode.top1_proposer import Top1Proposer
+
+from .utils import (create_execute_model_data,
+                    create_seq_group_metadata_from_prompts, create_worker)
+
+
+def test_ngram_algo_correctness_for_single_no_match():
+    """Verify our ngram algo finds the right candidate in the prompt.
+
+    Scenario: a batch with a single sequence for which no candidate is found.
+    """
+    block_size = 32
+    num_gpu_blocks = 2048 // block_size
+    seed = 100
+    model_name = 'JackFram/llama-68m'
+    vocab_size = 32_000
+    device = 'cuda:0'
+
+    ngram_worker = create_worker(
+        NGramWorker,
+        model_name,
+        block_size,
+        num_gpu_blocks,
+        seed,
+    )
+
+    proposer = Top1Proposer(
+        worker=ngram_worker,
+        device=device,
+        vocab_size=vocab_size,
+        max_proposal_len=20,
+    )
+
+    # set ngram window (0, 3], which is window=1/2/3
+    ngram_worker.set_ngram_window_size(0, 3)
+
+    prompts = [
+        # shall find no candidate
+        [1, 2, 3, 4, 5, 6, 7],
+    ]
+
+    proposal_len = 5
+    final_seq_lens = [len(prompt) + proposal_len for prompt in prompts]
+    ngram_sampler_output_data = create_execute_model_data(
+        seq_group_metadata_list=create_seq_group_metadata_from_prompts(
+            prompts, num_gpu_blocks, block_size,
+            final_seq_lens=final_seq_lens))
+
+    proposals = proposer.get_proposals(
+        **ngram_sampler_output_data.to_dict(),
+        proposal_len=proposal_len,
+    )
+
+    assert torch.is_tensor(proposals.proposal_token_ids)
+    assert torch.is_tensor(proposals.proposal_probs)
+
+    assert proposals.proposal_token_ids.shape == torch.Size([1, proposal_len])
+    assert proposals.proposal_probs.shape[:-1] == torch.Size([1, proposal_len])
+    assert proposals.proposal_lens.shape == torch.Size([1])
+    assert proposals.proposal_lens.tolist() == [0]
+
+
+def test_ngram_algo_correctness_for_batches_not_match_all():
+    """Verify our ngram algo finds the right candidates in the prompts.
+
+    Scenario: candidates are found for some, but not all, sequences in the batch.
+    """
+    block_size = 32
+    num_gpu_blocks = 2048 // block_size
+    seed = 100
+    model_name = 'JackFram/llama-68m'
+    vocab_size = 32_000
+    device = 'cuda:0'
+
+    ngram_worker = create_worker(
+        NGramWorker,
+        model_name,
+        block_size,
+        num_gpu_blocks,
+        seed,
+    )
+
+    proposer = Top1Proposer(
+        worker=ngram_worker,
+        device=device,
+        vocab_size=vocab_size,
+        max_proposal_len=20,
+    )
+
+    # set ngram window (0, 3], which is window=1/2/3
+    ngram_worker.set_ngram_window_size(0, 3)
+
+    prompts = [
+        # shall find no candidate
+        [1, 2, 3, 4, 5, 6, 7],
+        # shall find candidate 12,13,14,15,16
+        [11, 12, 13, 14, 15, 16, 11],
+        # shall find candidate 23,24,25,26,21
+        [21, 21, 22, 23, 24, 25, 26, 21, 22],
+        # shall find candidate 34,35,36,37,38
+        [31, 32, 31, 32, 33, 34, 35, 36, 37, 38, 31, 32, 33],
+        # shall find no candidate as it exceeds max_proposal_len
+        [
+            31, 32, 31, 32, 31, 32, 31, 32, 31, 32, 31, 32, 33, 34, 35, 36, 37,
+            38, 31, 32, 33
+        ],
+    ]
+
+    proposal_len = 5
+    final_seq_lens = [len(prompt) + proposal_len for prompt in prompts]
+    ngram_sampler_output_data = create_execute_model_data(
+        seq_group_metadata_list=create_seq_group_metadata_from_prompts(
+            prompts, num_gpu_blocks, block_size,
+            final_seq_lens=final_seq_lens))
+
+    proposals = proposer.get_proposals(
+        **ngram_sampler_output_data.to_dict(),
+        proposal_len=proposal_len,
+    )
+
+    assert torch.is_tensor(proposals.proposal_token_ids)
+    assert torch.is_tensor(proposals.proposal_probs)
+
+    assert proposals.proposal_token_ids.shape == torch.Size([5, proposal_len])
+    assert proposals.proposal_probs.shape[:-1] == torch.Size([5, proposal_len])
+    assert proposals.proposal_lens.shape == torch.Size([5])
+
+    assert proposals.proposal_lens.tolist(
+    ) == [proposal_len for _ in range(4)] + [0]
+
+    for i in range(proposal_len):
+        assert proposals.proposal_token_ids[0][i] == 0
+        assert proposals.proposal_token_ids[1][i] == prompts[1][i + 1]
+        assert proposals.proposal_token_ids[2][i] == prompts[2][i + 3]
+        assert proposals.proposal_token_ids[3][i] == prompts[3][i + 5]
+        assert proposals.proposal_token_ids[4][i] == -1
+
+
+def test_ngram_algo_correctness_for_batches_match_all():
+    """Verify our ngram algo finds the right candidates in the prompts.
+
+    Scenario: candidates are found for every sequence in the batch.
+    """
+
+    block_size = 32
+    num_gpu_blocks = 2048 // block_size
+    seed = 100
+    model_name = 'JackFram/llama-68m'
+    vocab_size = 32_000
+    device = 'cuda:0'
+
+    ngram_worker = create_worker(
+        NGramWorker,
+        model_name,
+        block_size,
+        num_gpu_blocks,
+        seed,
+    )
+
+    proposer = Top1Proposer(
+        worker=ngram_worker,
+        device=device,
+        vocab_size=vocab_size,
+        max_proposal_len=20,
+    )
+
+    # set ngram window (0, 3], which is window=1/2/3
+    ngram_worker.set_ngram_window_size(0, 3)
+
+    prompts = [
+        # shall find candidate 12,13,14,15,16
+        [11, 12, 13, 14, 15, 16, 11],
+        # shall find candidate 23,24,25,26,21
+        [21, 21, 22, 23, 24, 25, 26, 21, 22],
+        # shall find candidate 34,35,36,37,38
+        [31, 32, 31, 32, 33, 34, 35, 36, 37, 38, 31, 32, 33],
+    ]
+
+    proposal_len = 5
+    final_seq_lens = [len(prompt) + proposal_len for prompt in prompts]
+    ngram_sampler_output_data = create_execute_model_data(
+        seq_group_metadata_list=create_seq_group_metadata_from_prompts(
+            prompts, num_gpu_blocks, block_size,
+            final_seq_lens=final_seq_lens))
+
+    proposals = proposer.get_proposals(
+        **ngram_sampler_output_data.to_dict(),
+        proposal_len=proposal_len,
+    )
+
+    assert torch.is_tensor(proposals.proposal_token_ids)
+    assert torch.is_tensor(proposals.proposal_probs)
+
+    assert proposals.proposal_token_ids.shape == torch.Size([3, proposal_len])
+    assert proposals.proposal_probs.shape[:-1] == torch.Size([3, proposal_len])
+    assert proposals.proposal_lens.shape == torch.Size([3])
+
+    assert proposals.proposal_lens.tolist() == [proposal_len for _ in range(3)]
+
+    for i in range(proposal_len):
+        assert proposals.proposal_token_ids[0][i] == prompts[0][i + 1]
+        assert proposals.proposal_token_ids[1][i] == prompts[1][i + 3]
+        assert proposals.proposal_token_ids[2][i] == prompts[2][i + 5]
diff --git a/vllm/config.py b/vllm/config.py
index db4398addae3c..257d49b6e804f 100644
--- a/vllm/config.py
+++ b/vllm/config.py
@@ -682,6 +682,8 @@ def maybe_create_spec_config(
         speculative_max_model_len: Optional[int],
         enable_chunked_prefill: bool,
         use_v2_block_manager: bool,
+        ngram_prompt_lookup_max: Optional[int],
+        ngram_prompt_lookup_min: Optional[int],
     ) -> Optional["SpeculativeConfig"]:
         """Create a SpeculativeConfig if possible, else return None.
 
@@ -708,6 +710,10 @@ def maybe_create_spec_config(
             use_v2_block_manager (bool): Whether vLLM is configured to use the
                 v2 block manager or not. Used for raising an error since the v2
                 block manager is required with spec decode.
+            ngram_prompt_lookup_max (Optional[int]): Max size of ngram token
+                window, if provided.
+            ngram_prompt_lookup_min (Optional[int]): Min size of ngram token
+                window, if provided.
 
         Returns:
             Optional["SpeculativeConfig"]: An instance of SpeculativeConfig if
@@ -742,39 +748,57 @@ def maybe_create_spec_config(
         draft_code_revision = None
         draft_quantization = None
 
-        draft_model_config = ModelConfig(
-            model=speculative_model,
-            tokenizer=target_model_config.tokenizer,
-            tokenizer_mode=target_model_config.tokenizer_mode,
-            trust_remote_code=target_model_config.trust_remote_code,
-            dtype=target_model_config.dtype,
-            seed=target_model_config.seed,
-            revision=draft_revision,
-            code_revision=draft_code_revision,
-            tokenizer_revision=target_model_config.tokenizer_revision,
-            max_model_len=None,
-            quantization=draft_quantization,
-            enforce_eager=target_model_config.enforce_eager,
-            max_context_len_to_capture=target_model_config.
-            max_context_len_to_capture,
-            max_logprobs=target_model_config.max_logprobs,
-        )
-
-        draft_model_config.max_model_len = (
-            SpeculativeConfig._maybe_override_draft_max_model_len(
-                speculative_max_model_len,
-                draft_model_config.max_model_len,
-                target_model_config.max_model_len,
-            ))
+        if speculative_model == "[ngram]":
+            assert (ngram_prompt_lookup_max is not None
+                    and ngram_prompt_lookup_max > 0)
+            if ngram_prompt_lookup_min is None:
+                ngram_prompt_lookup_min = 0
+            else:
+                assert ngram_prompt_lookup_max > ngram_prompt_lookup_min
 
-        draft_parallel_config = (
-            SpeculativeConfig.create_draft_parallel_config(
-                target_parallel_config))
+            # TODO: currently we still need to extract vocab_size from the
+            # target model config; in the future we may refactor that out and
+            # set the draft-related configs to None here.
+            draft_model_config = target_model_config
+            draft_parallel_config = target_parallel_config
+        else:
+            ngram_prompt_lookup_max = 0
+            ngram_prompt_lookup_min = 0
+            draft_model_config = ModelConfig(
+                model=speculative_model,
+                tokenizer=target_model_config.tokenizer,
+                tokenizer_mode=target_model_config.tokenizer_mode,
+                trust_remote_code=target_model_config.trust_remote_code,
+                dtype=target_model_config.dtype,
+                seed=target_model_config.seed,
+                revision=draft_revision,
+                code_revision=draft_code_revision,
+                tokenizer_revision=target_model_config.tokenizer_revision,
+                max_model_len=None,
+                quantization=draft_quantization,
+                enforce_eager=target_model_config.enforce_eager,
+                max_context_len_to_capture=target_model_config.
+                max_context_len_to_capture,
+                max_logprobs=target_model_config.max_logprobs,
+            )
+
+            draft_model_config.max_model_len = (
+                SpeculativeConfig._maybe_override_draft_max_model_len(
+                    speculative_max_model_len,
+                    draft_model_config.max_model_len,
+                    target_model_config.max_model_len,
+                ))
+
+            draft_parallel_config = (
+                SpeculativeConfig.create_draft_parallel_config(
+                    target_parallel_config))
 
         return SpeculativeConfig(
             draft_model_config,
             draft_parallel_config,
             num_speculative_tokens,
+            ngram_prompt_lookup_max,
+            ngram_prompt_lookup_min,
         )
 
     @staticmethod
@@ -842,6 +866,8 @@ def __init__(
         draft_model_config: ModelConfig,
         draft_parallel_config: ParallelConfig,
         num_speculative_tokens: int,
+        ngram_prompt_lookup_max: int,
+        ngram_prompt_lookup_min: int,
     ):
         """Create a SpeculativeConfig object.
 
@@ -854,6 +880,8 @@ def __init__(
         self.draft_model_config = draft_model_config
         self.draft_parallel_config = draft_parallel_config
         self.num_speculative_tokens = num_speculative_tokens
+        self.ngram_prompt_lookup_max = ngram_prompt_lookup_max
+        self.ngram_prompt_lookup_min = ngram_prompt_lookup_min
 
         self._verify_args()
 
@@ -877,7 +905,10 @@ def num_lookahead_slots(self) -> int:
         return self.num_speculative_tokens
 
     def __repr__(self) -> str:
-        draft_model = self.draft_model_config.model
+        if self.ngram_prompt_lookup_max > 0:
+            draft_model = "[ngram]"
+        else:
+            draft_model = self.draft_model_config.model
         num_spec_tokens = self.num_speculative_tokens
         return f"SpeculativeConfig({draft_model=}, {num_spec_tokens=})"
 
diff --git a/vllm/engine/arg_utils.py b/vllm/engine/arg_utils.py
index bd6437ee44c28..7637616ae6089 100644
--- a/vllm/engine/arg_utils.py
+++ b/vllm/engine/arg_utils.py
@@ -75,6 +75,8 @@ class EngineArgs:
     speculative_model: Optional[str] = None
     num_speculative_tokens: Optional[int] = None
     speculative_max_model_len: Optional[int] = None
+    ngram_prompt_lookup_max: Optional[int] = None
+    ngram_prompt_lookup_min: Optional[int] = None
 
     def __post_init__(self):
         if self.tokenizer is None:
@@ -449,6 +451,20 @@ def add_cli_args(
             'draft model. Sequences over this length will skip '
             'speculation.')
 
+        parser.add_argument(
+            '--ngram-prompt-lookup-max',
+            type=int,
+            default=EngineArgs.ngram_prompt_lookup_max,
+            help='Max size of window for ngram prompt lookup in speculative '
+            'decoding.')
+
+        parser.add_argument(
+            '--ngram-prompt-lookup-min',
+            type=int,
+            default=EngineArgs.ngram_prompt_lookup_min,
+            help='Min size of window for ngram prompt lookup in speculative '
+            'decoding.')
+
         parser.add_argument('--model-loader-extra-config',
                             type=str,
                             default=EngineArgs.model_loader_extra_config,
@@ -502,6 +518,8 @@ def create_engine_config(self, ) -> EngineConfig:
             speculative_max_model_len=self.speculative_max_model_len,
             enable_chunked_prefill=self.enable_chunked_prefill,
             use_v2_block_manager=self.use_v2_block_manager,
+            ngram_prompt_lookup_max=self.ngram_prompt_lookup_max,
+            ngram_prompt_lookup_min=self.ngram_prompt_lookup_min,
         )
 
         scheduler_config = SchedulerConfig(
diff --git a/vllm/executor/gpu_executor.py b/vllm/executor/gpu_executor.py
index 527a14ff6c67a..a58856a12f0c8 100644
--- a/vllm/executor/gpu_executor.py
+++ b/vllm/executor/gpu_executor.py
@@ -73,7 +73,6 @@ def _init_spec_worker(self):
         """
         assert self.speculative_config is not None
 
-        from vllm.spec_decode.multi_step_worker import MultiStepWorker
         from vllm.spec_decode.spec_decode_worker import SpecDecodeWorker
 
         target_worker = self._create_worker()
@@ -86,10 +85,11 @@ def _init_spec_worker(self):
             # TODO allow draft-model specific load config.
             #load_config=self.load_config,
         )
-        draft_worker = MultiStepWorker(**draft_worker_kwargs)
 
-        spec_decode_worker = SpecDecodeWorker.from_workers(
-            proposer_worker=draft_worker, scorer_worker=target_worker)
+        spec_decode_worker = SpecDecodeWorker.create_worker(
+            scorer_worker=target_worker,
+            draft_worker_kwargs=draft_worker_kwargs,
+        )
 
         assert self.parallel_config.world_size == 1, (
             "GPUExecutor only supports single GPU.")
diff --git a/vllm/spec_decode/batch_expansion.py b/vllm/spec_decode/batch_expansion.py
index c29b838f854c0..8b113e93474ff 100644
--- a/vllm/spec_decode/batch_expansion.py
+++ b/vllm/spec_decode/batch_expansion.py
@@ -333,13 +333,13 @@ def _split_scoring_output(
         sampler_output.sampled_token_probs = spec_probs
         sampler_output.sampled_token_ids = spec_sampled_tokens
         target_token_ids, target_probs = sampler_output_to_torch(
-            [sampler_output])
+            [sampler_output], True)
 
         # Convert non-speculative output tokens to tensors.
         sampler_output.sampled_token_probs = non_spec_probs
         sampler_output.sampled_token_ids = non_spec_sampled_tokens
         non_spec_target_token_ids, non_spec_target_probs = (
-            sampler_output_to_torch([sampler_output]))
+            sampler_output_to_torch([sampler_output], True))
 
         return (target_token_ids, target_probs, non_spec_target_token_ids,
                 non_spec_target_probs)
diff --git a/vllm/spec_decode/multi_step_worker.py b/vllm/spec_decode/multi_step_worker.py
index 7cf338bbae5f0..d031bc85af160 100644
--- a/vllm/spec_decode/multi_step_worker.py
+++ b/vllm/spec_decode/multi_step_worker.py
@@ -1,12 +1,11 @@
 import copy
-from typing import Dict, List, Optional, Tuple
+from typing import Dict, List, Tuple
 
 import torch
 
 from vllm.sequence import SamplerOutput, SequenceGroupMetadata
-from vllm.spec_decode.interfaces import (SpeculativeProposals,
-                                         SpeculativeProposer)
-from vllm.spec_decode.util import sampler_output_to_torch
+from vllm.spec_decode.interfaces import SpeculativeProposals
+from vllm.spec_decode.top1_proposer import Top1Proposer
 from vllm.worker.worker import Worker
 
 
@@ -26,29 +25,37 @@ def __init__(self, *args, **kwargs):
         super().__init__(*args, **kwargs)
 
         # Lazy initialization list.
-        self._proposer: DraftModelTop1Proposer
+        self._proposer: Top1Proposer
 
     def init_device(self):
         super().init_device()
 
-        self._proposer = DraftModelTop1Proposer(
+        self._proposer = Top1Proposer(
             self,
             self.device,
-            self.max_model_len,
             self.vocab_size,
+            max_proposal_len=self.max_model_len,
         )
 
+    def set_include_gpu_probs_tensor(self):
+        # Need include_gpu_probs_tensor for multi_step_worker
+        self.model_runner.model.sampler.include_gpu_probs_tensor = True
+
     @torch.inference_mode()
-    def execute_model_multi_step(
+    def sampler_output(
         self,
         seq_group_metadata_list: List[SequenceGroupMetadata],
         blocks_to_swap_in: Dict[int, int],
         blocks_to_swap_out: Dict[int, int],
         blocks_to_copy: Dict[int, List[int]],
-        num_steps: int,
-    ) -> List[SamplerOutput]:
-        """Run the model forward pass num_steps times. Returns the list of
-        sampler output, one per model forward pass.
+        sample_len: int,
+    ) -> Tuple[List[SamplerOutput], bool]:
+        """Run the model forward pass sample_len times. Returns the list of
+        sampler outputs, one per model forward pass, along with an indicator
+        of whether the torch tensors in the sampler outputs need to be
+        transposed in the later sampler_output_to_torch logic.
+
+        For the multi-step worker, this indicator shall be True.
         """
         self._raise_if_unsupported(seq_group_metadata_list, blocks_to_swap_in,
                                    blocks_to_swap_out, blocks_to_copy)
@@ -58,12 +65,12 @@ def execute_model_multi_step(
         copied_seq_group_metadata_list = self._shallow_copy_inputs(
             seq_group_metadata_list)
 
-        # Assert enough KV space for num_steps tokens per sequence.
-        self._assert_enough_kv_space(seq_group_metadata_list, num_steps)
+        # Assert enough KV space for sample_len tokens per sequence.
+        self._assert_enough_kv_space(seq_group_metadata_list, sample_len)
 
-        # Run model num_steps times.
+        # Run model sample_len times.
         model_outputs = []
-        for _ in range(num_steps):
+        for _ in range(sample_len):
             model_output = super().execute_model(
                 seq_group_metadata_list=copied_seq_group_metadata_list,
                 blocks_to_swap_in=blocks_to_swap_in,
@@ -78,7 +85,7 @@ def execute_model_multi_step(
                                     copied_seq_group_metadata_list)
             model_outputs.append(model_output)
 
-        return model_outputs
+        return model_outputs, True
 
     def get_spec_proposals(
         self,
@@ -206,171 +213,3 @@ def _raise_if_unsupported(
                 for seq_group_metadata in seq_group_metadata_list):
             raise NotImplementedError(
                 "MultiStepWorker does not support beam search.")
-
-
-class DraftModelTop1Proposer(SpeculativeProposer):
-    """Helper class which separates out sequences which would exceed the max
-    model length when speculated upon.
-
-    This allows combinations of models such as JackFram/llama-68m draft with
-    meta-llama/Llama2-13b-chat-hf, as llama-68m has max_position_embeddings of
-    2048 while Llama2-13b has max_position_embeddings of 4096.
-
-    We treat the sequences which exceed the proposal draft model length as
-    "non-spec sequences". Essentially they skip the draft model and go through
-    normal decoding in the target model.
-
-    Currently, only proposal_lens of 0 and k are supported, where k is a global
-    batch proposal length. In the future vLLM should support per-sequence
-    proposal lengths.
-    """
-
-    def __init__(
-        self,
-        draft_worker: MultiStepWorker,
-        device: str,
-        max_model_len: int,
-        vocab_size: int,
-    ):
-        self._draft_worker = draft_worker
-        self._device = device
-        self._max_model_len = max_model_len
-        self._vocab_size = vocab_size
-
-    def get_proposals(
-        self,
-        seq_group_metadata_list: List[SequenceGroupMetadata],
-        blocks_to_swap_in: Dict[int, int],
-        blocks_to_swap_out: Dict[int, int],
-        blocks_to_copy: Dict[int, List[int]],
-        max_proposal_len: int,
-    ) -> SpeculativeProposals:
-        """Get speculative proposals given the input batch.
-
-        Sequences which would exceed the max model length are skipped during
-        speculation.
-        """
-
-        # Split speculative- and non-speculative- sequences.
-        (proposal_lens, nonzero_proposal_len_seqs,
-         nonzero_proposal_len_indices) = self._split_by_max_model_len(
-             seq_group_metadata_list, max_proposal_len)
-
-        if nonzero_proposal_len_seqs:
-            # Speculate tokens using the draft worker for the speculative
-            # sequences.
-            maybe_sampler_output = self._draft_worker.execute_model_multi_step(
-                seq_group_metadata_list=nonzero_proposal_len_seqs,
-                blocks_to_swap_in=blocks_to_swap_in,
-                blocks_to_swap_out=blocks_to_swap_out,
-                blocks_to_copy=blocks_to_copy,
-                num_steps=max_proposal_len,
-            )
-        else:
-            # If no sequences can be speculated, set sampler output to None.
-            maybe_sampler_output = None
-
-        # Combine speculative- and non-speculative sequences into the same
-        # representation.
-        proposal_tokens, proposal_probs, proposal_lens = self._merge_outputs(
-            batch_size=len(seq_group_metadata_list),
-            max_proposal_len=max_proposal_len,
-            maybe_sampler_output=maybe_sampler_output,
-            proposal_lens=proposal_lens,
-            nonzero_proposal_len_indices=nonzero_proposal_len_indices,
-        )
-
-        proposals = SpeculativeProposals(
-            proposal_token_ids=proposal_tokens,
-            proposal_probs=proposal_probs,
-            proposal_lens=proposal_lens,
-        )
-
-        return proposals
-
-    def _split_by_max_model_len(
-        self,
-        seq_group_metadata_list: List[SequenceGroupMetadata],
-        max_proposal_len: int,
-    ) -> Tuple[List[int], List[SequenceGroupMetadata], List[int]]:
-        """Determine which sequences would exceed the max model length.
-        """
-
-        proposal_lens: List[int] = []
-        nonzero_proposal_len_seqs: List[SequenceGroupMetadata] = []
-        nonzero_proposal_len_indices: List[int] = []
-        for i, seq_group_metadata in enumerate(seq_group_metadata_list):
-            seq_data = next(iter(seq_group_metadata.seq_data.values()))
-            seq_len = seq_data.get_len()
-
-            # Currently only proposal lens of 0 or the global batch proposal len
-            # are supported.
-            if seq_len + max_proposal_len < self._max_model_len:
-                proposal_lens.append(max_proposal_len)
-                nonzero_proposal_len_seqs.append(seq_group_metadata)
-                nonzero_proposal_len_indices.append(i)
-            else:
-                proposal_lens.append(0)
-
-        return (proposal_lens, nonzero_proposal_len_seqs,
-                nonzero_proposal_len_indices)
-
-    def _merge_outputs(
-        self,
-        batch_size: int,
-        max_proposal_len: int,
-        maybe_sampler_output: Optional[SamplerOutput],
-        proposal_lens: List[int],
-        nonzero_proposal_len_indices: List[int],
-    ) -> Tuple[torch.Tensor, torch.tensor, torch.Tensor]:
-        """After speculations are produced, merge the speculation results with
-        the skipped sequences.
-        """
-        if maybe_sampler_output is None:
-            # If no speculative tokens, the sampler output will be None.
-            # In this case we return empty proposals.
-            proposal_tokens = torch.full(size=(
-                batch_size,
-                max_proposal_len,
-            ),
-                                         fill_value=-1,
-                                         dtype=torch.long,
-                                         device=self._device)
-            proposal_probs = torch.zeros(batch_size,
-                                         max_proposal_len,
-                                         self._vocab_size,
-                                         dtype=torch.float32,
-                                         device=self._device)
-            proposal_lens_tensor = torch.zeros(len(proposal_lens),
-                                               dtype=torch.long,
-                                               device=self._device)
-            return proposal_tokens, proposal_probs, proposal_lens_tensor
-
-        sampler_output = maybe_sampler_output
-        proposal_tokens, proposal_probs = sampler_output_to_torch(
-            sampler_output)
-
-        # Now, reformat the output GPU tensors such that each sequence has
-        # a proposal. the proposal can be empty, e.g. [-1, -1, -1]
-
-        entire_proposal_tokens = torch.full(size=(batch_size,
-                                                  *proposal_tokens.shape[1:]),
-                                            fill_value=-1,
-                                            dtype=torch.long,
-                                            device=self._device)
-        entire_proposal_tokens[nonzero_proposal_len_indices] = proposal_tokens
-        entire_proposal_probs = torch.zeros(batch_size,
-                                            *proposal_probs.shape[1:],
-                                            dtype=torch.float32,
-                                            device=self._device)
-        entire_proposal_probs[nonzero_proposal_len_indices] = proposal_probs
-
-        proposal_tokens, proposal_probs = (entire_proposal_tokens,
-                                           entire_proposal_probs)
-
-        proposal_lens_tensor = torch.zeros(batch_size,
-                                           dtype=torch.long,
-                                           device=self._device)
-        proposal_lens_tensor[nonzero_proposal_len_indices] = max_proposal_len
-
-        return proposal_tokens, proposal_probs, proposal_lens_tensor
diff --git a/vllm/spec_decode/ngram_worker.py b/vllm/spec_decode/ngram_worker.py
new file mode 100644
index 0000000000000..696ca964328cf
--- /dev/null
+++ b/vllm/spec_decode/ngram_worker.py
@@ -0,0 +1,190 @@
+from typing import Dict, List, Optional, Tuple
+
+import torch
+
+from vllm.sequence import SamplerOutput, SequenceGroupMetadata
+from vllm.spec_decode.interfaces import SpeculativeProposals
+from vllm.spec_decode.top1_proposer import Top1Proposer
+from vllm.worker.worker_base import LoraNotSupportedWorkerBase
+
+
+class NGramWorker(LoraNotSupportedWorkerBase):
+    """NGramWorker provides a light drafter without need for model.
+
+    Currently NGramWorker only implements prompt lookup decoding;
+    in the future we may also add a RAG-type drafter and other scenarios
+    which don't rely on an LLM to give proposals.
+    """
+
+    def __init__(self, *args, **kwargs):
+        # Get local_rank/vocab_size from kwargs attribute
+        self.local_rank = kwargs["local_rank"]
+        self.vocab_size = kwargs["model_config"].get_vocab_size()
+
+        # Lazy initialization list.
+        self._proposer: Top1Proposer
+
+    def set_ngram_window_size(self, ngram_prompt_lookup_min: int,
+                              ngram_prompt_lookup_max: int):
+        # Search valid candidate window between
+        # ngram_prompt_lookup_min/ngram_prompt_lookup_max
+        self.ngram_prompt_lookup_max = ngram_prompt_lookup_max
+        self.ngram_prompt_lookup_min = ngram_prompt_lookup_min
+
+    def init_device(self):
+        self.device = torch.device(f"cuda:{self.local_rank}")
+        self.load_model = lambda *args, **kwargs: None
+
+        # Current only support Top1Proposer
+        self._proposer = Top1Proposer(
+            self,
+            device=self.device,
+            vocab_size=self.vocab_size,
+        )
+
+    def set_include_gpu_probs_tensor(self):
+        # NGram don't need gpu sampler
+        pass
+
+    def execute_model(
+        self,
+        seq_group_metadata_list: List[SequenceGroupMetadata],
+        blocks_to_swap_in: Optional[Dict[int, int]],
+        blocks_to_swap_out: Optional[Dict[int, int]],
+        blocks_to_copy: Optional[Dict[int, List[int]]],
+    ) -> None:
+        """NGram doesn't depend on model execution, just pass this function"""
+        pass
+
+    def determine_num_available_blocks(self) -> None:
+        """NGram doesn't depend on model execution, no need to check blocks"""
+        pass
+
+    def initialize_cache(self, num_gpu_blocks: int,
+                         num_cpu_blocks: int) -> None:
+        """As there is no cache need to handle, just pass this function"""
+        pass
+
+    def get_cache_block_size_bytes(self):
+        """Return the size of a cache block in bytes."""
+        return 0
+
+    def sampler_output(
+        self,
+        seq_group_metadata_list: List[SequenceGroupMetadata],
+        blocks_to_swap_in: Dict[int, int],
+        blocks_to_swap_out: Dict[int, int],
+        blocks_to_copy: Dict[int, List[int]],
+        sample_len: int,
+    ) -> Tuple[Optional[List[SamplerOutput]], bool]:
+        """NGram match algo to pick proposal candidates. Returns the list of
+        sampler outputs, one per SequenceGroupMetadata (None if no match).
+
+        For ngram worker, the outputs are already laid out per sequence, so
+        the indicator passed to sampler_output_to_torch shall be False.
+        """
+        self._raise_if_unsupported(
+            seq_group_metadata_list,
+            blocks_to_swap_in,
+            blocks_to_swap_out,
+            blocks_to_copy,
+        )
+
+        arr = []
+        has_spec_out = False
+        for seq_group_metadata in seq_group_metadata_list:
+            seq_data = next(iter(seq_group_metadata.seq_data.values()))
+
+            input_ids = torch.as_tensor(seq_data.get_token_ids(),
+                                        dtype=torch.long,
+                                        device=self.device)
+            input_length = seq_data.get_len()
+
+            # Try window sizes from max down to min (inclusive, at least 1);
+            # range() excludes its stop value, hence the "- 1".
+            for ngram_size in range(
+                    min(self.ngram_prompt_lookup_max, input_length - 1),
+                    max(self.ngram_prompt_lookup_min, 1) - 1,
+                    -1,
+            ):
+                ngram_tensor = input_ids[-1 * ngram_size:]
+                windows = input_ids.unfold(dimension=0,
+                                           size=ngram_size,
+                                           step=1)
+                matches = (windows == ngram_tensor).all(dim=1)
+                match_indices = matches.nonzero(as_tuple=True)[0]
+                if match_indices.size()[0] > 1:
+                    has_spec_out = True
+                    res = seq_data.get_token_ids()
+                    res = res[match_indices[0] + ngram_size:match_indices[0] +
+                              ngram_size + sample_len]
+                    # pad 0 towards output as sample_len tokens required
+                    res += [0] * (sample_len - len(res))
+                    break
+            else:
+                # if no candidate found, fill with 0
+                res = [0] * sample_len
+
+            arr.append(res)
+
+        if not has_spec_out:
+            return None, False
+
+        outputs = []
+        token_ids = torch.as_tensor(arr, dtype=torch.long, device=self.device)
+        indices = token_ids.unsqueeze(2)
+
+        token_probs = torch.zeros(
+            (len(seq_group_metadata_list), sample_len, self.vocab_size),
+            dtype=torch.float32,
+            device=self.device,
+        )
+        token_probs.scatter_(2, indices, 1)
+        for i in range(len(seq_group_metadata_list)):
+            outputs.append(
+                SamplerOutput(
+                    outputs=None,
+                    sampled_token_probs=token_probs[i],
+                    sampled_token_ids=token_ids[i],
+                ))
+        return outputs, False
+
+    def get_spec_proposals(
+        self,
+        seq_group_metadata_list: List[SequenceGroupMetadata],
+        blocks_to_swap_in: Dict[int, int],
+        blocks_to_swap_out: Dict[int, int],
+        blocks_to_copy: Dict[int, List[int]],
+        max_proposal_len: int,
+    ) -> SpeculativeProposals:
+        """Produce speculations given an input batch of sequences. The number of
+        speculative tokens per sequence is determined by max_proposal_len.
+        """
+
+        return self._proposer.get_proposals(
+            seq_group_metadata_list,
+            blocks_to_swap_in,
+            blocks_to_swap_out,
+            blocks_to_copy,
+            max_proposal_len,
+        )
+
+    def _raise_if_unsupported(
+        self,
+        seq_group_metadata_list: List[SequenceGroupMetadata],
+        blocks_to_swap_in: Dict[int, int],
+        blocks_to_swap_out: Dict[int, int],
+        blocks_to_copy: Dict[int, List[int]],
+    ) -> None:
+        """NGramWorker does not yet implement support for cache swap
+        operations or beam search.
+        """
+        if any([blocks_to_swap_in, blocks_to_swap_out, blocks_to_copy]):
+            raise NotImplementedError(
+                "NGramWorker does not support cache operations")
+
+        if any(
+                len(seq_group_metadata.seq_data.keys()) != 1
+                for seq_group_metadata in seq_group_metadata_list):
+            raise NotImplementedError(
+                "NGramWorker does not support beam search.")
diff --git a/vllm/spec_decode/spec_decode_worker.py b/vllm/spec_decode/spec_decode_worker.py
index 4e70ea9686005..e33bb4f3f6337 100644
--- a/vllm/spec_decode/spec_decode_worker.py
+++ b/vllm/spec_decode/spec_decode_worker.py
@@ -12,6 +12,7 @@
                                          SpeculativeScorer, SpeculativeScores)
 from vllm.spec_decode.metrics import AsyncMetricsCollector
 from vllm.spec_decode.multi_step_worker import MultiStepWorker
+from vllm.spec_decode.ngram_worker import NGramWorker
 from vllm.spec_decode.util import (get_all_seq_ids, nvtx_range,
                                    split_batch_by_proposal_len)
 from vllm.worker.worker_base import LoraNotSupportedWorkerBase, WorkerBase
@@ -48,8 +49,27 @@ class SpecDecodeWorker(LoraNotSupportedWorkerBase):
     """
 
     @classmethod
-    def from_workers(cls, proposer_worker: MultiStepWorker,
-                     scorer_worker: WorkerBase) -> "SpecDecodeWorker":
+    def create_worker(
+        cls,
+        scorer_worker: WorkerBase,
+        draft_worker_kwargs,
+    ) -> "SpecDecodeWorker":
+
+        if "ngram_prompt_lookup_max" in draft_worker_kwargs:
+            ngram_prompt_lookup_max = (
+                draft_worker_kwargs.pop("ngram_prompt_lookup_max"))
+            ngram_prompt_lookup_min = (
+                draft_worker_kwargs.pop("ngram_prompt_lookup_min"))
+        else:
+            ngram_prompt_lookup_max = 0
+
+        if ngram_prompt_lookup_max > 0:
+            proposer_worker = NGramWorker(**draft_worker_kwargs)
+            proposer_worker.set_ngram_window_size(ngram_prompt_lookup_min,
+                                                  ngram_prompt_lookup_max)
+        else:
+            proposer_worker = MultiStepWorker(**draft_worker_kwargs)
+
         return SpecDecodeWorker(
             proposer_worker,
             scorer_worker,
@@ -59,7 +79,7 @@ def from_workers(cls, proposer_worker: MultiStepWorker,
 
     def __init__(
         self,
-        proposer_worker: MultiStepWorker,
+        proposer_worker: WorkerBase,
         scorer_worker: WorkerBase,
         rejection_sampler: RejectionSampler,
         metrics_collector: Optional[AsyncMetricsCollector] = None,
@@ -134,8 +154,7 @@ def _configure_model_sampler_for_spec_decode(self):
         """
         (self.scorer_worker.model_runner.model.sampler.include_gpu_probs_tensor
          ) = True
-        (self.proposer_worker.model_runner.model.sampler.
-         include_gpu_probs_tensor) = True
+        self.proposer_worker.set_include_gpu_probs_tensor()
 
     def determine_num_available_blocks(self) -> Tuple[int, int]:
         """Determine the number of cache blocks to use.
@@ -183,8 +202,8 @@ def execute_model(
             "speculative decoding "
             "requires non-None seq_group_metadata_list")
 
-        logger.info("spec_decode_worker.execute_model num_lookahead_slots=%d",
-                    num_lookahead_slots)
+        #logger.info("spec_decode_worker.execute_model num_lookahead_slots=%d",
+        #            num_lookahead_slots)
 
         # If no spec tokens, call the proposer and scorer workers normally.
         # Used for prefill.
@@ -216,7 +235,7 @@ def _run_no_spec(
         proposer and scorer model so that the KV cache is consistent between the
         two.
         """
-        logger.info("run proposer worker no spec")
+        #logger.info("run proposer worker no spec")
 
         self.proposer_worker.execute_model(
             seq_group_metadata_list=seq_group_metadata_list,
@@ -225,7 +244,7 @@ def _run_no_spec(
             blocks_to_copy=blocks_to_copy,
         )
 
-        logger.info("run target worker no spec")
+        #logger.info("run target worker no spec")
         sampler_output = self.scorer_worker.execute_model(
             seq_group_metadata_list=seq_group_metadata_list,
             blocks_to_swap_in=blocks_to_swap_in,
@@ -259,7 +278,7 @@ def _run_speculative_decoding_step(
         sequence.
         """
 
-        logger.info("get spec proposals")
+        #logger.info("get spec proposals")
         # Generate proposals using draft worker.
         assert blocks_to_swap_in is not None
         assert blocks_to_swap_out is not None
@@ -268,7 +287,7 @@ def _run_speculative_decoding_step(
             seq_group_metadata_list, blocks_to_swap_in, blocks_to_swap_out,
             blocks_to_copy, k)
 
-        logger.info("score proposals")
+        #logger.info("score proposals")
         proposal_scores = self.scorer.score_proposals(
             seq_group_metadata_list,
             blocks_to_swap_in,
@@ -278,11 +297,11 @@ def _run_speculative_decoding_step(
             proposals,
         )
 
-        logger.info("verify proposals")
+        #logger.info("verify proposals")
         accepted_token_ids = self._verify_tokens(seq_group_metadata_list,
                                                  proposal_scores, proposals, k)
 
-        logger.info("create output list")
+        #logger.info("create output list")
         return self._create_output_sampler_list(seq_group_metadata_list,
                                                 accepted_token_ids, k)
 
diff --git a/vllm/spec_decode/top1_proposer.py b/vllm/spec_decode/top1_proposer.py
new file mode 100644
index 0000000000000..6766a2deb8eb8
--- /dev/null
+++ b/vllm/spec_decode/top1_proposer.py
@@ -0,0 +1,200 @@
+from typing import Dict, List, Optional, Tuple
+
+import torch
+
+from vllm.sequence import SamplerOutput, SequenceGroupMetadata
+from vllm.spec_decode.interfaces import (SpeculativeProposals,
+                                         SpeculativeProposer)
+from vllm.spec_decode.util import sampler_output_to_torch
+from vllm.worker.worker_base import WorkerBase
+
+
+class Top1Proposer(SpeculativeProposer):
+    """Helper class which separates out sequences which would exceed the max
+    model length when speculated upon.
+
+    This allows combinations of models such as JackFram/llama-68m draft with
+    meta-llama/Llama2-13b-chat-hf, as llama-68m has max_position_embeddings of
+    2048 while Llama2-13b has max_position_embeddings of 4096.
+
+    We treat the sequences which exceed the proposal draft model length as
+    "non-spec sequences". Essentially they skip the draft model and go through
+    normal decoding in the target model.
+
+    Currently, only proposal_lens of 0 and k are supported, where k is a global
+    batch proposal length. In the future vLLM should support per-sequence
+    proposal lengths.
+    """
+
+    def __init__(
+        self,
+        worker: WorkerBase,
+        device: str,
+        vocab_size: int,
+        max_proposal_len: Optional[int] = None,
+    ):
+        self._worker = worker
+        self._device = device
+        self.max_proposal_len = max_proposal_len
+        self._vocab_size = vocab_size
+
+    def get_proposals(
+        self,
+        seq_group_metadata_list: List[SequenceGroupMetadata],
+        blocks_to_swap_in: Dict[int, int],
+        blocks_to_swap_out: Dict[int, int],
+        blocks_to_copy: Dict[int, List[int]],
+        proposal_len: int,
+    ) -> SpeculativeProposals:
+        """Get speculative proposals given the input batch.
+
+        Sequences which would exceed the max model length are skipped during
+        speculation.
+        """
+
+        # Split speculative- and non-speculative- sequences.
+        (
+            proposal_lens,
+            nonzero_proposal_len_seqs,
+            nonzero_proposal_len_indices,
+        ) = self._split_by_max_model_len(seq_group_metadata_list, proposal_len)
+
+        if nonzero_proposal_len_seqs:
+            # Speculate tokens using the draft worker for the speculative
+            # sequences.
+            # If transposed is True, maybe_sampler_output is a list of
+            # proposal_len entries whose token_ids have shape [batch];
+            # if it is False, it is a list of batch-size entries whose
+            # token_ids have shape [proposal_len].
+            maybe_sampler_output, transposed = self._worker.sampler_output(
+                seq_group_metadata_list=nonzero_proposal_len_seqs,
+                blocks_to_swap_in=blocks_to_swap_in,
+                blocks_to_swap_out=blocks_to_swap_out,
+                blocks_to_copy=blocks_to_copy,
+                sample_len=proposal_len,
+            )
+        else:
+            # If no sequences can be speculated, set sampler output to None.
+            maybe_sampler_output = None
+            transposed = False
+
+        # Combine speculative- and non-speculative sequences into the same
+        # representation.
+        proposal_tokens, proposal_probs, proposal_lens = self._merge_outputs(
+            batch_size=len(seq_group_metadata_list),
+            proposal_len=proposal_len,
+            maybe_sampler_output=maybe_sampler_output,
+            proposal_lens=proposal_lens,
+            nonzero_proposal_len_indices=nonzero_proposal_len_indices,
+            sampler_transposed=transposed,
+        )
+
+        proposals = SpeculativeProposals(
+            proposal_token_ids=proposal_tokens,
+            proposal_probs=proposal_probs,
+            proposal_lens=proposal_lens,
+        )
+
+        return proposals
+
+    def _split_by_max_model_len(
+        self,
+        seq_group_metadata_list: List[SequenceGroupMetadata],
+        proposal_len: int,
+    ) -> Tuple[List[int], List[SequenceGroupMetadata], List[int]]:
+        """Determine which sequences would exceed the max model length."""
+
+        proposal_lens: List[int] = []
+        nonzero_proposal_len_seqs: List[SequenceGroupMetadata] = []
+        nonzero_proposal_len_indices: List[int] = []
+        for i, seq_group_metadata in enumerate(seq_group_metadata_list):
+            seq_data = next(iter(seq_group_metadata.seq_data.values()))
+            seq_len = seq_data.get_len()
+
+            # Currently only proposal lens of 0 or the global batch proposal len
+            # are supported.
+            # If max_proposal_len is defined, then we shall not exceed this
+            # quota for nonzero_proposal
+            if (self.max_proposal_len is None
+                    or seq_len + proposal_len < self.max_proposal_len):
+                proposal_lens.append(proposal_len)
+                nonzero_proposal_len_seqs.append(seq_group_metadata)
+                nonzero_proposal_len_indices.append(i)
+            else:
+                proposal_lens.append(0)
+
+        return (
+            proposal_lens,
+            nonzero_proposal_len_seqs,
+            nonzero_proposal_len_indices,
+        )
+
+    def _merge_outputs(
+        self,
+        batch_size: int,
+        proposal_len: int,
+        maybe_sampler_output: Optional[List[SamplerOutput]],
+        proposal_lens: List[int],
+        nonzero_proposal_len_indices: List[int],
+        sampler_transposed: bool,
+    ) -> Tuple[torch.Tensor, torch.Tensor, torch.Tensor]:
+        """After speculations are produced, merge the speculation results with
+        the skipped sequences.
+
+        maybe_sampler_output covers only the sequences listed in
+        nonzero_proposal_len_indices (None when no proposals were produced).
+        Returns (proposal_tokens, proposal_probs, proposal_lens_tensor) for
+        the whole batch; skipped rows have -1 tokens, zero probs and length 0.
+        """
+        if maybe_sampler_output is None:
+            # If no speculative tokens, the sampler output will be None.
+            # In this case we return empty proposals.
+            proposal_tokens = torch.full(
+                size=(batch_size, proposal_len),
+                fill_value=-1,
+                dtype=torch.long,
+                device=self._device,
+            )
+            proposal_probs = torch.zeros(
+                batch_size,
+                proposal_len,
+                self._vocab_size,
+                dtype=torch.float32,
+                device=self._device,
+            )
+            proposal_lens_tensor = torch.zeros(len(proposal_lens),
+                                               dtype=torch.long,
+                                               device=self._device)
+            return proposal_tokens, proposal_probs, proposal_lens_tensor
+
+        proposal_tokens, proposal_probs = sampler_output_to_torch(
+            maybe_sampler_output, sampler_transposed)
+
+        # Now, reformat the output GPU tensors such that each sequence has
+        # a proposal. the proposal can be empty, e.g. [-1, -1, -1]
+        entire_proposal_tokens = torch.full(
+            size=(batch_size, *proposal_tokens.shape[1:]),
+            fill_value=-1,
+            dtype=torch.long,
+            device=self._device,
+        )
+        entire_proposal_tokens[nonzero_proposal_len_indices] = proposal_tokens
+        entire_proposal_probs = torch.zeros(
+            batch_size,
+            *proposal_probs.shape[1:],
+            dtype=torch.float32,
+            device=self._device,
+        )
+        entire_proposal_probs[nonzero_proposal_len_indices] = proposal_probs
+
+        proposal_tokens, proposal_probs = (
+            entire_proposal_tokens,
+            entire_proposal_probs,
+        )
+
+        proposal_lens_tensor = torch.zeros(batch_size,
+                                           dtype=torch.long,
+                                           device=self._device)
+        proposal_lens_tensor[nonzero_proposal_len_indices] = proposal_len
+
+        return proposal_tokens, proposal_probs, proposal_lens_tensor
diff --git a/vllm/spec_decode/util.py b/vllm/spec_decode/util.py
index eb6d4ca1da8e6..894d2fd915948 100644
--- a/vllm/spec_decode/util.py
+++ b/vllm/spec_decode/util.py
@@ -49,10 +49,13 @@ def split_batch_by_proposal_len(
 
 
 def sampler_output_to_torch(
-    sampler_output_list: List[SamplerOutput],
-) -> Tuple[torch.Tensor, torch.Tensor]:
+        sampler_output_list: List[SamplerOutput],
+        sampler_transposed: bool) -> Tuple[torch.Tensor, torch.Tensor]:
     """Utility function which converts a list of SamplerOutput to tensors.
 
+        sampler_transposed indicates whether the stacked tensors need an
+        additional transpose here.
+
         Returns:
             sampled_token_ids: torch.Tensor
                 shape: [batch_size, len(sampler_output_list)]
@@ -68,7 +71,10 @@ def sampler_output_to_torch(
             for sampler_output in sampler_output_list
         ],
         dim=0,
-    ).transpose(0, 1)
+    )
+
+    if sampler_transposed:
+        sampled_token_probs = sampled_token_probs.transpose(0, 1)
 
     # shape: [batch_size, num_sampler_output]
     sampled_token_ids = torch.stack(
@@ -77,7 +83,9 @@ def sampler_output_to_torch(
             for sampler_output in sampler_output_list
         ],
         dim=0,
-    ).transpose(0, 1)
+    )
+    if sampler_transposed:
+        sampled_token_ids = sampled_token_ids.transpose(0, 1)
 
     return sampled_token_ids, sampled_token_probs
 

From 24750f4cadd15a2b3a52f982e39eb9803749efbc Mon Sep 17 00:00:00 2001
From: leiwen83 <leiwen83@users.noreply.github.com>
Date: Thu, 2 May 2024 02:20:32 +0800
Subject: [PATCH 21/65] [Core] Enable prefix caching with block manager v2
 enabled (#4142)

Co-authored-by: Lei Wen <wenlei03@qiyi.com>
Co-authored-by: Sage Moore <sagemoore@utexas.edu>
---
 benchmarks/benchmark_prefix_caching.py        |  16 +-
 tests/core/block/e2e/test_correctness.py      | 146 +++++++++++++++
 tests/core/block/test_prefix_caching_block.py | 125 +++++++++++++
 vllm/core/block/cpu_gpu_block_allocator.py    |  12 +-
 vllm/core/block/interfaces.py                 |   4 +
 vllm/core/block/naive_block.py                |  11 +-
 vllm/core/block/prefix_caching_block.py       | 172 ++++++++++++++----
 vllm/core/block_manager_v1.py                 |   2 +-
 vllm/core/block_manager_v2.py                 |  31 ++--
 vllm/core/{evictor.py => evictor_v1.py}       |   0
 vllm/core/evictor_v2.py                       | 122 +++++++++++++
 11 files changed, 584 insertions(+), 57 deletions(-)
 rename vllm/core/{evictor.py => evictor_v1.py} (100%)
 create mode 100644 vllm/core/evictor_v2.py

diff --git a/benchmarks/benchmark_prefix_caching.py b/benchmarks/benchmark_prefix_caching.py
index 1f3274a28cad5..089966986984f 100644
--- a/benchmarks/benchmark_prefix_caching.py
+++ b/benchmarks/benchmark_prefix_caching.py
@@ -16,20 +16,22 @@ def test_prefix(llm=None, sampling_params=None, prompts=None):
 
 
 def main(args):
-    llm = LLM(model="baichuan-inc/Baichuan2-13B-Chat",
+    llm = LLM(model=args.model,
               tokenizer_mode='auto',
               trust_remote_code=True,
               enforce_eager=True,
+              use_v2_block_manager=args.use_v2_block_manager,
+              tensor_parallel_size=args.tensor_parallel_size,
               enable_prefix_caching=args.enable_prefix_caching)
 
     num_prompts = 100
     prompts = [PROMPT] * num_prompts
-    sampling_params = SamplingParams(temperature=0, max_tokens=100)
+    sampling_params = SamplingParams(temperature=0, max_tokens=args.output_len)
 
     print("------warm up------")
     test_prefix(
         llm=llm,
-        prompts=prompts[:1],
+        prompts=prompts,
         sampling_params=sampling_params,
     )
 
@@ -45,8 +47,16 @@ def main(args):
     parser = argparse.ArgumentParser(
         description='Benchmark the performance with or without automatic '
         'prefix caching.')
+    parser.add_argument('--model',
+                        type=str,
+                        default='baichuan-inc/Baichuan2-13B-Chat')
+    parser.add_argument('--tensor-parallel-size', '-tp', type=int, default=1)
+    parser.add_argument('--output-len', type=int, default=10)
     parser.add_argument('--enable-prefix-caching',
                         action='store_true',
                         help='enable prefix caching')
+    parser.add_argument('--use-v2-block-manager',
+                        action='store_true',
+                        help='Use BlockSpaceMangerV2')
     args = parser.parse_args()
     main(args)
diff --git a/tests/core/block/e2e/test_correctness.py b/tests/core/block/e2e/test_correctness.py
index 0ee78a9b0a8ea..c3666da7542b5 100644
--- a/tests/core/block/e2e/test_correctness.py
+++ b/tests/core/block/e2e/test_correctness.py
@@ -300,6 +300,152 @@ def test_chunked_prefill_block_manager_v2(baseline_llm_generator,
     assert baseline_token_ids == test_token_ids
 
 
+@pytest.mark.parametrize(
+    "common_llm_kwargs",
+    [{
+        # Use a small model for a fast test.
+        "model": "facebook/opt-125m",
+
+        # skip cuda graph creation for fast test.
+        "enforce_eager": True,
+
+        # Allow only 5 sequences of ~1024 tokens in worst case.
+        "block_size": 16,
+        "num_gpu_blocks_override": 5 * (64 + 1),
+
+        # Enable prefill cache
+        "enable_prefix_caching": True,
+    }])
+@pytest.mark.parametrize("per_test_common_llm_kwargs", [{}])
+@pytest.mark.parametrize("baseline_llm_kwargs", [{
+    "use_v2_block_manager": False
+}])
+@pytest.mark.parametrize("test_llm_kwargs", [{"use_v2_block_manager": True}])
+@pytest.mark.parametrize("batch_size", [10])
+@pytest.mark.parametrize("seed", [1])
+def test_v1_v2_greedy_equality_prefix_caching_enabled_with_preemption(
+        baseline_llm_generator, test_llm_generator, batch_size):
+    """Verify block manager v2 produces same outputs as block manager v1, even
+    when there is preemption.
+
+    This constructs two LLMs, each with a limited number of GPU blocks. The
+    limit is decided such that as the sequences in the batch grow, sequences
+    must be preempted and removed from cache.
+
+    If the output token ids are equivalent, then we have confidence that the KV
+    cache is not corrupted in the v2 block manager.
+
+    NOTE: We want a significant number of generated tokens so that any incorrect
+    KV mapping has time to build up error.
+    """
+    output_len = 1024
+    temperature = 0.0
+
+    # We want to ensure equality even with preemption.
+    # We force the total block size to be 1 + cdiv(output_len, block_size)
+    # so that only one sequence can fit at a time (once the sequences grow).
+
+    prompts = [
+        "Hello, my name is",
+        "The president of the United States is",
+        "The capital of France is",
+        "The future of AI is",
+    ]
+
+    prompts = [prompt for prompt, _ in zip(cycle(prompts), range(batch_size))]
+
+    sampling_params = SamplingParams(
+        max_tokens=output_len,
+        ignore_eos=True,
+        temperature=temperature,
+    )
+
+    print('Getting token ids from block manager v1')
+    baseline_token_ids = get_token_ids_from_llm_generator(
+        baseline_llm_generator, prompts, sampling_params)
+
+    print('Getting token ids from block manager v2')
+    test_token_ids = get_token_ids_from_llm_generator(test_llm_generator,
+                                                      prompts, sampling_params)
+
+    for expected_token_ids, actual_token_ids in zip(baseline_token_ids,
+                                                    test_token_ids):
+        assert expected_token_ids == actual_token_ids
+
+    assert baseline_token_ids == test_token_ids
+
+
+@pytest.mark.parametrize(
+    "common_llm_kwargs",
+    [{
+        # Use a small model for a fast test.
+        "model": "facebook/opt-125m",
+
+        # skip cuda graph creation for fast test.
+        "enforce_eager": True,
+
+        # Allow only 5 sequences of ~1024 tokens in worst case.
+        "block_size": 16,
+        "num_gpu_blocks_override": 5 * (64 + 1),
+
+        # Test APC in v2 block
+        "use_v2_block_manager": True,
+    }])
+@pytest.mark.parametrize("per_test_common_llm_kwargs", [{}])
+@pytest.mark.parametrize("baseline_llm_kwargs", [{
+    "enable_prefix_caching": False
+}])
+@pytest.mark.parametrize("test_llm_kwargs", [{"enable_prefix_caching": True}])
+@pytest.mark.parametrize("batch_size", [10])
+@pytest.mark.parametrize("seed", [1])
+def test_auto_prefix_caching_with_preemption(baseline_llm_generator,
+                                             test_llm_generator, batch_size):
+    """Verify block manager v2 with auto prefix caching enabled produces same
+    outputs as auto prefix caching disabled, even when there is preemption.
+
+    This constructs two LLMs, each with a limited number of GPU blocks. The
+    limit is decided such that as the sequences in the batch grow, sequences
+    must be preempted and removed from cache.
+
+    If the output token ids are equivalent, then we have confidence that auto
+    prefix caching itself at least does not cause incorrect results.
+    """
+    output_len = 1024
+    temperature = 0.0
+
+    # We want to ensure equality even with preemption.
+    # We force the total block size to be 1 + cdiv(output_len, block_size)
+    # so that only one sequence can fit at a time (once the sequences grow).
+    prompts = [
+        "Hello, my name is",
+        "The president of the United States is",
+        "The capital of France is",
+        "The future of AI is",
+    ]
+
+    prompts = [prompt for prompt, _ in zip(cycle(prompts), range(batch_size))]
+
+    sampling_params = SamplingParams(
+        max_tokens=output_len,
+        ignore_eos=True,
+        temperature=temperature,
+    )
+
+    print('Getting token ids with APC disabled')
+    baseline_token_ids = get_token_ids_from_llm_generator(
+        baseline_llm_generator, prompts, sampling_params)
+
+    print('Getting token ids with APC enabled')
+    test_token_ids = get_token_ids_from_llm_generator(test_llm_generator,
+                                                      prompts, sampling_params)
+
+    for expected_token_ids, actual_token_ids in zip(baseline_token_ids,
+                                                    test_token_ids):
+        assert expected_token_ids == actual_token_ids
+
+    assert baseline_token_ids == test_token_ids
+
+
 def get_token_ids_from_llm_generator(llm_generator, prompts, sampling_params):
     for llm in llm_generator:
         outputs = llm.generate(prompts, sampling_params, use_tqdm=True)
diff --git a/tests/core/block/test_prefix_caching_block.py b/tests/core/block/test_prefix_caching_block.py
index 5f4d58dd5fd39..c4c680e109a84 100644
--- a/tests/core/block/test_prefix_caching_block.py
+++ b/tests/core/block/test_prefix_caching_block.py
@@ -358,6 +358,131 @@ def test_get_num_free_blocks_shared(num_blocks: int, block_size: int,
                                                        i)
             allocator.free(block)
 
+    @staticmethod
+    @pytest.mark.parametrize("num_blocks", [1024])
+    @pytest.mark.parametrize("block_size", [16])
+    @pytest.mark.parametrize("seed", list(range(20)))
+    def test_get_common_computed_block_ids(num_blocks: int, block_size: int,
+                                           seed: int):
+        """Verify get_common_computed_block_ids could get correct result
+        by creating two immutable chains that share a prefix up to a random
+        position, and checking that the number of common computed blocks
+        matches the number of full blocks in that shared prefix.
+        """
+        random.seed(seed)
+        allocator = PrefixCachingBlockAllocator(num_blocks=num_blocks * 2,
+                                                block_size=block_size)
+        num_blocks_to_consume = random.randint(1, num_blocks - 1)
+
+        # Create token ids that fill num_blocks_to_consume blocks.
+        token_ids = list(range(num_blocks_to_consume * block_size))
+        blocks = list(range(num_blocks_to_consume))
+
+        first_chain = TestPrefixCachingBlockAllocator.create_immutable_chain(
+            block_size=block_size,
+            token_ids=token_ids,
+            allocator=allocator,
+        )
+
+        # mark all blocks in first chain as computed
+        allocator.mark_blocks_as_computed(blocks)
+
+        # From zero_point onwards, second_chain's token_ids are set to -1, so
+        # the two chains diverge from that position on.
+        zero_point = random.randint(1, len(token_ids) - 1)
+        zero_point_blocks = zero_point // block_size
+        token_ids[zero_point:] = [-1] * (len(token_ids) - zero_point)
+
+        second_chain = TestPrefixCachingBlockAllocator.create_immutable_chain(
+            block_size=block_size,
+            token_ids=token_ids,
+            allocator=allocator,
+        )
+
+        first_computed_ids = [
+            first_chain[i].block_id for i in range(num_blocks_to_consume)
+        ]
+        second_computed_ids = [
+            second_chain[i].block_id for i in range(num_blocks_to_consume)
+        ]
+        res = allocator.get_common_computed_block_ids(
+            [first_computed_ids, second_computed_ids])
+
+        assert (len(res) == zero_point_blocks)
+
+    # Test case where two last accessed times are equal
+    @staticmethod
+    @pytest.mark.parametrize("num_blocks", [1024])
+    @pytest.mark.parametrize("block_size", [16])
+    @pytest.mark.parametrize("seed", list(range(20)))
+    def test_eviction_order(num_blocks: int, block_size: int, seed: int):
+        """This test case simulate the two chain created and free in order,
+        and together they would exhaust the initial freed blocks.
+
+        So the next block created after those two chains shall reuse a block
+        from the first chain, as its blocks have the oldest last-accessed time.
+        Since the first chain has two blocks, the last one shall be picked, as
+        it has the larger total token count.
+        """
+
+        random.seed(seed)
+        allocator = PrefixCachingBlockAllocator(num_blocks=num_blocks,
+                                                block_size=block_size)
+        num_blocks_to_consume = num_blocks + 1
+
+        token_ids = list(range(num_blocks_to_consume * block_size))
+
+        num_blocks_in_first_chain = 2
+        num_tokens_in_first_chain = block_size * num_blocks_in_first_chain
+        # First chain takes the first num_blocks_in_first_chain blocks
+        first_chain = TestPrefixCachingBlockAllocator.create_immutable_chain(
+            block_size=block_size,
+            token_ids=token_ids[:num_tokens_in_first_chain],
+            allocator=allocator,
+        )
+        # Only the first chain's blocks should be allocated at this point
+        assert allocator.get_num_free_blocks() == (num_blocks -
+                                                   num_blocks_in_first_chain)
+
+        # Set the last accessed time of the first chain's blocks to 1
+        blocks_ids = [block.block_id for block in first_chain]
+        allocator.mark_blocks_as_accessed(blocks_ids, 1)
+
+        # Second chain takes the rest of the blocks
+        second_chain = TestPrefixCachingBlockAllocator.create_immutable_chain(
+            block_size=block_size,
+            token_ids=token_ids[num_tokens_in_first_chain:-block_size],
+            allocator=allocator,
+        )
+
+        # There shouldn't be any blocks left at this point
+        assert allocator.get_num_free_blocks() == (0)
+
+        assert len(first_chain) == num_blocks_in_first_chain
+        last_block_id = first_chain[-1].block_id
+        # Free each block in the first chain.
+        for i, block in enumerate(first_chain):
+            allocator.free(block)
+
+        # Set the last accessed time on all of the blocks in the second chain
+        # to 2
+        blocks_ids = [block.block_id for block in second_chain]
+        allocator.mark_blocks_as_accessed(blocks_ids, 2)
+
+        # Free each block in the second chain.
+        for i, block in enumerate(second_chain):
+            allocator.free(block)
+
+        # Allocate a new block and check that it's the least recently used block
+        # from the first chain.
+        new_block = TestPrefixCachingBlockAllocator.create_immutable_chain(
+            block_size=block_size,
+            token_ids=token_ids[-block_size:],
+            allocator=allocator,
+        )
+
+        assert new_block[0].block_id == last_block_id
+
     @staticmethod
     def create_immutable_chain(
         block_size: int,
diff --git a/vllm/core/block/cpu_gpu_block_allocator.py b/vllm/core/block/cpu_gpu_block_allocator.py
index 3135e194c5937..23e1a4cf91266 100644
--- a/vllm/core/block/cpu_gpu_block_allocator.py
+++ b/vllm/core/block/cpu_gpu_block_allocator.py
@@ -190,10 +190,18 @@ def clear_copy_on_writes(self) -> Dict[int, List[int]]:
         device = Device.GPU
         return self._allocators[device].clear_copy_on_writes()
 
-    def mark_blocks_as_computed(self) -> None:
+    def mark_blocks_as_accessed(self, block_ids: List[int],
+                                now: float) -> None:
+        """Mark blocks as accessed, only use for prefix caching."""
         # Prefix caching only supported on GPU.
         device = Device.GPU
-        return self._allocators[device].mark_blocks_as_computed()
+        return self._allocators[device].mark_blocks_as_accessed(block_ids, now)
+
+    def mark_blocks_as_computed(self, block_ids: List[int]) -> None:
+        """Mark blocks as computed, only used for prefix caching."""
+        # Prefix caching only supported on GPU.
+        device = Device.GPU
+        return self._allocators[device].mark_blocks_as_computed(block_ids)
 
     def get_common_computed_block_ids(
             self, seq_block_ids: List[List[int]]) -> List[int]:
diff --git a/vllm/core/block/interfaces.py b/vllm/core/block/interfaces.py
index 50ce922118124..440d6a4b04d3b 100644
--- a/vllm/core/block/interfaces.py
+++ b/vllm/core/block/interfaces.py
@@ -81,6 +81,10 @@ def all_block_ids(self) -> FrozenSet[int]:
     def clear_copy_on_writes(self) -> Dict[int, List[int]]:
         pass
 
+    @abstractmethod
+    def mark_blocks_as_accessed(self) -> None:
+        pass
+
     @abstractmethod
     def mark_blocks_as_computed(self) -> None:
         pass
diff --git a/vllm/core/block/naive_block.py b/vllm/core/block/naive_block.py
index f8e9265bb2d67..a0bf33912d935 100644
--- a/vllm/core/block/naive_block.py
+++ b/vllm/core/block/naive_block.py
@@ -174,7 +174,16 @@ def clear_copy_on_writes(self) -> Dict[BlockId, List[BlockId]]:
         """
         return self._cow_tracker.clear_cows()
 
-    def mark_blocks_as_computed(self) -> None:
+    def mark_blocks_as_accessed(self, block_ids: List[int],
+                                now: float) -> None:
+        """Mark blocks as accessed, used in prefix caching.
+
+        Since the naive allocator does not implement prefix caching, we do
+        nothing.
+        """
+        pass
+
+    def mark_blocks_as_computed(self, block_ids: List[int]) -> None:
         """Mark blocks as computed, used in prefix caching.
 
         Since the naive allocator does not implement prefix caching, we do
diff --git a/vllm/core/block/prefix_caching_block.py b/vllm/core/block/prefix_caching_block.py
index 6aa75a8abb80a..292a750146ae6 100644
--- a/vllm/core/block/prefix_caching_block.py
+++ b/vllm/core/block/prefix_caching_block.py
@@ -7,10 +7,16 @@
                                     get_all_blocks_recursively)
 from vllm.core.block.interfaces import Block, BlockAllocator
 from vllm.core.block.naive_block import NaiveBlock, NaiveBlockAllocator
+from vllm.core.evictor_v2 import EvictionPolicy, Evictor, make_evictor
 
 PrefixHash = int
 BlockId = int
 
+# By default, we init a block's access time to _DEFAULT_LAST_ACCESSED_TIME
+# so that if a block still holds _DEFAULT_LAST_ACCESSED_TIME,
+# we know this block hasn't been accessed yet.
+_DEFAULT_LAST_ACCESSED_TIME = -1
+
 
 class PrefixCachingBlockAllocator(BlockAllocator):
     """A block allocator that implements prefix caching.
@@ -27,22 +33,19 @@ class PrefixCachingBlockAllocator(BlockAllocator):
             from 0 to num_blocks - 1.
     """
 
-    # TODO last access time / evictor integration
-
     def __init__(
         self,
         num_blocks: int,
         block_size: int,
         block_ids: Optional[Iterable[int]] = None,
+        eviction_policy: Optional[EvictionPolicy] = EvictionPolicy.LRU,
     ):
         # A mapping of prefix hash to block index. All blocks which have a
         # prefix hash will be in this dict, even if they have refcount 0.
         self._cached_blocks: Dict[PrefixHash, BlockId] = {}
 
-        # A mapping of prefix hash to block index. All blocks which have a
-        # prefix hash AND refcount 0 will be in this dict. Thus, it is a subset
-        # of self._cached_blocks.
-        self._unused_cached_blocks: Dict[PrefixHash, BlockId] = {}
+        # A mapping of blockId to Block to track those cached blocks
+        self._blocks: Dict[BlockId, Block] = {}
 
         # An allocator for blocks that do not have prefix hashes.
         self._hashless_allocator = NaiveBlockAllocator(
@@ -54,6 +57,10 @@ def __init__(
 
         self._block_size = block_size
 
+        # Evictor used to decide how we handle those computed blocks
+        # when memory pressure is high.
+        self.evictor: Evictor = make_evictor(eviction_policy)
+
         # We share the refcounter between allocators. This allows us to promote
         # blocks originally allocated in the hashless allocator to immutable
         # blocks.
@@ -72,6 +79,7 @@ def _create_block(
         block_size: int,
         allocator: BlockAllocator,
         block_id: Optional[int] = None,
+        computed: Optional[bool] = False,
     ) -> Block:
         # Bind block to self.
         allocator = self
@@ -82,6 +90,7 @@ def _create_block(
             block_size=block_size,
             block_id=block_id,
             prefix_caching_allocator=allocator,
+            computed=computed,
         )
 
     def allocate_immutable(self, prev_block: Optional[Block],
@@ -109,14 +118,12 @@ def allocate_immutable(self, prev_block: Optional[Block],
         cached_block_id = self._cached_blocks.get(block.content_hash, None)
         if cached_block_id is not None:
             block.block_id = cached_block_id
-            self._incr_refcount_cached_block(block.content_hash,
-                                             block.block_id)
+            self._incr_refcount_cached_block(block, block.block_id)
             return block
 
         block = self.allocate_mutable(prev_block)
         block.append_token_ids(token_ids)
         assert block.content_hash is not None
-        # TODO computed bit
 
         return block
 
@@ -133,41 +140,67 @@ def allocate_mutable(self, prev_block: Block) -> Block:
         assert_prefix_caching_block_or_none(prev_block)
 
         try:
-            return self._hashless_allocator.allocate_mutable(
+            block = self._hashless_allocator.allocate_mutable(
                 prev_block=prev_block)
+
+            assert block.block_id not in self._blocks
+            self._blocks[block.block_id] = block
+            return block
         except BlockAllocator.NoFreeBlocksError:
             # We must check the unused cached blocks before raising OOM.
             pass
 
-        if self._unused_cached_blocks:
-            # TODO policy for selecting block to remove
-            content_hash_to_evict = next(iter(self._unused_cached_blocks))
+        # If the evictor has blocks available for eviction, evict a block
+        # and return it.
+        if self.evictor.num_blocks > 0:
+            block_id, content_hash_to_evict = self.evictor.evict()
+
+            # Several blocks may share the same content hash: when a
+            # later block takes the mutable-to-immutable path, its
+            # physical block is added into the evictor.
+            # In that case we shall not pop the hash from
+            # _cached_blocks, as the same content is still used by
+            # others, which means we need to check the refcount before
+            # deciding to pop the entry.
 
-            # Clear content hash mapping; the block will be overwritten.
-            del self._cached_blocks[content_hash_to_evict]
+            _block_id = self._cached_blocks[content_hash_to_evict]
+            refcount = self._refcounter.get(_block_id)
+            if refcount == 1:
+                self._cached_blocks.pop(content_hash_to_evict)
+                assert _block_id == block_id
 
-            block_id = self._unused_cached_blocks.pop(content_hash_to_evict)
-            refcount = self._refcounter.incr(block_id)
-            assert refcount == 1
+            self._refcounter.incr(block_id)
+
+            # the block comes from evictor already contain computed result
             block = self._create_block(
                 prev_block=prev_block,
                 token_ids=[],
                 block_size=self._block_size,
                 allocator=self,
                 block_id=block_id,
+                computed=True,
             )
             assert block.content_hash is None
+
+            assert block.block_id not in self._blocks
+            self._blocks[block.block_id] = block
             return block
 
         # No block available in hashless allocator, nor in unused cache blocks.
         raise BlockAllocator.NoFreeBlocksError()
 
-    def _incr_refcount_cached_block(self, content_hash: int,
+    def _incr_refcount_cached_block(self, block: Block,
                                     block_id: BlockId) -> None:
+        # since block is already computed, mark it
+        block.computed = True
+
         refcount = self._refcounter.incr(block_id)
         if refcount == 1:
-            assert content_hash in self._unused_cached_blocks
-            del self._unused_cached_blocks[content_hash]
+            # if the block gets referenced, it shall no longer be in the
+            # evictor; put it into _blocks for tracking
+            if block_id in self.evictor:
+                self.evictor.remove(block_id)
+            self._blocks[block_id] = block
 
     def free(self, block: Block) -> None:
         """Decrement the refcount of the block. If the decremented refcount is
@@ -180,6 +213,7 @@ def free(self, block: Block) -> None:
                 is not None), "freeing unallocated block is undefined"
 
         self._free_block_id_for_block(block.block_id, block)
+
         block.block_id = None
 
     def _free_block_id_for_block(self, block_id: BlockId,
@@ -187,15 +221,21 @@ def _free_block_id_for_block(self, block_id: BlockId,
         assert isinstance(block, PrefixCachingBlock)
 
         if block.content_hash is None:
+            refcount = self._refcounter.get(block_id)
+            # In the fork case a block may hold more than one ref, so we
+            # cannot drop it from tracking while its refcount is above 1.
+            if refcount <= 1:
+                del self._blocks[block.block_id]
             return self._hashless_allocator.free(block)
 
         refcount = self._refcounter.decr(block_id)
 
-        # If no longer used, add the block to the unused cached blocks.
+        # If no longer used, add the block to the evictor.
         if refcount == 0:
-            assert block.content_hash not in self._unused_cached_blocks
             assert block.content_hash in self._cached_blocks
-            self._unused_cached_blocks[block.content_hash] = block_id
+            del self._blocks[block.block_id]
+            self.evictor.add(block.block_id, block.content_hash,
+                             block.num_tokens_total, block.last_accessed)
 
     def fork(self, last_block: Block) -> List[Block]:
         """Creates a new sequence of blocks that shares the same underlying
@@ -230,9 +270,9 @@ def fork(self, last_block: Block) -> List[Block]:
 
     def get_num_free_blocks(self) -> int:
         # The number of free blocks is the number of hashless free blocks
-        # plus the number of hashful blocks that are unused.
-        return self._hashless_allocator.get_num_free_blocks() + len(
-            self._unused_cached_blocks)
+        # plus the number of blocks evictor could free from its list.
+        return self._hashless_allocator.get_num_free_blocks(
+        ) + self.evictor.num_blocks
 
     @property
     def all_block_ids(self) -> frozenset[int]:
@@ -266,7 +306,7 @@ def promote_to_immutable_block(self,
         else:
             self._free_block_id_for_block(block.block_id, block)
             self._incr_refcount_cached_block(
-                block.content_hash, self._cached_blocks[block.content_hash])
+                block, self._cached_blocks[block.content_hash])
 
         return self._cached_blocks[block.content_hash]
 
@@ -293,29 +333,60 @@ def clear_copy_on_writes(self) -> Dict[BlockId, List[BlockId]]:
         """
         return self._cow_tracker.clear_cows()
 
-    def mark_blocks_as_computed(self) -> None:
+    def mark_blocks_as_accessed(self, block_ids: List[int],
+                                now: float) -> None:
+        """Mark blocks as accessed, used in prefix caching.
+
+        If the block is added into evictor, we need to update corresponding
+        info in evictor's metadata.
+        """
+
+        for block_id in block_ids:
+            if block_id in self._blocks:
+                self._blocks[block_id].last_accessed = now
+            elif block_id in self.evictor:
+                self.evictor.update(block_id, now)
+            else:
+                raise ValueError(
+                    "Mark block as accessed which is not belonged to GPU")
+
+    def mark_blocks_as_computed(self, block_ids: List[int]) -> None:
         """Mark blocks as computed, used in prefix caching."""
-        # TODO Track computed blocks.
-        pass
+
+        for block_id in block_ids:
+            if block_id in self._blocks:
+                # only full blocks are valid for prefix caching
+                if self._blocks[block_id].is_full:
+                    self._blocks[block_id].computed = True
+            elif block_id not in self.evictor:
+                raise ValueError(f"Mark {block_id=} as computed which "
+                                 "is not belonged to GPU")
+
+    def block_is_computed(self, block_id: int) -> bool:
+        if block_id in self._blocks:
+            return self._blocks[block_id].computed
+        else:
+            return block_id in self.evictor
 
     def get_common_computed_block_ids(
             self, seq_block_ids: List[List[int]]) -> List[int]:
         """Return the block ids that are common for a given sequence group.
 
-        Used in prefill (can skip prefill of some blocks).
+        Only those blocks that are immutable and already marked as
+        computed are taken into consideration.
         """
 
-        # TODO: Track computed blocks.
-        computed = lambda block_id: False
-
         # NOTE We exclude the last block to avoid the case where the entire
         # prompt is cached. This would cause erroneous behavior in model
         # runner.
+
         ids_list = [
-            takewhile(lambda block_id: computed(block_id), seq[:-1])
-            for seq in seq_block_ids
+            list(
+                takewhile(lambda block_id: self.block_is_computed(block_id),
+                          seq[:-1])) for seq in seq_block_ids
         ]
-        return commonprefix([ids for ids in ids_list if ids != []])
+        res = commonprefix([ids for ids in ids_list if ids != []])
+        return res
 
 
 class PrefixCachingBlock(Block):
@@ -345,12 +416,16 @@ def __init__(
         block_size: int,
         prefix_caching_allocator: PrefixCachingBlockAllocator,
         block_id: Optional[int] = None,
+        computed: Optional[bool] = False,
     ):
         assert_prefix_caching_block_or_none(prev_block)
 
         self._prev_block = prev_block
         self._cached_content_hash: Optional[int] = None
+        self._cached_num_tokens_total: Optional[int] = None
         self._prefix_caching_allocator = prefix_caching_allocator
+        self.last_accessed = _DEFAULT_LAST_ACCESSED_TIME
+        self.computed = computed
 
         self._block = NaiveBlock(
             prev_block=prev_block,
@@ -398,6 +473,27 @@ def is_full(self) -> bool:
     def num_empty_slots(self) -> int:
         return self._block.num_empty_slots
 
+    @property
+    def num_tokens_total(self) -> int:
+        """return the total tokens so far.
+
+        Here we walk the block chain back to the first block, and
+        cache the result locally to prevent repeated computations.
+        """
+        if self._cached_num_tokens_total is not None:
+            return self._cached_num_tokens_total
+
+        _block = self
+        self._cached_num_tokens_total = 0
+
+        # TODO: the current implementation takes O(N^2); we expect a
+        # future version to be O(1) here.
+        while _block is not None:
+            self._cached_num_tokens_total += len(_block.token_ids)
+            _block = _block.prev_block
+
+        return self._cached_num_tokens_total
+
     @property
     def block_size(self) -> int:
         return self._block.block_size
diff --git a/vllm/core/block_manager_v1.py b/vllm/core/block_manager_v1.py
index 73e7dafb72c7f..4a9a2999e3913 100644
--- a/vllm/core/block_manager_v1.py
+++ b/vllm/core/block_manager_v1.py
@@ -8,7 +8,7 @@
 from typing import Set
 
 from vllm.block import BlockTable, PhysicalTokenBlock
-from vllm.core.evictor import EvictionPolicy, Evictor, make_evictor
+from vllm.core.evictor_v1 import EvictionPolicy, Evictor, make_evictor
 from vllm.core.interfaces import AllocStatus, BlockSpaceManager
 from vllm.logger import init_logger
 from vllm.sequence import Sequence, SequenceGroup, SequenceStatus
diff --git a/vllm/core/block_manager_v2.py b/vllm/core/block_manager_v2.py
index 6339a6baf4161..0857605e2d005 100644
--- a/vllm/core/block_manager_v2.py
+++ b/vllm/core/block_manager_v2.py
@@ -72,14 +72,12 @@ def __init__(
         self.watermark = watermark
         assert watermark >= 0.0
 
-        assert not enable_caching, "Prefix caching not yet supported"
         self.enable_caching = enable_caching
 
         self.watermark_blocks = int(watermark * num_gpu_blocks)
 
         self.block_allocator = CpuGpuBlockAllocator.create(
-            # Currently, only naive blocks are supported (no prefix caching).
-            allocator_type="naive",
+            allocator_type="prefix_caching" if enable_caching else "naive",
             num_gpu_blocks=num_gpu_blocks,
             num_cpu_blocks=num_cpu_blocks,
             block_size=block_size,
@@ -194,17 +192,26 @@ def get_block_table(self, seq: Sequence) -> List[int]:
         assert all(b is not None for b in block_ids)
         return block_ids
 
-    def access_all_blocks_in_seq(self, seq, now):
-        # TODO add prefix caching support.
-        # Tracked here https://github.com/vllm-project/vllm/issues/3667
-        pass
+    def access_all_blocks_in_seq(self, seq: Sequence, now: float):
+        # Update the last-accessed time of all the blocks accessed
+        # in this step.
+        # The access time is currently only useful for prefix caching:
+        # the allocator's internal eviction policy uses it to decide
+        # which cached block to reuse, so that cached content stays
+        # reusable for as long as possible.
+        if self.enable_caching:
+            block_table = self.block_tables[seq.seq_id]
+            block_ids = []
+            for block_id in block_table.physical_block_ids:
+                block_ids.append(block_id)
+            self.block_allocator.mark_blocks_as_accessed(block_ids, now)
 
     def mark_blocks_as_computed(self, seq_group: SequenceGroup):
-        # We ignore the sequence group as its not necessary. After the batch is
-        # formed by the scheduler, we do not need to mark blocks from individual
-        # sequence groups as computed -- all blocks in the batch can be marked
-        # as computed.
-        self.block_allocator.mark_blocks_as_computed()
+        # Marking blocks as computed is only needed for prefix caching,
+        # and currently we can determine whether a block is computed
+        # by checking whether it has a content hash.
+        # So this function is a no-op for the v2 block manager.
+        pass
 
     def get_common_computed_block_ids(
             self, seqs: List[Sequence]) -> GenericSequence[int]:
diff --git a/vllm/core/evictor.py b/vllm/core/evictor_v1.py
similarity index 100%
rename from vllm/core/evictor.py
rename to vllm/core/evictor_v1.py
diff --git a/vllm/core/evictor_v2.py b/vllm/core/evictor_v2.py
new file mode 100644
index 0000000000000..b902a39263d14
--- /dev/null
+++ b/vllm/core/evictor_v2.py
@@ -0,0 +1,122 @@
+import enum
+from abc import ABC, abstractmethod, abstractproperty
+from typing import OrderedDict, Tuple
+
+
+class EvictionPolicy(enum.Enum):
+    """Enum for eviction policy used by make_evictor to instantiate the correct
+       Evictor subclass.
+    """
+    LRU = enum.auto()
+
+
+class Evictor(ABC):
+    """The Evictor subclasses should be used by the BlockAllocator class to
+    handle eviction of freed PhysicalTokenBlocks.
+    """
+
+    @abstractmethod
+    def __init__(self):
+        pass
+
+    @abstractmethod
+    def __contains__(self, block_id: int) -> bool:
+        pass
+
+    @abstractmethod
+    def evict(self) -> Tuple[int, int]:
+        """Runs the eviction algorithm and returns the evicted block's
+        physical block id along with its content hash
+        """
+        pass
+
+    @abstractmethod
+    def add(self, block_id: int, content_hash: int, num_hashed_tokens: int,
+            last_accessed: int):
+        """Adds block to the evictor, making it a candidate for eviction"""
+        pass
+
+    @abstractmethod
+    def update(self, block_id: int, last_accessed: int):
+        """Update corresponding block's access time in metadata"""
+        pass
+
+    @abstractproperty
+    def num_blocks(self) -> int:
+        pass
+
+
+class BlockMetaData():
+    """Data structure for storing the key data describing a cached block, so
+    that the evictor can use it to decide which block to choose for eviction.
+
+    Here we use the physical block id as the dict key, as there may be several
+    blocks with the same content hash, but their physical id is unique.
+    """
+
+    def __init__(self, content_hash: int, num_hashed_tokens: int,
+                 last_accessed: int):
+        self.content_hash = content_hash
+        self.num_hashed_tokens = num_hashed_tokens
+        self.last_accessed = last_accessed
+
+
+class LRUEvictor(Evictor):
+    """Evicts in a least-recently-used order using the last_accessed timestamp
+    recorded in each block's BlockMetaData. If there are multiple blocks with
+    the same last_accessed time, then the one with the largest num_hashed_tokens
+    will be evicted. If two blocks each have the lowest last_accessed time and
+    highest num_hashed_tokens value, then one will be chosen arbitrarily
+    """
+
+    def __init__(self):
+        self.free_table: OrderedDict[int, BlockMetaData] = OrderedDict()
+
+    def __contains__(self, block_id: int) -> bool:
+        return block_id in self.free_table
+
+    def evict(self) -> Tuple[int, int]:
+        if len(self.free_table) == 0:
+            raise ValueError("No usable cache memory left")
+
+        evicted_block = next(iter(self.free_table.values()))
+        evicted_block_id = next(iter(self.free_table.keys()))
+        # Scan every block in the free table and pick the one with the
+        # lowest last_accessed timestamp, breaking ties in favour of the
+        # block with the maximum number of hashed tokens.
+        for _id, block in self.free_table.items():
+            if evicted_block.last_accessed > block.last_accessed or (
+                    evicted_block.last_accessed == block.last_accessed and
+                    evicted_block.num_hashed_tokens < block.num_hashed_tokens):
+                evicted_block = block
+                evicted_block_id = _id
+
+        self.free_table.pop(evicted_block_id)
+
+        return evicted_block_id, evicted_block.content_hash
+
+    def add(self, block_id: int, content_hash: int, num_hashed_tokens: int,
+            last_accessed: int):
+        self.free_table[block_id] = BlockMetaData(content_hash,
+                                                  num_hashed_tokens,
+                                                  last_accessed)
+
+    def update(self, block_id: int, last_accessed: int):
+        self.free_table[block_id].last_accessed = last_accessed
+
+    def remove(self, block_id: int):
+        if block_id not in self.free_table:
+            raise ValueError(
+                "Attempting to remove block that's not in the evictor")
+        self.free_table.pop(block_id)
+
+    @property
+    def num_blocks(self) -> int:
+        return len(self.free_table)
+
+
+def make_evictor(eviction_policy: EvictionPolicy) -> Evictor:
+    if eviction_policy == EvictionPolicy.LRU:
+        return LRUEvictor()
+    else:
+        raise ValueError(f"Unknown cache eviction policy: {eviction_policy}")

From a657bfc48a11d87de146629a7b6c03e9ccfbc3fc Mon Sep 17 00:00:00 2001
From: Nick Hill <nickhill@us.ibm.com>
Date: Wed, 1 May 2024 11:41:59 -0700
Subject: [PATCH 22/65] [Core] Add `multiproc_worker_utils` for
 multiprocessing-based workers (#4357)

---
 tests/engine/test_multiproc_workers.py  | 176 ++++++++++++++++
 vllm/executor/multiproc_worker_utils.py | 264 ++++++++++++++++++++++++
 2 files changed, 440 insertions(+)
 create mode 100644 tests/engine/test_multiproc_workers.py
 create mode 100644 vllm/executor/multiproc_worker_utils.py

diff --git a/tests/engine/test_multiproc_workers.py b/tests/engine/test_multiproc_workers.py
new file mode 100644
index 0000000000000..610ad9732fb91
--- /dev/null
+++ b/tests/engine/test_multiproc_workers.py
@@ -0,0 +1,176 @@
+import asyncio
+from concurrent.futures import ThreadPoolExecutor
+from functools import partial
+from time import sleep
+from typing import Any, List, Tuple
+
+import pytest
+
+from vllm.executor.multiproc_worker_utils import (ProcessWorkerWrapper,
+                                                  ResultHandler, WorkerMonitor)
+
+
+class DummyWorker:
+    """Dummy version of vllm.worker.worker.Worker"""
+
+    def __init__(self, rank: int):
+        self.rank = rank
+
+    def worker_method(self, worker_input: Any) -> Tuple[int, Any]:
+        sleep(0.05)
+
+        if isinstance(worker_input, Exception):
+            # simulate error case
+            raise worker_input
+
+        return self.rank, worker_input
+
+
+def _start_workers() -> Tuple[List[ProcessWorkerWrapper], WorkerMonitor]:
+    result_handler = ResultHandler()
+    workers = [
+        ProcessWorkerWrapper(result_handler, partial(DummyWorker, rank=rank))
+        for rank in range(8)
+    ]
+
+    worker_monitor = WorkerMonitor(workers, result_handler)
+    assert not worker_monitor.is_alive()
+
+    result_handler.start()
+    worker_monitor.start()
+    assert worker_monitor.is_alive()
+
+    return workers, worker_monitor
+
+
+def test_local_workers() -> None:
+    """Test workers with sync task submission"""
+
+    workers, worker_monitor = _start_workers()
+
+    def execute_workers(worker_input: str) -> None:
+        worker_outputs = [
+            worker.execute_method("worker_method", worker_input)
+            for worker in workers
+        ]
+
+        for rank, output in enumerate(worker_outputs):
+            assert output.get() == (rank, worker_input)
+
+    executor = ThreadPoolExecutor(max_workers=4)
+
+    # Test concurrent submission from different threads
+    futures = [
+        executor.submit(partial(execute_workers, f"thread {thread_num}"))
+        for thread_num in range(4)
+    ]
+
+    for future in futures:
+        future.result()
+
+    # Test error case
+    exception = ValueError("fake error")
+    result = workers[0].execute_method("worker_method", exception)
+    try:
+        result.get()
+        pytest.fail("task should have failed")
+    except Exception as e:
+        assert isinstance(e, ValueError)
+        assert str(e) == "fake error"
+
+    # Test cleanup when a worker fails
+    assert worker_monitor.is_alive()
+    workers[3].process.kill()
+
+    # Other workers should get shut down here
+    worker_monitor.join(2)
+
+    # Ensure everything is stopped
+    assert not worker_monitor.is_alive()
+    assert all(not worker.process.is_alive() for worker in workers)
+
+    # Further attempts to submit tasks should fail
+    try:
+        _result = workers[0].execute_method("worker_method", "test")
+        pytest.fail("task should fail once workers have been shut down")
+    except Exception as e:
+        assert isinstance(e, ChildProcessError)
+
+
+def test_local_workers_clean_shutdown() -> None:
+    """Test clean shutdown"""
+
+    workers, worker_monitor = _start_workers()
+
+    assert worker_monitor.is_alive()
+    assert all(worker.process.is_alive() for worker in workers)
+
+    # Clean shutdown
+    worker_monitor.close()
+
+    worker_monitor.join(5)
+
+    # Ensure everything is stopped
+    assert not worker_monitor.is_alive()
+    assert all(not worker.process.is_alive() for worker in workers)
+
+    # Further attempts to submit tasks should fail
+    try:
+        _result = workers[0].execute_method("worker_method", "test")
+        pytest.fail("task should fail once workers have been shut down")
+    except Exception as e:
+        assert isinstance(e, ChildProcessError)
+
+
+@pytest.mark.asyncio
+async def test_local_workers_async() -> None:
+    """Test local workers with async task submission"""
+
+    workers, worker_monitor = _start_workers()
+
+    async def execute_workers(worker_input: str) -> None:
+        worker_coros = [
+            worker.execute_method_async("worker_method", worker_input)
+            for worker in workers
+        ]
+
+        results = await asyncio.gather(*worker_coros)
+        for rank, result in enumerate(results):
+            assert result == (rank, worker_input)
+
+    tasks = [
+        asyncio.create_task(execute_workers(f"task {task_num}"))
+        for task_num in range(4)
+    ]
+
+    for task in tasks:
+        await task
+
+    # Test error case
+    exception = ValueError("fake error")
+    try:
+        _result = await workers[0].execute_method_async(
+            "worker_method", exception)
+        pytest.fail("task should have failed")
+    except Exception as e:
+        assert isinstance(e, ValueError)
+        assert str(e) == "fake error"
+
+    # Test cleanup when a worker fails
+    assert worker_monitor.is_alive()
+    workers[3].process.kill()
+
+    # Other workers should get shut down here
+    worker_monitor.join(2)
+
+    # Ensure everything is stopped
+    assert not worker_monitor.is_alive()
+    assert all(not worker.process.is_alive() for worker in workers)
+
+    # Further attempts to submit tasks should fail
+    try:
+        _result = await workers[0].execute_method_async(
+            "worker_method", "test")
+        pytest.fail("task should fail once workers have been shut down")
+    except Exception as e:
+        assert isinstance(e, ChildProcessError)
diff --git a/vllm/executor/multiproc_worker_utils.py b/vllm/executor/multiproc_worker_utils.py
new file mode 100644
index 0000000000000..0c04796bc38e3
--- /dev/null
+++ b/vllm/executor/multiproc_worker_utils.py
@@ -0,0 +1,264 @@
+import asyncio
+import multiprocessing
+import os
+import sys
+import threading
+import traceback
+import uuid
+from dataclasses import dataclass
+from multiprocessing import Queue
+from multiprocessing.connection import wait
+from multiprocessing.process import BaseProcess
+from typing import (Any, Callable, Dict, Generic, List, Optional, TextIO,
+                    TypeVar, Union)
+
+from vllm.logger import init_logger
+
+logger = init_logger(__name__)
+
+T = TypeVar('T')
+
+_TERMINATE = "TERMINATE"  # sentinel
+
+# ANSI color codes
+CYAN = '\033[1;36m'
+RESET = '\033[0;0m'
+
+JOIN_TIMEOUT_S = 2
+
+# Use dedicated multiprocess context for workers.
+# Both spawn and fork work
+mp_method = os.getenv("VLLM_WORKER_MULTIPROC_METHOD", "spawn")
+mp = multiprocessing.get_context(mp_method)
+
+
+@dataclass
+class Result(Generic[T]):
+    """Result of task dispatched to worker"""
+
+    task_id: uuid.UUID
+    value: Optional[T] = None
+    exception: Optional[BaseException] = None
+
+
+class ResultFuture(threading.Event, Generic[T]):
+    """Synchronous future for non-async case"""
+
+    def __init__(self):
+        super().__init__()
+        self.result: Optional[Result[T]] = None
+
+    def set_result(self, result: Result[T]):
+        self.result = result
+        self.set()
+
+    def get(self) -> T:
+        self.wait()
+        assert self.result is not None
+        if self.result.exception is not None:
+            raise self.result.exception
+        return self.result.value  # type: ignore[return-value]
+
+
+def _set_future_result(future: Union[ResultFuture, asyncio.Future],
+                       result: Result):
+    if isinstance(future, ResultFuture):
+        future.set_result(result)
+        return
+    loop = future.get_loop()
+    if result.exception is not None:
+        loop.call_soon_threadsafe(future.set_exception, result.exception)
+    else:
+        loop.call_soon_threadsafe(future.set_result, result.value)
+
+
+class ResultHandler(threading.Thread):
+    """Handle results from all workers (in background thread)"""
+
+    def __init__(self) -> None:
+        super().__init__(daemon=True)
+        self.result_queue = mp.Queue()
+        self.tasks: Dict[uuid.UUID, Union[ResultFuture, asyncio.Future]] = {}
+
+    def run(self):
+        for result in iter(self.result_queue.get, _TERMINATE):
+            future = self.tasks.pop(result.task_id)
+            _set_future_result(future, result)
+        # Ensure that all waiters will receive an exception
+        for task_id, future in self.tasks.items():
+            _set_future_result(
+                future,
+                Result(task_id=task_id,
+                       exception=ChildProcessError("worker died")))
+
+    def close(self):
+        self.result_queue.put(_TERMINATE)
+
+
+class WorkerMonitor(threading.Thread):
+    """Monitor worker status (in background thread)"""
+
+    def __init__(self, workers: List['ProcessWorkerWrapper'],
+                 result_handler: ResultHandler):
+        super().__init__(daemon=True)
+        self.workers = workers
+        self.result_handler = result_handler
+        self._close = False
+
+    def run(self) -> None:
+        # Blocks until any worker exits
+        dead_sentinels = wait([w.process.sentinel for w in self.workers])
+        if not self._close:
+            self._close = True
+
+            # Kill / cleanup all workers
+            for worker in self.workers:
+                process = worker.process
+                if process.sentinel in dead_sentinels:
+                    process.join(JOIN_TIMEOUT_S)
+                if process.exitcode is not None and process.exitcode != 0:
+                    logger.error("Worker %s pid %s died, exit code: %s",
+                                 process.name, process.pid, process.exitcode)
+            # Cleanup any remaining workers
+            logger.info("Killing local vLLM worker processes")
+            for worker in self.workers:
+                worker.kill_worker()
+            # Must be done after worker task queues are all closed
+            self.result_handler.close()
+
+        for worker in self.workers:
+            worker.process.join(JOIN_TIMEOUT_S)
+
+    def close(self):
+        if self._close:
+            return
+        self._close = True
+        logger.info("Terminating local vLLM worker processes")
+        for worker in self.workers:
+            worker.terminate_worker()
+        # Must be done after worker task queues are all closed
+        self.result_handler.close()
+
+
+class ProcessWorkerWrapper:
+    """Local process wrapper for vllm.worker.Worker,
+    for handling single-node multi-GPU tensor parallel."""
+
+    def __init__(self, result_handler: ResultHandler,
+                 worker_factory: Callable[[], Any]) -> None:
+        self._task_queue = mp.Queue()
+        self.result_queue = result_handler.result_queue
+        self.tasks = result_handler.tasks
+        self.process: BaseProcess = mp.Process(  # type: ignore[attr-defined]
+            target=_run_worker_process,
+            name="VllmWorkerProcess",
+            kwargs=dict(
+                worker_factory=worker_factory,
+                task_queue=self._task_queue,
+                result_queue=self.result_queue,
+            ),
+            daemon=True)
+
+        self.process.start()
+
+    def _enqueue_task(self, future: Union[ResultFuture, asyncio.Future],
+                      method: str, args, kwargs):
+        task_id = uuid.uuid4()
+        self.tasks[task_id] = future
+        try:
+            self._task_queue.put((task_id, method, args, kwargs))
+        except BaseException as e:
+            del self.tasks[task_id]
+            raise ChildProcessError("worker died") from e
+
+    def execute_method(self, method: str, *args, **kwargs):
+        future: ResultFuture = ResultFuture()
+        self._enqueue_task(future, method, args, kwargs)
+        return future
+
+    async def execute_method_async(self, method: str, *args, **kwargs):
+        future = asyncio.get_running_loop().create_future()
+        self._enqueue_task(future, method, args, kwargs)
+        return await future
+
+    def terminate_worker(self):
+        try:
+            self._task_queue.put(_TERMINATE)
+        except ValueError:
+            self.process.kill()
+        self._task_queue.close()
+
+    def kill_worker(self):
+        self._task_queue.close()
+        self.process.kill()
+
+
+def _run_worker_process(
+    worker_factory: Callable[[], Any],
+    task_queue: Queue,
+    result_queue: Queue,
+) -> None:
+    """Worker process event loop"""
+
+    # Add process-specific prefix to stdout and stderr
+    process_name = mp.current_process().name
+    pid = os.getpid()
+    _add_prefix(sys.stdout, process_name, pid)
+    _add_prefix(sys.stderr, process_name, pid)
+
+    # Initialize worker
+    worker = worker_factory()
+    del worker_factory
+
+    # Accept tasks from the engine in task_queue
+    # and return task output in result_queue
+    logger.info("Worker ready; awaiting tasks")
+    try:
+        for items in iter(task_queue.get, _TERMINATE):
+            output = None
+            exception = None
+            task_id, method, args, kwargs = items
+            try:
+                executor = getattr(worker, method)
+                output = executor(*args, **kwargs)
+            except BaseException as e:
+                tb = traceback.format_exc()
+                logger.error(
+                    "Exception in worker %s while processing method %s: %s, %s",
+                    process_name, method, e, tb)
+                exception = e
+            result_queue.put(
+                Result(task_id=task_id, value=output, exception=exception))
+    except KeyboardInterrupt:
+        pass
+    except Exception:
+        logger.exception("Worker failed")
+
+    logger.info("Worker exiting")
+
+
+def _add_prefix(file: TextIO, worker_name: str, pid: int) -> None:
+    """Prepend each output line with process-specific prefix"""
+
+    prefix = f"{CYAN}({worker_name} pid={pid}){RESET} "
+    file_write = file.write
+
+    def write_with_prefix(s: str):
+        if not s:
+            return
+        if file.start_new_line:  # type: ignore[attr-defined]
+            file_write(prefix)
+        idx = 0
+        while (next_idx := s.find('\n', idx)) != -1:
+            next_idx += 1
+            file_write(s[idx:next_idx])
+            if next_idx == len(s):
+                file.start_new_line = True  # type: ignore[attr-defined]
+                return
+            file_write(prefix)
+            idx = next_idx
+        file_write(s[idx:])
+        file.start_new_line = False  # type: ignore[attr-defined]
+
+    file.start_new_line = True  # type: ignore[attr-defined]
+    file.write = write_with_prefix  # type: ignore[method-assign]

From 24bb4fe432fffeccf7a27270ee70aff1b1b8a89a Mon Sep 17 00:00:00 2001
From: Philipp Moritz <pcmoritz@gmail.com>
Date: Wed, 1 May 2024 11:47:38 -0700
Subject: [PATCH 23/65] [Kernel] Update fused_moe tuning script for FP8 (#4457)

This PR updates the tuning script for the fused_moe kernel to support FP8 and also adds configurations for TP4. Note that for the configuration I removed num_warps and num_stages for small batch sizes since that improved performance and brought the benchmarks on par with the numbers before in that regime to make sure this is a strict improvement over the status quo.

All the numbers below are for mistralai/Mixtral-8x7B-Instruct-v0.1, 1000 input and 50 output tokens.

Before this PR (with static activation scaling):

qps = 1: 9.8 ms ITL, 0.49s e2e latency
qps = 2: 9.7 ms ITL, 0.49s e2e latency
qps = 4: 10.1 ms ITL, 0.52s e2e latency
qps = 6: 11.9 ms ITL, 0.59s e2e latency
qps = 8: 14.0 ms ITL, 0.70s e2e latency
qps = 10: 15.7 ms ITL, 0.79s e2e latency

After this PR (with static activation scaling):

qps = 1: 9.8 ms ITL, 0.49s e2e latency
qps = 2: 9.7 ms ITL, 0.49s e2e latency
qps = 4: 10.2 ms ITL, 0.53s e2e latency
qps = 6: 11.9 ms ITL, 0.59s e2e latency
qps = 8: 11.9 ms ITL, 0.59s e2e latency
qps = 10: 12.1 ms ITL, 0.61s e2e latency
---
 benchmarks/kernels/benchmark_mixtral_moe.py   | 109 +++++++++-----
 ...me=NVIDIA_H100_80GB_HBM3,dtype=float8.json | 140 ++++++++++++++++++
 2 files changed, 211 insertions(+), 38 deletions(-)
 create mode 100644 vllm/model_executor/layers/fused_moe/configs/E=8,N=3584,device_name=NVIDIA_H100_80GB_HBM3,dtype=float8.json

diff --git a/benchmarks/kernels/benchmark_mixtral_moe.py b/benchmarks/kernels/benchmark_mixtral_moe.py
index 8e976fbcb3028..5280b214144c9 100644
--- a/benchmarks/kernels/benchmark_mixtral_moe.py
+++ b/benchmarks/kernels/benchmark_mixtral_moe.py
@@ -1,3 +1,4 @@
+import argparse
 import json
 import os
 import sys
@@ -5,6 +6,7 @@
 import torch
 import torch.nn.functional as F
 import triton
+from tqdm import tqdm
 
 from vllm.model_executor.layers.fused_moe import (fused_moe,
                                                   get_config_file_name)
@@ -12,16 +14,16 @@
 os.environ['CUDA_VISIBLE_DEVICES'] = '0'
 
 
-def main():
+def main(dtype: str):
     method = fused_moe
     for bs in [
             1, 2, 4, 8, 16, 24, 32, 48, 64, 96, 128, 256, 512, 1024, 1536,
             2048, 3072, 4096
     ]:
-        run_grid(bs, method=method)
+        run_grid(bs, method=method, dtype=dtype)
 
 
-def run_grid(bs, method):
+def run_grid(bs, method, dtype: str):
     d_model = 4096
     num_total_experts = 8
     top_k = 2
@@ -34,39 +36,29 @@ def run_grid(bs, method):
     num_trials = 1
 
     configs = []
-    if bs <= 16:
-        BLOCK_SIZES_M = [16]
-    elif bs <= 32:
-        BLOCK_SIZES_M = [16, 32]
-    elif bs <= 64:
-        BLOCK_SIZES_M = [16, 32, 64]
-    elif bs <= 128:
-        BLOCK_SIZES_M = [16, 32, 64, 128]
-    else:
-        BLOCK_SIZES_M = [16, 32, 64, 128, 256]
 
     for block_size_n in [32, 64, 128, 256]:
-        for block_size_m in BLOCK_SIZES_M:
+        for block_size_m in [16, 32, 64, 128, 256]:
             for block_size_k in [64, 128, 256]:
                 for group_size_m in [1, 16, 32, 64]:
                     for num_warps in [4, 8]:
-                        configs.append({
-                            "BLOCK_SIZE_M": block_size_m,
-                            "BLOCK_SIZE_N": block_size_n,
-                            "BLOCK_SIZE_K": block_size_k,
-                            "GROUP_SIZE_M": group_size_m,
-                            "num_warps": num_warps,
-                            "num_stages": 4,
-                        })
+                        for num_stages in [2, 3, 4, 5]:
+                            configs.append({
+                                "BLOCK_SIZE_M": block_size_m,
+                                "BLOCK_SIZE_N": block_size_n,
+                                "BLOCK_SIZE_K": block_size_k,
+                                "GROUP_SIZE_M": group_size_m,
+                                "num_warps": num_warps,
+                                "num_stages": num_stages,
+                            })
 
     best_config = None
     best_time_us = 1e20
 
-    for config in configs:
-        print(f'{tp_size=} {bs=}')
-        print(f'{config}')
+    print(f'{tp_size=} {bs=}')
+
+    for config in tqdm(configs):
         # warmup
-        print('warming up')
         try:
             for _ in range(num_warmup_trials):
                 run_timing(
@@ -79,12 +71,12 @@ def run_grid(bs, method):
                     model_intermediate_size=model_intermediate_size,
                     method=method,
                     config=config,
+                    dtype=dtype,
                 )
         except triton.runtime.autotuner.OutOfResources:
             continue
 
         # trial
-        print('benchmarking')
         for _ in range(num_trials):
             kernel_dur_ms = run_timing(
                 num_calls=num_calls,
@@ -96,6 +88,7 @@ def run_grid(bs, method):
                 model_intermediate_size=model_intermediate_size,
                 method=method,
                 config=config,
+                dtype=dtype,
             )
 
             kernel_dur_us = 1000 * kernel_dur_ms
@@ -105,16 +98,18 @@ def run_grid(bs, method):
                 best_config = config
                 best_time_us = kernel_dur_us
 
-            print(f'{kernel_dur_us=:.1f} {model_dur_ms=:.1f}'
-                  f' {bs=} {tp_size=} {top_k=} {num_total_experts=} '
-                  f'{d_model=} {model_intermediate_size=} {num_layers=}')
+                tqdm.write(
+                    f'{kernel_dur_us=:.1f} {model_dur_ms=:.1f}'
+                    f' {bs=} {tp_size=} {top_k=} {num_total_experts=} '
+                    f'{d_model=} {model_intermediate_size=} {num_layers=}')
 
     print("best_time_us", best_time_us)
     print("best_config", best_config)
 
     # holds Dict[str, Dict[str, int]]
     filename = get_config_file_name(num_total_experts,
-                                    model_intermediate_size // tp_size)
+                                    model_intermediate_size // tp_size,
+                                    "float8" if dtype == "float8" else None)
     print(f"writing config to file {filename}")
     existing_content = {}
     if os.path.exists(filename):
@@ -128,27 +123,48 @@ def run_grid(bs, method):
 
 def run_timing(num_calls: int, bs: int, d_model: int, num_total_experts: int,
                top_k: int, tp_size: int, model_intermediate_size: int, method,
-               config) -> float:
+               config, dtype: str) -> float:
     shard_intermediate_size = model_intermediate_size // tp_size
 
     hidden_states = torch.rand(
         (bs, d_model),
         device="cuda:0",
-        dtype=torch.bfloat16,
+        dtype=torch.float16,
     )
 
-    ws = torch.rand(
+    w1 = torch.rand(
         (num_total_experts, 2 * shard_intermediate_size, d_model),
         device=hidden_states.device,
         dtype=hidden_states.dtype,
     )
 
-    w2s = torch.rand(
+    w2 = torch.rand(
         (num_total_experts, d_model, shard_intermediate_size),
         device=hidden_states.device,
         dtype=hidden_states.dtype,
     )
 
+    w1_scale = None
+    w2_scale = None
+    a1_scale = None
+    a2_scale = None
+
+    if dtype == "float8":
+        w1 = w1.to(torch.float8_e4m3fn)
+        w2 = w2.to(torch.float8_e4m3fn)
+        w1_scale = torch.ones(num_total_experts,
+                              device=hidden_states.device,
+                              dtype=torch.float32)
+        w2_scale = torch.ones(num_total_experts,
+                              device=hidden_states.device,
+                              dtype=torch.float32)
+        a1_scale = torch.ones(1,
+                              device=hidden_states.device,
+                              dtype=torch.float32)
+        a2_scale = torch.ones(1,
+                              device=hidden_states.device,
+                              dtype=torch.float32)
+
     gating_output = F.softmax(torch.rand(
         (num_calls, bs, num_total_experts),
         device=hidden_states.device,
@@ -163,13 +179,18 @@ def run_timing(num_calls: int, bs: int, d_model: int, num_total_experts: int,
     for i in range(num_calls):
         hidden_states = method(
             hidden_states=hidden_states,
-            w1=ws,
-            w2=w2s,
+            w1=w1,
+            w2=w2,
+            w1_scale=w1_scale,
+            w2_scale=w2_scale,
+            a1_scale=a1_scale,
+            a2_scale=a2_scale,
             gating_output=gating_output[i],
             topk=2,
             renormalize=True,
             inplace=True,
             override_config=config,
+            use_fp8=dtype == "float8",
         )
     end_event.record()
     end_event.synchronize()
@@ -179,4 +200,16 @@ def run_timing(num_calls: int, bs: int, d_model: int, num_total_experts: int,
 
 
 if __name__ == "__main__":
-    sys.exit(main())
+    parser = argparse.ArgumentParser(
+        prog='benchmark_mixtral_moe',
+        description='Benchmark and tune the fused_moe kernel',
+    )
+    parser.add_argument(
+        '--dtype',
+        type=str,
+        default='float16',
+        choices=['float8', 'float16'],
+        help='Data type used for fused_moe kernel computations',
+    )
+    args = parser.parse_args()
+    sys.exit(main(args.dtype))
diff --git a/vllm/model_executor/layers/fused_moe/configs/E=8,N=3584,device_name=NVIDIA_H100_80GB_HBM3,dtype=float8.json b/vllm/model_executor/layers/fused_moe/configs/E=8,N=3584,device_name=NVIDIA_H100_80GB_HBM3,dtype=float8.json
new file mode 100644
index 0000000000000..9287808a94d0e
--- /dev/null
+++ b/vllm/model_executor/layers/fused_moe/configs/E=8,N=3584,device_name=NVIDIA_H100_80GB_HBM3,dtype=float8.json
@@ -0,0 +1,140 @@
+{
+    "1": {
+        "BLOCK_SIZE_M": 16,
+        "BLOCK_SIZE_N": 32,
+        "BLOCK_SIZE_K": 64,
+        "GROUP_SIZE_M": 1
+    },
+    "2": {
+        "BLOCK_SIZE_M": 16,
+        "BLOCK_SIZE_N": 32,
+        "BLOCK_SIZE_K": 64,
+        "GROUP_SIZE_M": 1
+    },
+    "4": {
+        "BLOCK_SIZE_M": 16,
+        "BLOCK_SIZE_N": 32,
+        "BLOCK_SIZE_K": 64,
+        "GROUP_SIZE_M": 1
+    },
+    "8": {
+        "BLOCK_SIZE_M": 64,
+        "BLOCK_SIZE_N": 64,
+        "BLOCK_SIZE_K": 256,
+        "GROUP_SIZE_M": 1,
+        "num_warps": 8,
+        "num_stages": 5
+    },
+    "16": {
+        "BLOCK_SIZE_M": 64,
+        "BLOCK_SIZE_N": 128,
+        "BLOCK_SIZE_K": 64,
+        "GROUP_SIZE_M": 16,
+        "num_warps": 4,
+        "num_stages": 5
+    },
+    "24": {
+        "BLOCK_SIZE_M": 128,
+        "BLOCK_SIZE_N": 64,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 1,
+        "num_warps": 8,
+        "num_stages": 5
+    },
+    "32": {
+        "BLOCK_SIZE_M": 64,
+        "BLOCK_SIZE_N": 256,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 16,
+        "num_warps": 8,
+        "num_stages": 4
+    },
+    "48": {
+        "BLOCK_SIZE_M": 64,
+        "BLOCK_SIZE_N": 256,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 64,
+        "num_warps": 8,
+        "num_stages": 3
+    },
+    "64": {
+        "BLOCK_SIZE_M": 64,
+        "BLOCK_SIZE_N": 256,
+        "BLOCK_SIZE_K": 64,
+        "GROUP_SIZE_M": 64,
+        "num_warps": 4,
+        "num_stages": 4
+    },
+    "96": {
+        "BLOCK_SIZE_M": 128,
+        "BLOCK_SIZE_N": 64,
+        "BLOCK_SIZE_K": 256,
+        "GROUP_SIZE_M": 32,
+        "num_warps": 8,
+        "num_stages": 2
+    },
+    "128": {
+        "BLOCK_SIZE_M": 128,
+        "BLOCK_SIZE_N": 64,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 32,
+        "num_warps": 4,
+        "num_stages": 3
+    },
+    "256": {
+        "BLOCK_SIZE_M": 128,
+        "BLOCK_SIZE_N": 32,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 16,
+        "num_warps": 4,
+        "num_stages": 5
+    },
+    "512": {
+        "BLOCK_SIZE_M": 64,
+        "BLOCK_SIZE_N": 128,
+        "BLOCK_SIZE_K": 256,
+        "GROUP_SIZE_M": 64,
+        "num_warps": 4,
+        "num_stages": 2
+    },
+    "1024": {
+        "BLOCK_SIZE_M": 128,
+        "BLOCK_SIZE_N": 256,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 64,
+        "num_warps": 8,
+        "num_stages": 4
+    },
+    "1536": {
+        "BLOCK_SIZE_M": 128,
+        "BLOCK_SIZE_N": 256,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 32,
+        "num_warps": 8,
+        "num_stages": 4
+    },
+    "2048": {
+        "BLOCK_SIZE_M": 128,
+        "BLOCK_SIZE_N": 256,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 64,
+        "num_warps": 8,
+        "num_stages": 4
+    },
+    "3072": {
+        "BLOCK_SIZE_M": 128,
+        "BLOCK_SIZE_N": 256,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 32,
+        "num_warps": 8,
+        "num_stages": 4
+    },
+    "4096": {
+        "BLOCK_SIZE_M": 128,
+        "BLOCK_SIZE_N": 256,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 16,
+        "num_warps": 8,
+        "num_stages": 4
+    }
+}

From c47ba4aaa94d067bbb0437526cae9a33c698c717 Mon Sep 17 00:00:00 2001
From: sasha0552 <admin@sasha0552.org>
Date: Wed, 1 May 2024 19:31:22 +0000
Subject: [PATCH 24/65] [Bugfix] Add validation for seed (#4529)

---
 tests/entrypoints/test_openai_server.py | 20 ++++++++++++++++++++
 vllm/entrypoints/openai/protocol.py     |  8 ++++++--
 2 files changed, 26 insertions(+), 2 deletions(-)

diff --git a/tests/entrypoints/test_openai_server.py b/tests/entrypoints/test_openai_server.py
index a2a98abe7031c..1323dba469117 100644
--- a/tests/entrypoints/test_openai_server.py
+++ b/tests/entrypoints/test_openai_server.py
@@ -13,6 +13,7 @@
 # and debugging.
 import ray
 import requests
+import torch
 # downloading lora to test lora requests
 from huggingface_hub import snapshot_download
 from openai import BadRequestError
@@ -870,5 +871,24 @@ async def test_echo_logprob_completion(server, client: openai.AsyncOpenAI,
         assert len(logprobs.tokens) > 5
 
 
+async def test_long_seed(server, client: openai.AsyncOpenAI):
+    for seed in [
+            torch.iinfo(torch.long).min - 1,
+            torch.iinfo(torch.long).max + 1
+    ]:
+        with pytest.raises(BadRequestError) as exc_info:
+            await client.chat.completions.create(
+                model=MODEL_NAME,
+                messages=[{
+                    "role": "system",
+                    "content": "You are a helpful assistant.",
+                }],
+                temperature=0,
+                seed=seed)
+
+        assert ("greater_than_equal" in exc_info.value.message
+                or "less_than_equal" in exc_info.value.message)
+
+
 if __name__ == "__main__":
     pytest.main([__file__])
diff --git a/vllm/entrypoints/openai/protocol.py b/vllm/entrypoints/openai/protocol.py
index 731596e80bd71..3cd9ddad3b7b7 100644
--- a/vllm/entrypoints/openai/protocol.py
+++ b/vllm/entrypoints/openai/protocol.py
@@ -79,7 +79,9 @@ class ChatCompletionRequest(OpenAIBaseModel):
     n: Optional[int] = 1
     presence_penalty: Optional[float] = 0.0
     response_format: Optional[ResponseFormat] = None
-    seed: Optional[int] = None
+    seed: Optional[int] = Field(None,
+                                ge=torch.iinfo(torch.long).min,
+                                le=torch.iinfo(torch.long).max)
     stop: Optional[Union[str, List[str]]] = Field(default_factory=list)
     stream: Optional[bool] = False
     temperature: Optional[float] = 0.7
@@ -228,7 +230,9 @@ class CompletionRequest(OpenAIBaseModel):
     max_tokens: Optional[int] = 16
     n: int = 1
     presence_penalty: Optional[float] = 0.0
-    seed: Optional[int] = None
+    seed: Optional[int] = Field(None,
+                                ge=torch.iinfo(torch.long).min,
+                                le=torch.iinfo(torch.long).max)
     stop: Optional[Union[str, List[str]]] = Field(default_factory=list)
     stream: Optional[bool] = False
     suffix: Optional[str] = None

From 3a922c1e7ee6753f41c6cc9d6d47d3b2d0110447 Mon Sep 17 00:00:00 2001
From: Roy <jasonailu87@gmail.com>
Date: Thu, 2 May 2024 04:08:14 +0800
Subject: [PATCH 25/65] [Bugfix][Core] Fix and refactor logging stats (#4336)

---
 vllm/engine/async_llm_engine.py | 14 +++++++++-----
 vllm/engine/llm_engine.py       | 12 +++++++-----
 2 files changed, 16 insertions(+), 10 deletions(-)

diff --git a/vllm/engine/async_llm_engine.py b/vllm/engine/async_llm_engine.py
index 4aceb19b50776..5591893d267a2 100644
--- a/vllm/engine/async_llm_engine.py
+++ b/vllm/engine/async_llm_engine.py
@@ -8,6 +8,7 @@
 from transformers import PreTrainedTokenizer
 
 from vllm.config import DecodingConfig, ModelConfig
+from vllm.core.scheduler import SchedulerOutputs
 from vllm.engine.arg_utils import AsyncEngineArgs
 from vllm.engine.llm_engine import LLMEngine
 from vllm.executor.ray_utils import initialize_ray_cluster, ray
@@ -15,7 +16,7 @@
 from vllm.lora.request import LoRARequest
 from vllm.outputs import RequestOutput
 from vllm.sampling_params import SamplingParams
-from vllm.sequence import MultiModalData
+from vllm.sequence import MultiModalData, SamplerOutput
 from vllm.usage.usage_lib import UsageContext
 
 logger = init_logger(__name__)
@@ -224,8 +225,7 @@ async def step_async(self) -> List[RequestOutput]:
             scheduler_outputs.ignored_seq_groups, seq_group_metadata_list)
 
         # Log stats.
-        if self.log_stats:
-            self.stat_logger.log(self._get_stats(scheduler_outputs))
+        self.do_log_stats(scheduler_outputs, output)
 
         return request_outputs
 
@@ -707,9 +707,13 @@ async def get_decoding_config(self) -> DecodingConfig:
         else:
             return self.engine.get_decoding_config()
 
-    async def do_log_stats(self) -> None:
+    async def do_log_stats(
+            self,
+            scheduler_outputs: Optional[SchedulerOutputs] = None,
+            model_output: Optional[List[SamplerOutput]] = None) -> None:
         if self.engine_use_ray:
-            await self.engine.do_log_stats.remote()  # type: ignore
+            await self.engine.do_log_stats.remote(  # type: ignore
+                scheduler_outputs, model_output)
         else:
             self.engine.do_log_stats()
 
diff --git a/vllm/engine/llm_engine.py b/vllm/engine/llm_engine.py
index 4caecb8a51598..19e7143ac2b45 100644
--- a/vllm/engine/llm_engine.py
+++ b/vllm/engine/llm_engine.py
@@ -597,16 +597,18 @@ def step(self) -> List[RequestOutput]:
             scheduler_outputs.ignored_seq_groups, seq_group_metadata_list)
 
         # Log stats.
-        if self.log_stats:
-            self.stat_logger.log(
-                self._get_stats(scheduler_outputs, model_output=output))
+        self.do_log_stats(scheduler_outputs, output)
 
         return request_outputs
 
-    def do_log_stats(self) -> None:
+    def do_log_stats(
+            self,
+            scheduler_outputs: Optional[SchedulerOutputs] = None,
+            model_output: Optional[List[SamplerOutput]] = None) -> None:
         """Forced log when no requests active."""
         if self.log_stats:
-            self.stat_logger.log(self._get_stats(scheduler_outputs=None))
+            self.stat_logger.log(
+                self._get_stats(scheduler_outputs, model_output))
 
     def _get_stats(
             self,

From 6ef09b08f88b675f84b7140238286e5d4c5304c8 Mon Sep 17 00:00:00 2001
From: youkaichao <youkaichao@gmail.com>
Date: Wed, 1 May 2024 15:23:06 -0700
Subject: [PATCH 26/65] [Core][Distributed] fix pynccl del error (#4508)

---
 vllm/distributed/device_communicators/pynccl.py | 12 ++++--------
 1 file changed, 4 insertions(+), 8 deletions(-)

diff --git a/vllm/distributed/device_communicators/pynccl.py b/vllm/distributed/device_communicators/pynccl.py
index 9434867e1b120..f21fcd262d810 100644
--- a/vllm/distributed/device_communicators/pynccl.py
+++ b/vllm/distributed/device_communicators/pynccl.py
@@ -200,6 +200,10 @@ def from_torch(cls, op: ReduceOp) -> int:
     ncclDataType_t, ctypes.c_void_p, ctypes.c_void_p
 ]
 
+# be cautious! this is a collective call, it will block until all
+# processes in the communicator have called this function.
+# because Python object destruction can happen in random order,
+# it is better not to call it at all.
 # equivalent to c declaration:
 # ncclResult_t  ncclCommDestroy(ncclComm_t comm);
 _c_ncclCommDestroy = nccl.ncclCommDestroy
@@ -278,11 +282,3 @@ def all_reduce(self,
                              ncclDataTypeEnum.from_torch(tensor.dtype),
                              ncclRedOpTypeEnum.from_torch(op), self.comm,
                              ctypes.c_void_p(stream.cuda_stream)))
-
-    def __del__(self):
-        # `dist` module might have been already destroyed
-        if hasattr(dist, 'destroy_process_group'):
-            dist.destroy_process_group()
-        # function might have been already destroyed
-        if _c_ncclCommDestroy is not None:
-            _c_ncclCommDestroy(self.comm)

From c9d852d601ce1a02f6748ab62db8694c22772583 Mon Sep 17 00:00:00 2001
From: Philipp Moritz <pcmoritz@gmail.com>
Date: Wed, 1 May 2024 16:30:52 -0700
Subject: [PATCH 27/65] [Misc] Remove Mixtral device="cuda" declarations
 (#4543)

Remove the device="cuda" declarations in mixtral as promised in #4343
---
 vllm/model_executor/models/mixtral.py | 12 ++++--------
 1 file changed, 4 insertions(+), 8 deletions(-)

diff --git a/vllm/model_executor/models/mixtral.py b/vllm/model_executor/models/mixtral.py
index c5dd1a63e2f7a..9ff9ba298588a 100644
--- a/vllm/model_executor/models/mixtral.py
+++ b/vllm/model_executor/models/mixtral.py
@@ -96,13 +96,11 @@ def __init__(
             torch.empty(self.num_total_experts,
                         2 * self.intermediate_size,
                         self.hidden_size,
-                        device="cuda",
                         dtype=self.params_dtype))
         self.w2s = nn.Parameter(
             torch.empty(self.num_total_experts,
                         self.hidden_size,
                         self.intermediate_size,
-                        device="cuda",
                         dtype=self.params_dtype))
 
         set_weight_attrs(self.ws, {
@@ -114,22 +112,20 @@ def __init__(
 
         # Scaling factors for FP8 weights
         self.ws_scale = nn.Parameter(
-            torch.ones(
-                self.num_total_experts, device="cuda", dtype=torch.float32),
+            torch.ones(self.num_total_experts, dtype=torch.float32),
             requires_grad=False) if self.use_fp8 else None
         self.w2s_scale = nn.Parameter(
-            torch.ones(
-                self.num_total_experts, device="cuda", dtype=torch.float32),
+            torch.ones(self.num_total_experts, dtype=torch.float32),
             requires_grad=False) if self.use_fp8 else None
 
         # Scaling factors for FP8 activations
         need_act_scales = (self.use_fp8
                            and quant_config.activation_scheme == "static")
         self.as_scale = nn.Parameter(
-            torch.zeros(1, device="cuda", dtype=torch.float32),
+            torch.zeros(1, dtype=torch.float32),
             requires_grad=False) if need_act_scales else None
         self.a2s_scale = nn.Parameter(
-            torch.zeros(1, device="cuda", dtype=torch.float32),
+            torch.zeros(1, dtype=torch.float32),
             requires_grad=False) if need_act_scales else None
 
         if need_act_scales:

From 826b82a260ebb1ea7edd04a3278d5fb9b103a76e Mon Sep 17 00:00:00 2001
From: Woosuk Kwon <woosuk.kwon@berkeley.edu>
Date: Wed, 1 May 2024 16:47:59 -0700
Subject: [PATCH 28/65] [Misc] Fix expert_ids shape in MoE (#4517)

---
 vllm/model_executor/layers/fused_moe/fused_moe.py | 11 ++++++-----
 1 file changed, 6 insertions(+), 5 deletions(-)

diff --git a/vllm/model_executor/layers/fused_moe/fused_moe.py b/vllm/model_executor/layers/fused_moe/fused_moe.py
index b4f81527141a8..3cb0419404625 100644
--- a/vllm/model_executor/layers/fused_moe/fused_moe.py
+++ b/vllm/model_executor/layers/fused_moe/fused_moe.py
@@ -203,14 +203,15 @@ def moe_align_block_size(
     - The padding ensures that the total number of tokens is now divisible
         by block_size for proper block matrix operations.
     """
-    sorted_ids = torch.empty(
-        (topk_ids.numel() + num_experts * (block_size - 1), ),
-        dtype=torch.int32,
-        device=topk_ids.device)
-    expert_ids = torch.empty((topk_ids.numel() + num_experts, ),
+    max_num_tokens_padded = topk_ids.numel() + num_experts * (block_size - 1)
+    sorted_ids = torch.empty((max_num_tokens_padded, ),
                              dtype=torch.int32,
                              device=topk_ids.device)
     sorted_ids.fill_(topk_ids.numel())
+    max_num_m_blocks = triton.cdiv(max_num_tokens_padded, block_size)
+    expert_ids = torch.empty((max_num_m_blocks, ),
+                             dtype=torch.int32,
+                             device=topk_ids.device)
     num_tokens_post_pad = torch.empty((1),
                                       dtype=torch.int32,
                                       device=topk_ids.device)

From b8afa8b95a4eee008a9b72440620113e5bfbe962 Mon Sep 17 00:00:00 2001
From: Danny Guinther <dguinther@neuralmagic.com>
Date: Wed, 1 May 2024 20:34:40 -0400
Subject: [PATCH 29/65] [MISC] Rework logger to enable pythonic custom logging
 configuration to be provided (#4273)

---
 examples/logging_configuration.md | 178 ++++++++++++++++++++++++++++
 tests/test_logger.py              | 189 +++++++++++++++++++++++++++++-
 vllm/logger.py                    | 112 ++++++++++--------
 vllm/logging/__init__.py          |   5 +
 vllm/logging/formatter.py         |  15 +++
 5 files changed, 451 insertions(+), 48 deletions(-)
 create mode 100644 examples/logging_configuration.md
 create mode 100644 vllm/logging/__init__.py
 create mode 100644 vllm/logging/formatter.py

diff --git a/examples/logging_configuration.md b/examples/logging_configuration.md
new file mode 100644
index 0000000000000..75b4b31a80462
--- /dev/null
+++ b/examples/logging_configuration.md
@@ -0,0 +1,178 @@
+# Logging Configuration
+
+vLLM leverages Python's `logging.config.dictConfig` functionality to enable
+robust and flexible configuration of the various loggers used by vLLM.
+
+vLLM offers two environment variables that can be used to accommodate a range
+of logging configurations that range from simple-and-inflexible to
+more-complex-and-more-flexible.
+
+- No vLLM logging (simple and inflexible)
+  - Set `VLLM_CONFIGURE_LOGGING=0` (leaving `VLLM_LOGGING_CONFIG_PATH` unset)
+- vLLM's default logging configuration (simple and inflexible)
+  - Leave `VLLM_CONFIGURE_LOGGING` unset or set `VLLM_CONFIGURE_LOGGING=1`
+- Fine-grained custom logging configuration (more complex, more flexible)
+  - Leave `VLLM_CONFIGURE_LOGGING` unset or set `VLLM_CONFIGURE_LOGGING=1` and
+    set `VLLM_LOGGING_CONFIG_PATH=<path-to-logging-config.json>`
+
+
+## Logging Configuration Environment Variables
+
+### `VLLM_CONFIGURE_LOGGING`
+
+`VLLM_CONFIGURE_LOGGING` controls whether or not vLLM takes any action to
+configure the loggers used by vLLM. This functionality is enabled by default,
+but can be disabled by setting `VLLM_CONFIGURE_LOGGING=0` when running vLLM.
+
+If `VLLM_CONFIGURE_LOGGING` is enabled and no value is given for
+`VLLM_LOGGING_CONFIG_PATH`, vLLM will use built-in default configuration to
+configure the root vLLM logger. By default, no other vLLM loggers are
+configured and, as such, all vLLM loggers defer to the root vLLM logger to make
+all logging decisions.
+
+If `VLLM_CONFIGURE_LOGGING` is disabled and a value is given for
+`VLLM_LOGGING_CONFIG_PATH`, an error will occur while starting vLLM.
+
+### `VLLM_LOGGING_CONFIG_PATH`
+
+`VLLM_LOGGING_CONFIG_PATH` allows users to specify a path to a JSON file of
+alternative, custom logging configuration that will be used instead of vLLM's
+built-in default logging configuration. The logging configuration should be
+provided in JSON format following the schema specified by Python's [logging
+configuration dictionary
+schema](https://docs.python.org/3/library/logging.config.html#dictionary-schema-details).
+
+If `VLLM_LOGGING_CONFIG_PATH` is specified, but `VLLM_CONFIGURE_LOGGING` is
+disabled, an error will occur while starting vLLM.
+
+
+## Examples
+
+### Example 1: Customize vLLM root logger
+
+For this example, we will customize the vLLM root logger to use
+[`python-json-logger`](https://github.com/madzak/python-json-logger) to log to
+STDOUT of the console in JSON format with a log level of `INFO`.
+
+To begin, first, create an appropriate JSON logging configuration file:
+
+**/path/to/logging_config.json:**
+
+```json
+{
+  "formatters": {
+    "json": {
+      "class": "pythonjsonlogger.jsonlogger.JsonFormatter"
+    }
+  },
+  "handlers": {
+    "console": {
+      "class" : "logging.StreamHandler",
+      "formatter": "json",
+      "level": "INFO",
+      "stream": "ext://sys.stdout"
+    }
+  },
+  "loggers": {
+    "vllm": {
+      "handlers": ["console"],
+      "level": "INFO",
+      "propagate": false
+    }
+  },
+  "version": 1
+}
+```
+
+Next, install the `python-json-logger` package if it's not already installed:
+
+```bash
+pip install python-json-logger
+```
+
+Finally, run vLLM with the `VLLM_LOGGING_CONFIG_PATH` environment variable set
+to the path of the custom logging configuration JSON file:
+
+```bash
+VLLM_LOGGING_CONFIG_PATH=/path/to/logging_config.json \
+    python3 -m vllm.entrypoints.openai.api_server \
+    --max-model-len 2048 \
+    --model mistralai/Mistral-7B-v0.1
+```
+
+
+### Example 2: Silence a particular vLLM logger
+
+To silence a particular vLLM logger, it is necessary to provide custom logging
+configuration for the target logger that configures the logger so that it won't
+propagate its log messages to the root vLLM logger.
+
+When custom configuration is provided for any logger, it is also necessary to
+provide configuration for the root vLLM logger since any custom logger
+configuration overrides the built-in default logging configuration used by vLLM.
+
+First, create an appropriate JSON logging configuration file that includes
+configuration for the root vLLM logger and for the logger you wish to silence:
+
+**/path/to/logging_config.json:**
+
+```json
+{
+  "formatters": {
+    "vllm": {
+      "class": "vllm.logging.NewLineFormatter",
+      "datefmt": "%m-%d %H:%M:%S",
+      "format": "%(levelname)s %(asctime)s %(filename)s:%(lineno)d] %(message)s"
+    }
+  },
+  "handlers": {
+    "vllm": {
+      "class" : "logging.StreamHandler",
+      "formatter": "vllm",
+      "level": "INFO",
+      "stream": "ext://sys.stdout"
+    }
+  },
+  "loggers": {
+    "vllm": {
+      "handlers": ["vllm"],
+      "level": "DEBUG",
+      "propagate": false
+    },
+    "vllm.example_noisy_logger": {
+      "propagate": false
+    }
+  },
+  "version": 1
+}
+```
+
+Finally, run vLLM with the `VLLM_LOGGING_CONFIG_PATH` environment variable set
+to the path of the custom logging configuration JSON file:
+
+```bash
+VLLM_LOGGING_CONFIG_PATH=/path/to/logging_config.json \
+    python3 -m vllm.entrypoints.openai.api_server \
+    --max-model-len 2048 \
+    --model mistralai/Mistral-7B-v0.1
+```
+
+
+### Example 3: Disable vLLM default logging configuration
+
+To disable vLLM's default logging configuration and silence all vLLM loggers,
+simply set `VLLM_CONFIGURE_LOGGING=0` when running vLLM. This will prevent vLLM
+from configuring the root vLLM logger, which in turn, silences all other vLLM
+loggers.
+
+```bash
+VLLM_CONFIGURE_LOGGING=0 \
+    python3 -m vllm.entrypoints.openai.api_server \
+    --max-model-len 2048 \
+    --model mistralai/Mistral-7B-v0.1
+```
+
+
+## Additional resources
+
+- [`logging.config` Dictionary Schema Details](https://docs.python.org/3/library/logging.config.html#dictionary-schema-details)
diff --git a/tests/test_logger.py b/tests/test_logger.py
index 601f72b50811c..74f1125fb37c9 100644
--- a/tests/test_logger.py
+++ b/tests/test_logger.py
@@ -1,8 +1,19 @@
+import json
+import logging
 import os
 import sys
 import tempfile
+from json.decoder import JSONDecodeError
+from tempfile import NamedTemporaryFile
+from typing import Any
+from unittest.mock import patch
+from uuid import uuid4
 
-from vllm.logger import enable_trace_function_call
+import pytest
+
+from vllm.logger import (_DATE_FORMAT, _FORMAT, _configure_vllm_root_logger,
+                         enable_trace_function_call, init_logger)
+from vllm.logging import NewLineFormatter
 
 
 def f1(x):
@@ -25,3 +36,179 @@ def test_trace_function_call():
     assert "f2" in content
     sys.settrace(None)
     os.remove(path)
+
+
+def test_default_vllm_root_logger_configuration():
+    """This test presumes that VLLM_CONFIGURE_LOGGING (default: True) and
+    VLLM_LOGGING_CONFIG_PATH (default: None) are not configured and default
+    behavior is activated."""
+    logger = logging.getLogger("vllm")
+    assert logger.level == logging.DEBUG
+    assert not logger.propagate
+
+    handler = logger.handlers[0]
+    assert handler.stream == sys.stdout
+    assert handler.level == logging.INFO
+
+    formatter = handler.formatter
+    assert formatter is not None
+    assert isinstance(formatter, NewLineFormatter)
+    assert formatter._fmt == _FORMAT
+    assert formatter.datefmt == _DATE_FORMAT
+
+
+@patch("vllm.logger.VLLM_CONFIGURE_LOGGING", 1)
+@patch("vllm.logger.VLLM_LOGGING_CONFIG_PATH", None)
+def test_descendent_loggers_depend_on_and_propagate_logs_to_root_logger():
+    """This test presumes that VLLM_CONFIGURE_LOGGING (default: True) and
+    VLLM_LOGGING_CONFIG_PATH (default: None) are not configured and default
+    behavior is activated."""
+    root_logger = logging.getLogger("vllm")
+    root_handler = root_logger.handlers[0]
+
+    unique_name = f"vllm.{uuid4()}"
+    logger = init_logger(unique_name)
+    assert logger.name == unique_name
+    assert logger.level == logging.NOTSET
+    assert not logger.handlers
+    assert logger.propagate
+
+    message = "Hello, world!"
+    with patch.object(root_handler, "emit") as root_handle_mock:
+        logger.info(message)
+
+    root_handle_mock.assert_called_once()
+    _, call_args, _ = root_handle_mock.mock_calls[0]
+    log_record = call_args[0]
+    assert unique_name == log_record.name
+    assert message == log_record.msg
+    assert message == log_record.getMessage()
+    assert log_record.levelno == logging.INFO
+
+
+@patch("vllm.logger.VLLM_CONFIGURE_LOGGING", 0)
+@patch("vllm.logger.VLLM_LOGGING_CONFIG_PATH", None)
+def test_logger_configuring_can_be_disabled():
+    """This test calls _configure_vllm_root_logger again with logging
+    configuration disabled; dictConfig is mocked under the name that
+    vllm.logger imported so that any call to it would be detected."""
+
+    with patch("vllm.logger.dictConfig") as dict_config_mock:
+        _configure_vllm_root_logger()
+    dict_config_mock.assert_not_called()
+
+
+@patch("vllm.logger.VLLM_CONFIGURE_LOGGING", 1)
+@patch(
+    "vllm.logger.VLLM_LOGGING_CONFIG_PATH",
+    "/if/there/is/a/file/here/then/you/did/this/to/yourself.json",
+)
+def test_an_error_is_raised_when_custom_logging_config_file_does_not_exist():
+    """This test calls _configure_vllm_root_logger again to test custom logging
+    config behavior, however it fails before any change in behavior or
+    configuration occurs."""
+    with pytest.raises(RuntimeError) as ex_info:
+        _configure_vllm_root_logger()
+    assert ex_info.type == RuntimeError
+    assert "File does not exist" in str(ex_info)
+
+
+@patch("vllm.logger.VLLM_CONFIGURE_LOGGING", 1)
+def test_an_error_is_raised_when_custom_logging_config_is_invalid_json():
+    """This test calls _configure_vllm_root_logger again to test custom logging
+    config behavior, however it fails before any change in behavior or
+    configuration occurs."""
+    with NamedTemporaryFile(encoding="utf-8", mode="w") as logging_config_file:
+        logging_config_file.write("---\nloggers: []\nversion: 1")
+        logging_config_file.flush()
+        with patch("vllm.logger.VLLM_LOGGING_CONFIG_PATH",
+                   logging_config_file.name):
+            with pytest.raises(JSONDecodeError) as ex_info:
+                _configure_vllm_root_logger()
+            assert ex_info.type == JSONDecodeError
+            assert "Expecting value" in str(ex_info)
+
+
+@patch("vllm.logger.VLLM_CONFIGURE_LOGGING", 1)
+@pytest.mark.parametrize("unexpected_config", (
+    "Invalid string",
+    [{
+        "version": 1,
+        "loggers": []
+    }],
+    0,
+))
+def test_an_error_is_raised_when_custom_logging_config_is_unexpected_json(
+        unexpected_config: Any):
+    """This test calls _configure_vllm_root_logger again to test custom logging
+    config behavior, however it fails before any change in behavior or
+    configuration occurs."""
+    with NamedTemporaryFile(encoding="utf-8", mode="w") as logging_config_file:
+        logging_config_file.write(json.dumps(unexpected_config))
+        logging_config_file.flush()
+        with patch("vllm.logger.VLLM_LOGGING_CONFIG_PATH",
+                   logging_config_file.name):
+            with pytest.raises(ValueError) as ex_info:
+                _configure_vllm_root_logger()
+            assert ex_info.type == ValueError
+            assert "Invalid logging config. Expected Dict, got" in str(ex_info)
+
+
+@patch("vllm.logger.VLLM_CONFIGURE_LOGGING", 1)
+def test_custom_logging_config_is_parsed_and_used_when_provided():
+    """This test calls _configure_vllm_root_logger again to test custom logging
+    config behavior; dictConfig is mocked under the name vllm.logger imported,
+    so no changes in behavior or configuration occur."""
+    valid_logging_config = {
+        "loggers": {
+            "vllm.test_logger.logger": {
+                "handlers": [],
+                "propagate": False,
+            }
+        },
+        "version": 1
+    }
+    with NamedTemporaryFile(encoding="utf-8", mode="w") as logging_config_file:
+        logging_config_file.write(json.dumps(valid_logging_config))
+        logging_config_file.flush()
+        with patch("vllm.logger.VLLM_LOGGING_CONFIG_PATH",
+                   logging_config_file.name), patch(
+                       "vllm.logger.dictConfig") as dict_config_mock:
+            _configure_vllm_root_logger()
+            dict_config_mock.assert_called_with(valid_logging_config)
+
+
+@patch("vllm.logger.VLLM_CONFIGURE_LOGGING", 0)
+def test_custom_logging_config_causes_an_error_if_configure_logging_is_off():
+    """This test calls _configure_vllm_root_logger again to test custom logging
+    config behavior, however mocks are used to ensure no changes in behavior or
+    configuration occur."""
+    valid_logging_config = {
+        "loggers": {
+            "vllm.test_logger.logger": {
+                "handlers": [],
+            }
+        },
+        "version": 1
+    }
+    with NamedTemporaryFile(encoding="utf-8", mode="w") as logging_config_file:
+        logging_config_file.write(json.dumps(valid_logging_config))
+        logging_config_file.flush()
+        with patch("vllm.logger.VLLM_LOGGING_CONFIG_PATH",
+                   logging_config_file.name):
+            with pytest.raises(RuntimeError) as ex_info:
+                _configure_vllm_root_logger()
+            assert ex_info.type is RuntimeError
+            expected_message_snippet = (
+                "VLLM_CONFIGURE_LOGGING evaluated to false, but "
+                "VLLM_LOGGING_CONFIG_PATH was given.")
+            assert expected_message_snippet in str(ex_info)
+
+        # Remember! The root logger is assumed to have been configured as
+        # though VLLM_CONFIGURE_LOGGING=1 and VLLM_LOGGING_CONFIG_PATH=None.
+        root_logger = logging.getLogger("vllm")
+        other_logger_name = f"vllm.test_logger.{uuid4()}"
+        other_logger = init_logger(other_logger_name)
+        assert other_logger.handlers != root_logger.handlers
+        assert other_logger.level != root_logger.level
+        assert other_logger.propagate
diff --git a/vllm/logger.py b/vllm/logger.py
index 3928e5367d1e6..40c29da2b70ce 100644
--- a/vllm/logger.py
+++ b/vllm/logger.py
@@ -1,73 +1,91 @@
-# Adapted from
-# https://github.com/skypilot-org/skypilot/blob/86dc0f6283a335e4aa37b3c10716f90999f48ab6/sky/sky_logging.py
 """Logging configuration for vLLM."""
 import datetime
+import json
 import logging
 import os
 import sys
 from functools import partial
-from typing import Optional
+from logging import Logger
+from logging.config import dictConfig
+from os import path
+from typing import Dict, Optional
 
 VLLM_CONFIGURE_LOGGING = int(os.getenv("VLLM_CONFIGURE_LOGGING", "1"))
+VLLM_LOGGING_CONFIG_PATH = os.getenv("VLLM_LOGGING_CONFIG_PATH")
 
 _FORMAT = "%(levelname)s %(asctime)s %(filename)s:%(lineno)d] %(message)s"
 _DATE_FORMAT = "%m-%d %H:%M:%S"
 
+DEFAULT_LOGGING_CONFIG = {  # handed to logging.config.dictConfig
+    "formatters": {
+        "vllm": {
+            "class": "vllm.logging.NewLineFormatter",
+            "datefmt": _DATE_FORMAT,
+            "format": _FORMAT,
+        },
+    },
+    "handlers": {
+        "vllm": {
+            "class": "logging.StreamHandler",
+            "formatter": "vllm",
+            "level": "INFO",  # handler threshold; the logger below is DEBUG
+            "stream": "ext://sys.stdout",
+        },
+    },
+    "loggers": {
+        "vllm": {
+            "handlers": ["vllm"],
+            "level": "DEBUG",
+            "propagate": False,  # do not pass records on to parent loggers
+        },
+    },
+    "version": 1,
+}
+
+
+def _configure_vllm_root_logger() -> None:
+    """Apply the default or the user-supplied logging config via dictConfig."""
+    logging_config: Optional[Dict] = None
+    if not VLLM_CONFIGURE_LOGGING and VLLM_LOGGING_CONFIG_PATH:
+        raise RuntimeError(
+            "VLLM_CONFIGURE_LOGGING evaluated to false, but "
+            "VLLM_LOGGING_CONFIG_PATH was given. VLLM_LOGGING_CONFIG_PATH "
+            "implies VLLM_CONFIGURE_LOGGING. Please enable "
+            "VLLM_CONFIGURE_LOGGING or unset VLLM_LOGGING_CONFIG_PATH.")
 
-class NewLineFormatter(logging.Formatter):
-    """Adds logging prefix to newlines to align multi-line messages."""
+    if VLLM_CONFIGURE_LOGGING:
+        logging_config = DEFAULT_LOGGING_CONFIG
 
-    def __init__(self, fmt, datefmt=None):
-        logging.Formatter.__init__(self, fmt, datefmt)
+    if VLLM_LOGGING_CONFIG_PATH:
+        if not path.exists(VLLM_LOGGING_CONFIG_PATH):
+            raise RuntimeError(
+                "Could not load logging config. File does not exist: "
+                f"{VLLM_LOGGING_CONFIG_PATH}")
+        with open(VLLM_LOGGING_CONFIG_PATH, encoding="utf-8",
+                  mode="r") as file:
+            custom_config = json.loads(file.read())
 
-    def format(self, record):
-        msg = logging.Formatter.format(self, record)
-        if record.message != "":
-            parts = msg.split(record.message)
-            msg = msg.replace("\n", "\r\n" + parts[0])
-        return msg
+        if not isinstance(custom_config, dict):
+            raise ValueError("Invalid logging config. Expected Dict, got "
+                             f"{type(custom_config).__name__}.")
+        logging_config = custom_config  # replaces the default outright
 
+    if logging_config:
+        dictConfig(logging_config)
 
-_root_logger = logging.getLogger("vllm")
-_default_handler: Optional[logging.Handler] = None
 
+def init_logger(name: str) -> Logger:
+    """Return ``logging.getLogger(name)``.  Fetching loggers through this
+    helper guarantees the module was imported first, and importing it runs
+    ``_configure_vllm_root_logger()`` to set up the root vllm logger."""
 
-def _setup_logger():
-    _root_logger.setLevel(logging.DEBUG)
-    global _default_handler
-    if _default_handler is None:
-        _default_handler = logging.StreamHandler(sys.stdout)
-        _default_handler.flush = sys.stdout.flush  # type: ignore
-        _default_handler.setLevel(logging.INFO)
-        _root_logger.addHandler(_default_handler)
-    fmt = NewLineFormatter(_FORMAT, datefmt=_DATE_FORMAT)
-    _default_handler.setFormatter(fmt)
-    # Setting this will avoid the message
-    # being propagated to the parent logger.
-    _root_logger.propagate = False
+    return logging.getLogger(name)
 
 
-# The logger is initialized when the module is imported.
+# The root logger is initialized when the module is imported.
 # This is thread-safe as the module is only imported once,
 # guaranteed by the Python GIL.
-if VLLM_CONFIGURE_LOGGING:
-    _setup_logger()
-
-
-def init_logger(name: str):
-    # Use the same settings as above for root logger
-    logger = logging.getLogger(name)
-    logger.setLevel(os.getenv("LOG_LEVEL", "DEBUG"))
-
-    if VLLM_CONFIGURE_LOGGING:
-        if _default_handler is None:
-            raise ValueError(
-                "_default_handler is not set up. This should never happen!"
-                " Please open an issue on Github.")
-        logger.addHandler(_default_handler)
-        logger.propagate = False
-    return logger
-
+_configure_vllm_root_logger()
 
 logger = init_logger(__name__)
 
diff --git a/vllm/logging/__init__.py b/vllm/logging/__init__.py
new file mode 100644
index 0000000000000..b9aec380776f3
--- /dev/null
+++ b/vllm/logging/__init__.py
@@ -0,0 +1,5 @@
+from vllm.logging.formatter import NewLineFormatter
+
+__all__ = [
+    "NewLineFormatter",
+]
diff --git a/vllm/logging/formatter.py b/vllm/logging/formatter.py
new file mode 100644
index 0000000000000..b24b4e11d1fcb
--- /dev/null
+++ b/vllm/logging/formatter.py
@@ -0,0 +1,15 @@
+import logging
+
+
+class NewLineFormatter(logging.Formatter):
+    """Formatter that repeats the log prefix on every line of a message."""
+
+    def __init__(self, fmt, datefmt=None, style="%"):
+        super().__init__(fmt, datefmt, style)
+
+    def format(self, record):
+        formatted = super().format(record)
+        if record.message == "":
+            return formatted
+        prefix = formatted.split(record.message)[0]
+        return formatted.replace("\n", "\r\n" + prefix)

From 0d62fe58dbb58cfe4132005ce7ff37319d66981d Mon Sep 17 00:00:00 2001
From: SangBin Cho <rkooo567@gmail.com>
Date: Thu, 2 May 2024 11:24:13 +0900
Subject: [PATCH 30/65] [Bug fix][Core] assert num_new_tokens == 1 fails when
 SamplingParams.n is not 1 and max_tokens is large & Add tests for preemption
 (#4451)

---
 .buildkite/test-pipeline.yaml                 |   1 +
 .../basic_correctness/test_chunked_prefill.py |   1 -
 tests/basic_correctness/test_preemption.py    | 138 ++++++++++++++++++
 tests/conftest.py                             |   3 +-
 tests/spec_decode/test_spec_decode_worker.py  |   6 +-
 vllm/core/scheduler.py                        |  36 ++++-
 6 files changed, 172 insertions(+), 13 deletions(-)
 create mode 100644 tests/basic_correctness/test_preemption.py

diff --git a/.buildkite/test-pipeline.yaml b/.buildkite/test-pipeline.yaml
index 11cda053260ec..641f366d06031 100644
--- a/.buildkite/test-pipeline.yaml
+++ b/.buildkite/test-pipeline.yaml
@@ -17,6 +17,7 @@ steps:
   - VLLM_ATTENTION_BACKEND=FLASH_ATTN pytest -v -s basic_correctness/test_basic_correctness.py
   - VLLM_ATTENTION_BACKEND=XFORMERS pytest -v -s basic_correctness/test_chunked_prefill.py
   - VLLM_ATTENTION_BACKEND=FLASH_ATTN pytest -v -s basic_correctness/test_chunked_prefill.py
+  - VLLM_TEST_ENABLE_ARTIFICIAL_PREEMPT=1 pytest -v -s basic_correctness/test_preemption.py
 
 - label: Core Test
   command: pytest -v -s core
diff --git a/tests/basic_correctness/test_chunked_prefill.py b/tests/basic_correctness/test_chunked_prefill.py
index d83416eb51b43..47d582c726c66 100644
--- a/tests/basic_correctness/test_chunked_prefill.py
+++ b/tests/basic_correctness/test_chunked_prefill.py
@@ -55,7 +55,6 @@ def test_models(
     )
     vllm_outputs = vllm_model.generate_greedy(example_prompts, max_tokens)
     del vllm_model
-    print(vllm_outputs[0])
 
     for i in range(len(example_prompts)):
         hf_output_ids, hf_output_str = hf_outputs[i]
diff --git a/tests/basic_correctness/test_preemption.py b/tests/basic_correctness/test_preemption.py
new file mode 100644
index 0000000000000..1adfc7dddd6fa
--- /dev/null
+++ b/tests/basic_correctness/test_preemption.py
@@ -0,0 +1,138 @@
+"""Compare the short outputs of HF and vLLM when using greedy sampling.
+
+VLLM_TEST_ENABLE_ARTIFICIAL_PREEMPT=1 has to be set before running this test.
+
+Run `VLLM_TEST_ENABLE_ARTIFICIAL_PREEMPT=1
+pytest tests/basic_correctness/test_preemption.py`.
+"""
+import pytest
+
+from vllm.core.scheduler import (ARTIFICIAL_PREEMPTION_MAX_CNT,
+                                 ENABLE_ARTIFICIAL_PREEMPT)
+
+MODELS = [
+    "facebook/opt-125m",
+]
+
+assert ENABLE_ARTIFICIAL_PREEMPT is True, (
+    "Use an env var VLLM_TEST_ENABLE_ARTIFICIAL_PREEMPT=1. "
+    "`VLLM_TEST_ENABLE_ARTIFICIAL_PREEMPT=1 pytest "
+    "tests/basic_correctness/test_preemption.py`")
+
+
+@pytest.mark.parametrize("model", MODELS)
+@pytest.mark.parametrize("dtype", ["half"])
+@pytest.mark.parametrize("max_tokens", [96])
+@pytest.mark.parametrize("chunked_prefill_token_size", [16])
+def test_chunked_prefill_recompute(
+    hf_runner,
+    vllm_runner,
+    example_prompts,
+    model: str,
+    dtype: str,
+    max_tokens: int,
+    chunked_prefill_token_size: int,
+) -> None:
+    """Check chunked-prefill output matches HF under injected preemption."""
+    max_num_seqs = min(chunked_prefill_token_size, 256)
+    enable_chunked_prefill = False
+    max_num_batched_tokens = None
+    if chunked_prefill_token_size != -1:  # -1 would leave chunking off
+        enable_chunked_prefill = True
+        max_num_batched_tokens = chunked_prefill_token_size
+
+    hf_model = hf_runner(model, dtype=dtype)
+    hf_outputs = hf_model.generate_greedy(example_prompts, max_tokens)
+    del hf_model
+
+    vllm_model = vllm_runner(
+        model,
+        dtype=dtype,
+        max_num_batched_tokens=max_num_batched_tokens,
+        enable_chunked_prefill=enable_chunked_prefill,
+        max_num_seqs=max_num_seqs,
+    )
+    vllm_outputs = vllm_model.generate_greedy(example_prompts, max_tokens)
+    assert (vllm_model.model.llm_engine.scheduler.artificial_preempt_cnt <
+            ARTIFICIAL_PREEMPTION_MAX_CNT)  # i.e. the counter was decremented
+    del vllm_model
+
+    for i in range(len(example_prompts)):
+        hf_output_ids, hf_output_str = hf_outputs[i]
+        vllm_output_ids, vllm_output_str = vllm_outputs[i]
+        assert hf_output_str == vllm_output_str, (
+            f"Test{i}:\nHF: {hf_output_str!r}\nvLLM: {vllm_output_str!r}")
+        assert hf_output_ids == vllm_output_ids, (
+            f"Test{i}:\nHF: {hf_output_ids}\nvLLM: {vllm_output_ids}")
+
+
+@pytest.mark.parametrize("model", MODELS)
+@pytest.mark.parametrize("dtype", ["float"])
+@pytest.mark.parametrize("max_tokens", [96])
+def test_preemption(
+    hf_runner,
+    vllm_runner,
+    example_prompts,
+    model: str,
+    dtype: str,
+    max_tokens: int,
+) -> None:
+    """Check outputs match HF under the default (recompute) preemption."""
+
+    hf_model = hf_runner(model, dtype=dtype)
+    hf_outputs = hf_model.generate_greedy(example_prompts, max_tokens)
+    del hf_model
+
+    vllm_model = vllm_runner(
+        model,
+        dtype=dtype,
+    )
+    vllm_outputs = vllm_model.generate_greedy(example_prompts, max_tokens)
+    assert (vllm_model.model.llm_engine.scheduler.artificial_preempt_cnt <
+            ARTIFICIAL_PREEMPTION_MAX_CNT)  # i.e. the counter was decremented
+    del vllm_model
+
+    for i in range(len(example_prompts)):
+        hf_output_ids, hf_output_str = hf_outputs[i]
+        vllm_output_ids, vllm_output_str = vllm_outputs[i]
+        assert hf_output_str == vllm_output_str, (
+            f"Test{i}:\nHF: {hf_output_str!r}\nvLLM: {vllm_output_str!r}")
+        assert hf_output_ids == vllm_output_ids, (
+            f"Test{i}:\nHF: {hf_output_ids}\nvLLM: {vllm_output_ids}")
+
+
+@pytest.mark.parametrize("model", MODELS)
+@pytest.mark.parametrize("dtype", ["float"])
+@pytest.mark.parametrize("max_tokens", [96])
+@pytest.mark.parametrize("beam_width", [4])
+def test_swap(
+    hf_runner,
+    vllm_runner,
+    example_prompts,
+    model: str,
+    dtype: str,
+    max_tokens: int,
+    beam_width: int,
+) -> None:
+    """Using beam search enables swapping; compare beam token ids with HF."""
+    example_prompts = example_prompts[:1]  # only the first prompt is used
+    hf_model = hf_runner(model, dtype=dtype)
+    hf_outputs = hf_model.generate_beam_search(example_prompts, beam_width,
+                                               max_tokens)
+    del hf_model
+
+    vllm_model = vllm_runner(model, dtype=dtype, swap_space=10)
+    vllm_outputs = vllm_model.generate_beam_search(example_prompts, beam_width,
+                                                   max_tokens)
+    assert (vllm_model.model.llm_engine.scheduler.artificial_preempt_cnt <
+            ARTIFICIAL_PREEMPTION_MAX_CNT)  # i.e. the counter was decremented
+    del vllm_model
+
+    for i in range(len(example_prompts)):
+        hf_output_ids, _ = hf_outputs[i]
+        vllm_output_ids, _ = vllm_outputs[i]
+        assert len(hf_output_ids) == len(vllm_output_ids)
+        for j in range(len(hf_output_ids)):
+            assert hf_output_ids[j] == vllm_output_ids[j], (
+                f"Test{i} output{j}:\nHF: {hf_output_ids}\n"
+                f"vLLM: {vllm_output_ids}")
diff --git a/tests/conftest.py b/tests/conftest.py
index 5c50fc2d1bab6..671326915b22b 100644
--- a/tests/conftest.py
+++ b/tests/conftest.py
@@ -296,6 +296,7 @@ def __init__(
         tensor_parallel_size: int = 1,
         block_size: int = 16,
         enable_chunked_prefill: bool = False,
+        swap_space=4,
         **kwargs,
     ) -> None:
         self.model = LLM(
@@ -303,7 +304,7 @@ def __init__(
             tokenizer=tokenizer_name,
             trust_remote_code=True,
             dtype=dtype,
-            swap_space=0,
+            swap_space=swap_space,
             disable_log_stats=disable_log_stats,
             tensor_parallel_size=tensor_parallel_size,
             max_model_len=max_model_len,
diff --git a/tests/spec_decode/test_spec_decode_worker.py b/tests/spec_decode/test_spec_decode_worker.py
index d24d726c9c0cf..91315df9b5e60 100644
--- a/tests/spec_decode/test_spec_decode_worker.py
+++ b/tests/spec_decode/test_spec_decode_worker.py
@@ -33,7 +33,7 @@ def test_correctly_calls_draft_model(k: int, batch_size: int):
     worker = SpecDecodeWorker(draft_worker, target_worker, rejection_sampler,
                               metrics_collector)
 
-    exception_secret = 'artifical stop'
+    exception_secret = 'artificial stop'
     draft_worker.get_spec_proposals.side_effect = ValueError(exception_secret)
 
     execute_model_data, _, _ = create_batch(batch_size, k)
@@ -101,7 +101,7 @@ def test_correctly_calls_target_model(k: int, batch_size: int):
         proposal_probs=proposal_probs,
         proposal_lens=proposal_lens)
 
-    exception_secret = 'artifical stop'
+    exception_secret = 'artificial stop'
     target_worker.execute_model.side_effect = ValueError(exception_secret)
 
     with pytest.raises(ValueError, match=exception_secret):
@@ -197,7 +197,7 @@ def test_correctly_calls_rejection_sampler(k: int, batch_size: int):
 
     target_worker.execute_model.return_value = [target_output[0]]
 
-    exception_secret = 'artifical stop'
+    exception_secret = 'artificial stop'
     rejection_sampler.side_effect = ValueError(exception_secret)
 
     with pytest.raises(ValueError, match=exception_secret):
diff --git a/vllm/core/scheduler.py b/vllm/core/scheduler.py
index 024b7e7013441..b17b6cc7fe733 100644
--- a/vllm/core/scheduler.py
+++ b/vllm/core/scheduler.py
@@ -1,4 +1,6 @@
 import enum
+import os
+import random
 import time
 from collections import deque
 from dataclasses import dataclass, field
@@ -15,6 +17,13 @@
 
 logger = init_logger(__name__)
 
+# Test-only: randomly fail _can_append_slots to force preemption. Enabled by
+# any env value other than "", "0" or "false" (case-insensitive).
+ENABLE_ARTIFICIAL_PREEMPT = os.getenv("VLLM_TEST_ENABLE_ARTIFICIAL_PREEMPT",
+                                      "0").lower() not in ("", "0", "false")
+ARTIFICIAL_PREEMPTION_PROB = 0.5  # chance that one slot check is failed
+ARTIFICIAL_PREEMPTION_MAX_CNT = 500  # cap on injected failures per scheduler
+
 
 class PreemptionMode(enum.Enum):
     """Preemption modes.
@@ -286,6 +295,13 @@ def __init__(
         # Latency of the last prompt step
         self.last_prompt_latency = 0.0
 
+        # The following field is test-only. It is used to inject artificial
+        # preemption.
+        self.enable_artificial_preemption = ENABLE_ARTIFICIAL_PREEMPT
+        self.artificial_preempt_cnt = (ARTIFICIAL_PREEMPTION_MAX_CNT
+                                       if self.enable_artificial_preemption
+                                       else 0)
+
     @property
     def lora_enabled(self) -> bool:
         return bool(self.lora_config)
@@ -386,15 +402,13 @@ def _schedule_running(
         # groups to preempt.
         now = time.time()
         running_queue = policy.sort_by_priority(now, running_queue)
-
         while running_queue:
             seq_group = running_queue[0]
             num_running_tokens = self._get_num_new_tokens(
                 seq_group, SequenceStatus.RUNNING, enable_chunking, budget)
 
-            # We can have up to 1 running prefill at any given time in running
-            # queue, which means we can guarantee chunk size is at least 1.
-            assert num_running_tokens != 0
+            if num_running_tokens == 0:
+                break
 
             running_queue.popleft()
             while not self._can_append_slots(seq_group):
@@ -449,9 +463,6 @@ def _schedule_running(
                 if curr_loras is not None and seq_group.lora_int_id > 0:
                     curr_loras.add(seq_group.lora_int_id)
 
-        # Make sure all queues are updated.
-        assert len(running_queue) == 0
-
         return running_queue, SchedulerRunningOutputs(
             decode_seq_groups=decode_seq_groups,
             prefill_seq_groups=prefill_seq_groups,
@@ -545,7 +556,6 @@ def _schedule_swapped(
                     ScheduledSequenceGroup(seq_group,
                                            token_chunk_size=num_new_tokens))
             else:
-                assert num_new_tokens == 1
                 decode_seq_groups.append(
                     ScheduledSequenceGroup(seq_group, token_chunk_size=1))
             budget.add_num_batched_tokens(seq_group.request_id, num_new_tokens)
@@ -868,6 +878,13 @@ def _can_append_slots(self, seq_group: SequenceGroup) -> bool:
         """Determine whether or not we have enough space in the KV cache to
         continue generation of the sequence group.
         """
+        # It is True only for testing case to trigger artificial preemption.
+        if (self.enable_artificial_preemption
+                and random.uniform(0, 1) < ARTIFICIAL_PREEMPTION_PROB
+                and self.artificial_preempt_cnt > 0):
+            self.artificial_preempt_cnt -= 1
+            return False
+
         # Appending slots only occurs in decoding.
         is_prefill = False
 
@@ -1116,11 +1133,14 @@ def _get_num_new_tokens(self, seq_group: SequenceGroup,
         if `enable_chunking` is True. If a sequence group has multiple
         sequences (e.g., running beam search), it means it is in decoding
         phase, so chunking doesn't happen.
+
+        Returns 0 if the new token cannot be computed due to token budget.
         """
         num_new_tokens = 0
         seqs = seq_group.get_seqs(status=status)
         for seq in seqs:
             num_new_tokens += seq.get_num_new_tokens()
+        assert num_new_tokens > 0
         # Chunk if a running request cannot fit in.
         # If number of seq > 1, it means it is doing beam search in a
         # decode phase. Do not chunk in that case.

From 5e401bce17ae9b327020ade6ba0ddceea2853451 Mon Sep 17 00:00:00 2001
From: Ronen Schaffer <ronen.schaffer@ibm.com>
Date: Thu, 2 May 2024 05:57:12 +0300
Subject: [PATCH 31/65] [CI]Add regression tests to ensure the async engine
 generates metrics (#4524)

---
 tests/metrics/test_metrics.py | 94 +++++++++++++++++++++++++++++++++++
 1 file changed, 94 insertions(+)

diff --git a/tests/metrics/test_metrics.py b/tests/metrics/test_metrics.py
index 0ab9c63ce4377..311e60ba60f61 100644
--- a/tests/metrics/test_metrics.py
+++ b/tests/metrics/test_metrics.py
@@ -1,4 +1,10 @@
 import pytest
+from prometheus_client import REGISTRY
+
+from vllm import EngineArgs, LLMEngine
+from vllm.engine.arg_utils import AsyncEngineArgs
+from vllm.engine.async_llm_engine import AsyncLLMEngine
+from vllm.sampling_params import SamplingParams
 
 MODELS = [
     "facebook/opt-125m",
@@ -68,3 +74,91 @@ def test_metric_counter_generation_tokens(
     assert vllm_generation_count == metric_count, (
         f"generation token count: {vllm_generation_count!r}\n"
         f"metric: {metric_count!r}")
+
+
+@pytest.mark.parametrize("model", MODELS)
+@pytest.mark.parametrize("dtype", ["half"])
+@pytest.mark.parametrize("max_tokens", [4])
+@pytest.mark.parametrize("disable_log_stats", [True, False])
+@pytest.mark.asyncio
+async def test_async_engine_log_metrics_regression(
+    example_prompts,
+    model: str,
+    dtype: str,
+    max_tokens: int,
+    disable_log_stats: bool,
+) -> None:
+    """
+    Regression test ensuring async engine generates metrics
+    when disable_log_stats=False
+    (see: https://github.com/vllm-project/vllm/pull/4150#pullrequestreview-2008176678)
+    """
+    engine_args = AsyncEngineArgs(model=model,
+                                  dtype=dtype,
+                                  disable_log_stats=disable_log_stats)
+    async_engine = AsyncLLMEngine.from_engine_args(engine_args)
+    for i, prompt in enumerate(example_prompts):
+        results = async_engine.generate(
+            prompt,
+            SamplingParams(max_tokens=max_tokens),
+            f"request-id-{i}",
+        )
+        # Drain the async iterator so this request's stream is fully consumed
+        async for _ in results:
+            pass
+
+    assert_metrics(async_engine.engine, disable_log_stats,
+                   len(example_prompts))
+
+
+@pytest.mark.parametrize("model", MODELS)
+@pytest.mark.parametrize("dtype", ["half"])
+@pytest.mark.parametrize("max_tokens", [4])
+@pytest.mark.parametrize("disable_log_stats", [True, False])
+def test_engine_log_metrics_regression(
+    example_prompts,
+    model: str,
+    dtype: str,
+    max_tokens: int,
+    disable_log_stats: bool,
+) -> None:
+    engine_args = EngineArgs(model=model,
+                             dtype=dtype,
+                             disable_log_stats=disable_log_stats)
+    engine = LLMEngine.from_engine_args(engine_args)
+    for i, prompt in enumerate(example_prompts):  # queue every prompt first
+        engine.add_request(
+            f"request-id-{i}",
+            prompt,
+            SamplingParams(max_tokens=max_tokens),
+        )
+    while engine.has_unfinished_requests():  # then step until all are done
+        engine.step()
+
+    assert_metrics(engine, disable_log_stats, len(example_prompts))
+
+
+def assert_metrics(engine: LLMEngine, disable_log_stats: bool,
+                   num_requests: int) -> None:
+    if disable_log_stats:  # stat_logger is expected to be absent entirely
+        with pytest.raises(AttributeError):
+            _ = engine.stat_logger
+    else:
+        assert (engine.stat_logger
+                is not None), "engine.stat_logger should be set"
+        # Sanity check that metrics were generated: the "_count" sample of
+        # every request-level histogram below must equal the number of
+        # requests that were run.
+        labels = {'model_name': engine.model_config.model}
+        request_histogram_metrics = [
+            "vllm:e2e_request_latency_seconds",
+            "vllm:request_prompt_tokens",
+            "vllm:request_generation_tokens",
+            "vllm:request_params_best_of",
+            "vllm:request_params_n",
+        ]
+        for metric_name in request_histogram_metrics:
+            metric_value = REGISTRY.get_sample_value(f"{metric_name}_count",
+                                                     labels)
+            assert (
+                metric_value == num_requests), "Metrics should be collected"

From cf8cac8c701079a3fda068ffd1cd6f72a490aa6d Mon Sep 17 00:00:00 2001
From: SangBin Cho <rkooo567@gmail.com>
Date: Thu, 2 May 2024 12:01:00 +0900
Subject: [PATCH 32/65] [mypy][6/N] Fix all the core subdirectory typing
 (#4450)

Co-authored-by: Cade Daniel <edacih@gmail.com>
---
 .github/workflows/mypy.yaml                |   6 +-
 format.sh                                  |   2 +-
 vllm/core/block/block_table.py             |  16 ++--
 vllm/core/block/common.py                  |  20 +++-
 vllm/core/block/cpu_gpu_block_allocator.py |  49 ++++++----
 vllm/core/block/interfaces.py              | 104 +++++++++++++++++++--
 vllm/core/block/naive_block.py             |  52 +++++++++--
 vllm/core/block/prefix_caching_block.py    |  85 ++++++++++++-----
 vllm/core/block_manager_v2.py              |   9 +-
 vllm/core/evictor_v2.py                    |  15 ++-
 10 files changed, 275 insertions(+), 83 deletions(-)

diff --git a/.github/workflows/mypy.yaml b/.github/workflows/mypy.yaml
index a19be8525f902..5b2bad1476dc3 100644
--- a/.github/workflows/mypy.yaml
+++ b/.github/workflows/mypy.yaml
@@ -33,6 +33,7 @@ jobs:
     - name: Mypy
       run: |
         mypy vllm/attention --config-file pyproject.toml
+        mypy vllm/core --config-file pyproject.toml
         mypy vllm/distributed --config-file pyproject.toml
         mypy vllm/entrypoints --config-file pyproject.toml
         mypy vllm/executor --config-file pyproject.toml
@@ -42,9 +43,6 @@ jobs:
         mypy vllm/engine  --config-file pyproject.toml
         mypy vllm/worker --config-file pyproject.toml
         mypy vllm/spec_decode --config-file pyproject.toml
-        mypy vllm/lora --config-file pyproject.toml
         mypy vllm/model_executor  --config-file pyproject.toml
-
-        # TODO(sang): Fix nested dir
-        mypy vllm/core/*.py --follow-imports=skip --config-file pyproject.toml
+        mypy vllm/lora --config-file pyproject.toml
 
diff --git a/format.sh b/format.sh
index bd12e61d77806..49149afe41d04 100755
--- a/format.sh
+++ b/format.sh
@@ -95,7 +95,7 @@ echo 'vLLM yapf: Done'
 # Run mypy
 echo 'vLLM mypy:'
 mypy vllm/attention --config-file pyproject.toml
-mypy vllm/core/*.py --follow-imports=skip --config-file pyproject.toml
+mypy vllm/core --config-file pyproject.toml
 mypy vllm/distributed --config-file pyproject.toml
 mypy vllm/entrypoints --config-file pyproject.toml
 mypy vllm/executor --config-file pyproject.toml
diff --git a/vllm/core/block/block_table.py b/vllm/core/block/block_table.py
index f1b65b2514f76..b0d9511fba521 100644
--- a/vllm/core/block/block_table.py
+++ b/vllm/core/block/block_table.py
@@ -40,7 +40,9 @@ def __init__(
     ):
         self._block_size = block_size
         self._allocator = block_allocator
-        self._blocks: Optional[List[Block]] = _blocks
+        if _blocks is None:
+            _blocks = []
+        self._blocks: List[Block] = _blocks
 
         # Use helper method instead of directly calculating, as blocks
         # may not be allocated.
@@ -104,7 +106,7 @@ def append_token_ids(self,
             token_ids (List[int]): The sequence of token IDs to be appended.
         """
         assert self._is_allocated
-        assert self._blocks is not None
+        assert len(self._blocks) > 0
 
         self.ensure_num_empty_slots(num_empty_slots=len(token_ids) +
                                     num_lookahead_slots)
@@ -141,6 +143,7 @@ def ensure_num_empty_slots(self, num_empty_slots: int) -> None:
         blocks_to_allocate = cdiv(slots_to_allocate, self._block_size)
 
         for _ in range(blocks_to_allocate):
+            assert len(self._blocks) > 0
             self._blocks.append(
                 self._allocator.allocate_mutable(prev_block=self._blocks[-1],
                                                  device=device))
@@ -159,6 +162,7 @@ def fork(self) -> "BlockTable":
                 the current instance.
         """
         assert self._is_allocated
+        assert len(self._blocks) > 0
         forked_blocks = self._allocator.fork(self._blocks[-1])
         return BlockTable(
             block_size=self._block_size,
@@ -177,10 +181,10 @@ def free(self) -> None:
         assert self._is_allocated
         for block in self._blocks:
             self._allocator.free(block)
-        self._blocks = None
+        self._blocks = []
 
     @property
-    def physical_block_ids(self) -> List[int]:
+    def physical_block_ids(self) -> List[Optional[int]]:
         """Returns a list of physical block indices for the blocks in the
         BlockTable.
 
@@ -235,7 +239,7 @@ def _allocate_blocks_for_token_ids(self, prev_block: Optional[Block],
 
     def _get_all_token_ids(self) -> List[int]:
         # NOTE: This function is O(seq_len); use sparingly.
-        token_ids = []
+        token_ids: List[int] = []
 
         if not self._is_allocated:
             return token_ids
@@ -247,7 +251,7 @@ def _get_all_token_ids(self) -> List[int]:
 
     @property
     def _is_allocated(self) -> bool:
-        return self._blocks is not None
+        return len(self._blocks) > 0
 
     @property
     def _num_empty_slots(self) -> int:
diff --git a/vllm/core/block/common.py b/vllm/core/block/common.py
index f11234a0bf2dd..3f97a1210b096 100644
--- a/vllm/core/block/common.py
+++ b/vllm/core/block/common.py
@@ -1,5 +1,5 @@
 from collections import defaultdict
-from typing import Dict, Iterable, List, Optional
+from typing import Dict, Iterable, List, Optional, Protocol
 
 from vllm.core.block.interfaces import Block, BlockAllocator
 
@@ -7,7 +7,19 @@
 RefCount = int
 
 
-class RefCounter:
+class RefCounterProtocol(Protocol):  # shared by RefCounter/ReadOnlyRefCounter
+
+    def incr(self, block_id: BlockId) -> RefCount:  # stub; subclasses override
+        raise NotImplementedError
+
+    def decr(self, block_id: BlockId) -> RefCount:  # stub; subclasses override
+        raise NotImplementedError
+
+    def get(self, block_id: BlockId) -> RefCount:  # stub; subclasses override
+        raise NotImplementedError
+
+
+class RefCounter(RefCounterProtocol):
     """A class for managing reference counts for a set of block indices.
 
     The RefCounter class maintains a dictionary that maps block indices to their
@@ -54,7 +66,7 @@ def as_readonly(self) -> "ReadOnlyRefCounter":
         return ReadOnlyRefCounter(self)
 
 
-class ReadOnlyRefCounter:
+class ReadOnlyRefCounter(RefCounterProtocol):
     """A read-only view of the RefCounter class.
 
     The ReadOnlyRefCounter class provides a read-only interface to access the
@@ -96,7 +108,7 @@ class CopyOnWriteTracker:
 
     def __init__(
         self,
-        refcounter: RefCounter,
+        refcounter: RefCounterProtocol,
         allocator: BlockAllocator,
     ):
         self._copy_on_writes: Dict[BlockId, List[BlockId]] = defaultdict(list)
diff --git a/vllm/core/block/cpu_gpu_block_allocator.py b/vllm/core/block/cpu_gpu_block_allocator.py
index 23e1a4cf91266..d25d22cf52838 100644
--- a/vllm/core/block/cpu_gpu_block_allocator.py
+++ b/vllm/core/block/cpu_gpu_block_allocator.py
@@ -1,6 +1,6 @@
-from typing import Dict, List, Optional
+from typing import Dict, FrozenSet, List, Optional
 
-from vllm.core.block.interfaces import (Block, BlockAllocator,
+from vllm.core.block.interfaces import (Block, BlockAllocator, BlockId,
                                         DeviceAwareBlockAllocator)
 from vllm.core.block.naive_block import NaiveBlock, NaiveBlockAllocator
 from vllm.core.block.prefix_caching_block import PrefixCachingBlockAllocator
@@ -57,15 +57,15 @@ def create(
         cpu_block_ids = block_ids[num_gpu_blocks:]
 
         if allocator_type == "naive":
-            gpu_allocator = NaiveBlockAllocator(
-                create_block=NaiveBlock,
+            gpu_allocator: BlockAllocator = NaiveBlockAllocator(
+                create_block=NaiveBlock,  # type: ignore
                 num_blocks=num_gpu_blocks,
                 block_size=block_size,
                 block_ids=gpu_block_ids,
             )
 
-            cpu_allocator = NaiveBlockAllocator(
-                create_block=NaiveBlock,
+            cpu_allocator: BlockAllocator = NaiveBlockAllocator(
+                create_block=NaiveBlock,  # type: ignore
                 num_blocks=num_cpu_blocks,
                 block_size=block_size,
                 block_ids=cpu_block_ids,
@@ -105,13 +105,14 @@ def __init__(
             Device.GPU: gpu_block_allocator,
         }
 
-        self._block_ids_to_allocator = {}
+        self._block_ids_to_allocator: Dict[int, BlockAllocator] = {}
         for _, allocator in self._allocators.items():
             for block_id in allocator.all_block_ids:
                 self._block_ids_to_allocator[block_id] = allocator
 
-    def allocate_mutable(self, prev_block: Optional[Block],
-                         device: Device) -> Block:
+    def allocate_mutable(self,
+                         prev_block: Optional[Block],
+                         device: Optional[Device] = None) -> Block:
         """Allocates a new mutable block on the specified device.
 
         Args:
@@ -122,10 +123,13 @@ def allocate_mutable(self, prev_block: Optional[Block],
         Returns:
             Block: The newly allocated mutable block.
         """
+        assert device is not None
         return self._allocators[device].allocate_mutable(prev_block)
 
-    def allocate_immutable(self, prev_block: Optional[Block],
-                           token_ids: List[int], device: Device) -> Block:
+    def allocate_immutable(self,
+                           prev_block: Optional[Block],
+                           token_ids: List[int],
+                           device: Optional[Device] = None) -> Block:
         """Allocates a new immutable block with the provided token IDs on the
         specified device.
 
@@ -140,6 +144,7 @@ def allocate_immutable(self, prev_block: Optional[Block],
             Block: The newly allocated immutable block containing the provided
                 token IDs.
         """
+        assert device is not None
         return self._allocators[device].allocate_immutable(
             prev_block, token_ids)
 
@@ -149,7 +154,9 @@ def free(self, block: Block) -> None:
         Args:
             block (Block): The block to be freed.
         """
-        allocator = self._block_ids_to_allocator[block.block_id]
+        block_id = block.block_id
+        assert block_id is not None
+        allocator = self._block_ids_to_allocator[block_id]
         return allocator.free(block)
 
     def fork(self, last_block: Block) -> List[Block]:
@@ -163,19 +170,22 @@ def fork(self, last_block: Block) -> List[Block]:
             List[Block]: A new list of blocks that shares the same memory as the
                 original sequence.
         """
-        allocator = self._block_ids_to_allocator[last_block.block_id]
+        block_id = last_block.block_id
+        assert block_id is not None
+        allocator = self._block_ids_to_allocator[block_id]
         return allocator.fork(last_block)
 
-    def get_num_free_blocks(self, device: Device) -> int:
+    def get_num_free_blocks(self, device: Optional[Device] = None) -> int:
         """Returns the number of free blocks available on the specified device.
 
         Args:
             device (Device): The device for which to query the number of free
-                blocks.
+                blocks. AssertionError is raised if None is passed.
 
         Returns:
             int: The number of free blocks available on the specified device.
         """
+        assert device is not None
         return self._allocators[device].get_num_free_blocks()
 
     def clear_copy_on_writes(self) -> Dict[int, List[int]]:
@@ -210,5 +220,12 @@ def get_common_computed_block_ids(
         return self._allocators[device].get_common_computed_block_ids(
             seq_block_ids)
 
-    def all_block_ids(self) -> frozenset[int]:
+    @property
+    def all_block_ids(self) -> FrozenSet[int]:
         return frozenset(self._block_ids_to_allocator.keys())
+
+    def promote_to_immutable_block(self, block: Block) -> BlockId:
+        raise NotImplementedError
+
+    def cow_block_if_not_appendable(self, block: Block) -> Optional[BlockId]:
+        raise NotImplementedError
diff --git a/vllm/core/block/interfaces.py b/vllm/core/block/interfaces.py
index 440d6a4b04d3b..08d2f87301d92 100644
--- a/vllm/core/block/interfaces.py
+++ b/vllm/core/block/interfaces.py
@@ -3,6 +3,8 @@
 
 from vllm.utils import Device
 
+BlockId = int
+
 
 class Block(ABC):
 
@@ -15,6 +17,12 @@ def append_token_ids(self, token_ids: List[int]) -> None:
     def block_id(self) -> Optional[int]:
         pass
 
+    @block_id.setter
+    @abstractmethod
+    def block_id(self, value: Optional[int]) -> None:
+        """NOTE: Do not use this API outside Block."""
+        self._block_id = value
+
     @property
     @abstractmethod
     def token_ids(self) -> List[int]:
@@ -35,6 +43,27 @@ def is_full(self) -> bool:
     def prev_block(self) -> Optional["Block"]:
         pass
 
+    @property
+    @abstractmethod
+    def computed(self) -> bool:
+        raise NotImplementedError
+
+    @computed.setter
+    @abstractmethod
+    def computed(self, value) -> None:
+        """Should only be used by PrefixCachingBlockAllocator"""
+        raise NotImplementedError
+
+    @property
+    @abstractmethod
+    def last_accessed(self) -> float:
+        raise NotImplementedError
+
+    @last_accessed.setter
+    @abstractmethod
+    def last_accessed(self, last_accessed_ts: float):
+        raise NotImplementedError
+
     class Factory(Protocol):
 
         @abstractmethod
@@ -48,6 +77,17 @@ def __call__(
         ) -> "Block":
             pass
 
+    @property
+    @abstractmethod
+    def content_hash(self) -> Optional[int]:
+        """Return the content-based hash of the current block, or None if it is
+        not yet defined or not supported.
+
+        For the content-based hash to be defined, the current block must be
+        full.
+        """
+        return None
+
 
 class BlockAllocator(ABC):
 
@@ -57,7 +97,7 @@ def allocate_mutable(self, prev_block: Optional[Block]) -> Block:
 
     @abstractmethod
     def allocate_immutable(self, prev_block: Optional[Block],
-                           token_ids: List[int], device: Device) -> Block:
+                           token_ids: List[int]) -> Block:
         pass
 
     @abstractmethod
@@ -69,7 +109,7 @@ def fork(self, last_block: Block) -> List[Block]:
         pass
 
     @abstractmethod
-    def get_num_free_blocks(self, device: Device) -> int:
+    def get_num_free_blocks(self) -> int:
         pass
 
     @property
@@ -82,11 +122,12 @@ def clear_copy_on_writes(self) -> Dict[int, List[int]]:
         pass
 
     @abstractmethod
-    def mark_blocks_as_accessed(self) -> None:
+    def mark_blocks_as_accessed(self, block_ids: List[int],
+                                now: float) -> None:
         pass
 
     @abstractmethod
-    def mark_blocks_as_computed(self) -> None:
+    def mark_blocks_as_computed(self, block_ids: List[int]) -> None:
         pass
 
     @abstractmethod
@@ -94,21 +135,66 @@ def get_common_computed_block_ids(
             self, seq_block_ids: List[List[int]]) -> List[int]:
         pass
 
+    @abstractmethod
+    def cow_block_if_not_appendable(self, block: Block) -> Optional["BlockId"]:
+        """NOTE: This should not be used outside of Block"""
+        pass
+
+    @abstractmethod
+    def promote_to_immutable_block(self, block: Block) -> BlockId:
+        """NOTE: This should not be used outside of Block"""
+        pass
+
     class NoFreeBlocksError(ValueError):
         pass
 
 
-class DeviceAwareBlockAllocator(BlockAllocator):
+class DeviceAwareBlockAllocator(ABC):
 
     @abstractmethod
-    def allocate_mutable(self, prev_block: Optional[Block]) -> Block:
+    def allocate_mutable(self,
+                         prev_block: Optional[Block],
+                         device: Optional[Device] = None) -> Block:
         pass
 
     @abstractmethod
-    def allocate_immutable(self, prev_block: Optional[Block],
-                           token_ids: List[int], device: Device) -> Block:
+    def allocate_immutable(self,
+                           prev_block: Optional[Block],
+                           token_ids: List[int],
+                           device: Optional[Device] = None) -> Block:
         pass
 
     @abstractmethod
-    def get_num_free_blocks(self, device: Device) -> int:
+    def get_num_free_blocks(self, device: Optional[Device] = None) -> int:
+        pass
+
+    @abstractmethod
+    def free(self, block: Block) -> None:
+        pass
+
+    @abstractmethod
+    def fork(self, last_block: Block) -> List[Block]:
+        pass
+
+    @property
+    @abstractmethod
+    def all_block_ids(self) -> FrozenSet[int]:
+        pass
+
+    @abstractmethod
+    def clear_copy_on_writes(self) -> Dict[int, List[int]]:
+        pass
+
+    @abstractmethod
+    def mark_blocks_as_accessed(self, block_ids: List[int],
+                                now: float) -> None:
+        pass
+
+    @abstractmethod
+    def mark_blocks_as_computed(self, block_ids: List[int]) -> None:
+        pass
+
+    @abstractmethod
+    def get_common_computed_block_ids(
+            self, seq_block_ids: List[List[int]]) -> List[int]:
         pass
diff --git a/vllm/core/block/naive_block.py b/vllm/core/block/naive_block.py
index a0bf33912d935..10af129246889 100644
--- a/vllm/core/block/naive_block.py
+++ b/vllm/core/block/naive_block.py
@@ -1,10 +1,9 @@
-from typing import Dict, Iterable, List, Optional, Set
+from typing import Dict, FrozenSet, Iterable, List, Optional, Set
 
 from vllm.core.block.common import (CopyOnWriteTracker, RefCounter,
                                     get_all_blocks_recursively)
-from vllm.core.block.interfaces import Block, BlockAllocator
+from vllm.core.block.interfaces import Block, BlockAllocator, BlockId, Device
 
-BlockId = int
 Refcount = int
 
 
@@ -49,8 +48,10 @@ def __init__(
             allocator=self,
         )
 
-    def allocate_immutable(self, prev_block: Optional[Block],
-                           token_ids: List[int]) -> Block:
+    def allocate_immutable(self,
+                           prev_block: Optional[Block],
+                           token_ids: List[int],
+                           device: Optional[Device] = None) -> Block:
         """Allocates a new immutable block with the given token IDs, linked to
         the previous block.
 
@@ -63,11 +64,14 @@ def allocate_immutable(self, prev_block: Optional[Block],
         Returns:
             Block: The newly allocated immutable block.
         """
+        assert device is None
         block = self.allocate_mutable(prev_block=prev_block)
         block.append_token_ids(token_ids)
         return block
 
-    def allocate_mutable(self, prev_block: Optional[Block]) -> Block:
+    def allocate_mutable(self,
+                         prev_block: Optional[Block],
+                         device: Optional[Device] = None) -> Block:
         """Allocates a new mutable block, linked to the previous block.
 
         Args:
@@ -78,6 +82,7 @@ def allocate_mutable(self, prev_block: Optional[Block]) -> Block:
         Returns:
             Block: The newly allocated mutable block.
         """
+        assert device is None
         block_id = self._allocate_new_block_id()
         return self._create_block(
             prev_block=prev_block,
@@ -88,6 +93,7 @@ def allocate_mutable(self, prev_block: Optional[Block]) -> Block:
         )
 
     def free(self, block: Block) -> None:
+        assert block.block_id is not None
         self._free_block_id(block.block_id)
 
         # Mark the block as having no allocation.
@@ -111,6 +117,7 @@ def fork(self, last_block: Block) -> List[Block]:
         for block in source_blocks:
 
             # Increment refcount for each block.
+            assert block.block_id is not None
             refcount = self._refcounter.incr(block.block_id)
             assert refcount != 1, "can't fork free'd block"
 
@@ -126,7 +133,8 @@ def fork(self, last_block: Block) -> List[Block]:
 
         return forked_blocks
 
-    def get_num_free_blocks(self) -> int:
+    def get_num_free_blocks(self, device: Optional[Device] = None) -> int:
+        assert device is None
         return len(self._free_block_indices)
 
     def _allocate_new_block_id(self) -> BlockId:
@@ -148,7 +156,7 @@ def refcounter(self):
         return self._refcounter
 
     @property
-    def all_block_ids(self):
+    def all_block_ids(self) -> FrozenSet[int]:
         return self._all_block_indices
 
     def cow_block_if_not_appendable(self, block: Block) -> Optional[BlockId]:
@@ -200,6 +208,9 @@ def get_common_computed_block_ids(
         """
         return []
 
+    def promote_to_immutable_block(self, block: Block) -> BlockId:
+        raise NotImplementedError
+
 
 class NaiveBlock(Block):
     """An implementation of the Block class that does not support prefix
@@ -224,13 +235,13 @@ class NaiveBlock(Block):
     """
 
     def __init__(self,
-                 prev_block: Block,
+                 prev_block: Optional[Block],
                  token_ids: List[int],
                  block_size: int,
                  allocator: BlockAllocator,
                  block_id: Optional[int] = None,
                  _cow_target: Optional[Block] = None):
-        self._token_ids = []
+        self._token_ids: List[int] = []
         self._block_size = block_size
         self._prev_block = prev_block
         self._block_id = block_id
@@ -256,6 +267,22 @@ def _append_token_ids_no_cow(self, token_ids: List[int]) -> None:
         assert self.num_empty_slots >= len(token_ids)
         self._token_ids.extend(token_ids)
 
+    @property
+    def computed(self) -> bool:
+        raise NotImplementedError
+
+    @computed.setter
+    def computed(self, value) -> None:
+        raise NotImplementedError
+
+    @property
+    def last_accessed(self) -> float:
+        raise NotImplementedError
+
+    @last_accessed.setter
+    def last_accessed(self, last_accessed_ts: float):
+        raise NotImplementedError
+
     @property
     def block_id(self) -> Optional[int]:
         return self._block_id
@@ -276,9 +303,14 @@ def num_empty_slots(self) -> int:
     def token_ids(self) -> List[int]:
         return self._token_ids
 
+    @property
     def block_size(self) -> int:
         return self._block_size
 
     @property
     def prev_block(self) -> Optional["Block"]:
         return self._prev_block
+
+    @property
+    def content_hash(self) -> Optional[int]:
+        return None
diff --git a/vllm/core/block/prefix_caching_block.py b/vllm/core/block/prefix_caching_block.py
index 292a750146ae6..e9000c9bfff7f 100644
--- a/vllm/core/block/prefix_caching_block.py
+++ b/vllm/core/block/prefix_caching_block.py
@@ -1,16 +1,15 @@
 """Token blocks."""
 from itertools import takewhile
 from os.path import commonprefix
-from typing import Dict, Iterable, List, Optional
+from typing import Dict, FrozenSet, Iterable, List, Optional
 
 from vllm.core.block.common import (CopyOnWriteTracker,
                                     get_all_blocks_recursively)
-from vllm.core.block.interfaces import Block, BlockAllocator
+from vllm.core.block.interfaces import Block, BlockAllocator, BlockId, Device
 from vllm.core.block.naive_block import NaiveBlock, NaiveBlockAllocator
 from vllm.core.evictor_v2 import EvictionPolicy, Evictor, make_evictor
 
 PrefixHash = int
-BlockId = int
 
 # By default, we init our block access time as _DEFAULT_LAST_ACCESSED_TIME
 # so that if we find one block is still hold _DEFAULT_LAST_ACCESSED_TIME,
@@ -38,7 +37,7 @@ def __init__(
         num_blocks: int,
         block_size: int,
         block_ids: Optional[Iterable[int]] = None,
-        eviction_policy: Optional[EvictionPolicy] = EvictionPolicy.LRU,
+        eviction_policy: EvictionPolicy = EvictionPolicy.LRU,
     ):
         # A mapping of prefix hash to block index. All blocks which have a
         # prefix hash will be in this dict, even if they have refcount 0.
@@ -49,7 +48,7 @@ def __init__(
 
         # An allocator for blocks that do not have prefix hashes.
         self._hashless_allocator = NaiveBlockAllocator(
-            create_block=self._create_block,
+            create_block=self._create_block,  # type: ignore
             num_blocks=num_blocks,
             block_size=block_size,
             block_ids=block_ids,
@@ -79,7 +78,7 @@ def _create_block(
         block_size: int,
         allocator: BlockAllocator,
         block_id: Optional[int] = None,
-        computed: Optional[bool] = False,
+        computed: bool = False,
     ) -> Block:
         # Bind block to self.
         allocator = self
@@ -93,8 +92,10 @@ def _create_block(
             computed=computed,
         )
 
-    def allocate_immutable(self, prev_block: Optional[Block],
-                           token_ids: List[int]) -> Block:
+    def allocate_immutable(self,
+                           prev_block: Optional[Block],
+                           token_ids: List[int],
+                           device: Optional[Device] = None) -> Block:
         """Allocates an immutable block with the given token IDs, reusing cached
         blocks if possible.
 
@@ -105,6 +106,7 @@ def allocate_immutable(self, prev_block: Optional[Block],
         Returns:
             Block: The allocated immutable block.
         """
+        assert device is None
         assert_prefix_caching_block_or_none(prev_block)
 
         block = self._create_block(
@@ -127,16 +129,20 @@ def allocate_immutable(self, prev_block: Optional[Block],
 
         return block
 
-    def allocate_mutable(self, prev_block: Block) -> Block:
+    def allocate_mutable(self,
+                         prev_block: Optional[Block],
+                         device: Optional[Device] = None) -> Block:
         """Allocates a mutable block. If there are no free blocks, this will
         evict unused cached blocks.
 
         Args:
             prev_block (Block): The previous block in the sequence.
+                None is not allowed, unlike in its super class.
 
         Returns:
             Block: The allocated mutable block.
         """
+        assert device is None
         assert_prefix_caching_block_or_none(prev_block)
 
         try:
@@ -144,6 +150,7 @@ def allocate_mutable(self, prev_block: Block) -> Block:
                 prev_block=prev_block)
 
             assert block.block_id not in self._blocks
+            assert block.block_id is not None
             self._blocks[block.block_id] = block
             return block
         except BlockAllocator.NoFreeBlocksError:
@@ -183,6 +190,7 @@ def allocate_mutable(self, prev_block: Block) -> Block:
             assert block.content_hash is None
 
             assert block.block_id not in self._blocks
+            assert block.block_id is not None
             self._blocks[block.block_id] = block
             return block
 
@@ -225,6 +233,7 @@ def _free_block_id_for_block(self, block_id: BlockId,
             # We have fork case where block would get more than one ref,
             # so we cannot free it from tracking if ref cnt large than 1
             if refcount <= 1:
+                assert block.block_id is not None
                 del self._blocks[block.block_id]
             return self._hashless_allocator.free(block)
 
@@ -233,6 +242,7 @@ def _free_block_id_for_block(self, block_id: BlockId,
         # If no longer used, add the block to the evictor.
         if refcount == 0:
             assert block.content_hash in self._cached_blocks
+            assert block.block_id is not None
             del self._blocks[block.block_id]
             self.evictor.add(block.block_id, block.content_hash,
                              block.num_tokens_total, block.last_accessed)
@@ -268,18 +278,18 @@ def fork(self, last_block: Block) -> List[Block]:
 
         return forked_blocks
 
-    def get_num_free_blocks(self) -> int:
+    def get_num_free_blocks(self, device: Optional[Device] = None) -> int:
+        assert device is None
         # The number of free blocks is the number of hashless free blocks
         # plus the number of blocks evictor could free from its list.
         return self._hashless_allocator.get_num_free_blocks(
         ) + self.evictor.num_blocks
 
     @property
-    def all_block_ids(self) -> frozenset[int]:
+    def all_block_ids(self) -> FrozenSet[int]:
         return self._hashless_allocator.all_block_ids
 
-    def promote_to_immutable_block(self,
-                                   block: "PrefixCachingBlock") -> BlockId:
+    def promote_to_immutable_block(self, block: Block) -> BlockId:
         """Once a mutable block is full, it can be promoted to an immutable
         block. This means that its content can be referenced by future blocks
         having the same prefix.
@@ -289,7 +299,7 @@ def promote_to_immutable_block(self,
         block.
 
         Args:
-            block (PrefixCachingBlock): The mutable block to be promoted.
+            block: The mutable block to be promoted.
 
         Returns:
             BlockId: Either the original block index, or the block index of
@@ -385,8 +395,11 @@ def get_common_computed_block_ids(
                 takewhile(lambda block_id: self.block_is_computed(block_id),
                           seq[:-1])) for seq in seq_block_ids
         ]
-        res = commonprefix([ids for ids in ids_list if ids != []])
-        return res
+        # It returns a list of int although type annotation says list of string.
+        return commonprefix([
+            ids for ids in ids_list  # type: ignore
+            if ids != []
+        ])
 
 
 class PrefixCachingBlock(Block):
@@ -403,7 +416,7 @@ class PrefixCachingBlock(Block):
         token_ids (List[int]): The initial token IDs to be stored in the block.
         block_size (int): The maximum number of token IDs that can be stored in
             the block.
-        prefix_caching_allocator (PrefixCachingBlockAllocator): The prefix
+        prefix_caching_allocator (BlockAllocator): The prefix
             caching block allocator associated with this block.
         block_id (Optional[int], optional): The physical block index
             of this block. Defaults to None.
@@ -411,21 +424,25 @@ class PrefixCachingBlock(Block):
 
     def __init__(
         self,
-        prev_block: Optional["PrefixCachingBlock"],
+        prev_block: Optional[Block],
         token_ids: List[int],
         block_size: int,
-        prefix_caching_allocator: PrefixCachingBlockAllocator,
+        prefix_caching_allocator: BlockAllocator,
         block_id: Optional[int] = None,
-        computed: Optional[bool] = False,
+        computed: bool = False,
     ):
+        assert isinstance(prefix_caching_allocator,
+                          PrefixCachingBlockAllocator), (
+                              "Currently this class is only tested with "
+                              "PrefixCachingBlockAllocator.")
         assert_prefix_caching_block_or_none(prev_block)
 
         self._prev_block = prev_block
         self._cached_content_hash: Optional[int] = None
         self._cached_num_tokens_total: Optional[int] = None
         self._prefix_caching_allocator = prefix_caching_allocator
-        self.last_accessed = _DEFAULT_LAST_ACCESSED_TIME
-        self.computed = computed
+        self._last_accessed: float = _DEFAULT_LAST_ACCESSED_TIME
+        self._computed = computed
 
         self._block = NaiveBlock(
             prev_block=prev_block,
@@ -436,6 +453,22 @@ def __init__(
             _cow_target=self,
         )
 
+    @property
+    def computed(self) -> bool:
+        return self._computed
+
+    @computed.setter
+    def computed(self, value) -> None:
+        self._computed = value
+
+    @property
+    def last_accessed(self) -> float:
+        return self._last_accessed
+
+    @last_accessed.setter
+    def last_accessed(self, last_accessed_ts: float):
+        self._last_accessed = last_accessed_ts
+
     def append_token_ids(self, token_ids: List[int]) -> None:
         """Appends the given token IDs to the block and registers the block as
         immutable if the block becomes full.
@@ -483,7 +516,7 @@ def num_tokens_total(self) -> int:
         if self._cached_num_tokens_total is not None:
             return self._cached_num_tokens_total
 
-        _block = self
+        _block: Optional[Block] = self
         self._cached_num_tokens_total = 0
 
         # TODO: current implement here take O(N^2), we expect future
@@ -524,8 +557,10 @@ def content_hash(self) -> Optional[int]:
             return None
 
         is_first_block = self._prev_block is None
-        prev_block_hash = (None if is_first_block else
-                           self._prev_block.content_hash)
+        prev_block_hash = (
+            None if is_first_block else
+            self._prev_block.content_hash  # type: ignore
+        )
 
         # Previous block exists but does not yet have a hash.
         # Return no hash in this case.
diff --git a/vllm/core/block_manager_v2.py b/vllm/core/block_manager_v2.py
index 0857605e2d005..3fbd8b787cf6c 100644
--- a/vllm/core/block_manager_v2.py
+++ b/vllm/core/block_manager_v2.py
@@ -190,7 +190,7 @@ def get_block_table(self, seq: Sequence) -> List[int]:
         assert seq.seq_id in self.block_tables
         block_ids = self.block_tables[seq.seq_id].physical_block_ids
         assert all(b is not None for b in block_ids)
-        return block_ids
+        return block_ids  # type: ignore
 
     def access_all_blocks_in_seq(self, seq: Sequence, now: float):
         # Update the last accessed time of all the blocks accessed
@@ -204,7 +204,9 @@ def access_all_blocks_in_seq(self, seq: Sequence, now: float):
             block_ids = []
             for block_id in block_table.physical_block_ids:
                 block_ids.append(block_id)
-            self.block_allocator.mark_blocks_as_accessed(block_ids, now)
+            self.block_allocator.mark_blocks_as_accessed(
+                block_ids,  # type: ignore
+                now)
 
     def mark_blocks_as_computed(self, seq_group: SequenceGroup):
         # The only need for mark block as computed is for prefix caching,
@@ -227,8 +229,9 @@ def get_common_computed_block_ids(
         seq_block_ids = [
             self.block_tables[seq.seq_id].physical_block_ids for seq in seqs
         ]
+        # NOTE(sang): This assumes seq_block_ids doesn't contain any None.
         return self.block_allocator.get_common_computed_block_ids(
-            seq_block_ids)
+            seq_block_ids)  # type: ignore
 
     def fork(self, parent_seq: Sequence, child_seq: Sequence) -> None:
         src_block_table = self.block_tables[parent_seq.seq_id]
diff --git a/vllm/core/evictor_v2.py b/vllm/core/evictor_v2.py
index b902a39263d14..57759b29347f4 100644
--- a/vllm/core/evictor_v2.py
+++ b/vllm/core/evictor_v2.py
@@ -32,15 +32,20 @@ def evict(self) -> Tuple[int, int]:
 
     @abstractmethod
     def add(self, block_id: int, content_hash: int, num_hashed_tokens: int,
-            last_accessed: int):
+            last_accessed: float):
         """Adds block to the evictor, making it a candidate for eviction"""
         pass
 
     @abstractmethod
-    def update(self, block_id: int, last_accessed: int):
+    def update(self, block_id: int, last_accessed: float):
         """Update corresponding block's access time in metadata"""
         pass
 
+    @abstractmethod
+    def remove(self, block_id: int):
+        """Remove a given block id from the cache."""
+        pass
+
     @abstractproperty
     def num_blocks(self) -> int:
         pass
@@ -55,7 +60,7 @@ class BlockMetaData():
     """
 
     def __init__(self, content_hash: int, num_hashed_tokens: int,
-                 last_accessed: int):
+                 last_accessed: float):
         self.content_hash = content_hash
         self.num_hashed_tokens = num_hashed_tokens
         self.last_accessed = last_accessed
@@ -96,12 +101,12 @@ def evict(self) -> Tuple[int, int]:
         return evicted_block_id, evicted_block.content_hash
 
     def add(self, block_id: int, content_hash: int, num_hashed_tokens: int,
-            last_accessed: int):
+            last_accessed: float):
         self.free_table[block_id] = BlockMetaData(content_hash,
                                                   num_hashed_tokens,
                                                   last_accessed)
 
-    def update(self, block_id: int, last_accessed: int):
+    def update(self, block_id: int, last_accessed: float):
         self.free_table[block_id].last_accessed = last_accessed
 
     def remove(self, block_id: int):

From 2a85f9300733c09ec90819bc6df4bff8f103fd67 Mon Sep 17 00:00:00 2001
From: youkaichao <youkaichao@gmail.com>
Date: Wed, 1 May 2024 21:28:21 -0700
Subject: [PATCH 33/65] [Core][Distributed] enable multiple tp group (#4512)

Co-authored-by: Zhuohan Li <zhuohan123@gmail.com>
---
 .buildkite/test-pipeline.yaml                 | 11 ++++++--
 .buildkite/test-template.j2                   |  3 ++
 tests/distributed/test_pynccl.py              | 28 +++++++++++++++++++
 .../device_communicators/pynccl.py            |  5 +++-
 4 files changed, 43 insertions(+), 4 deletions(-)

diff --git a/.buildkite/test-pipeline.yaml b/.buildkite/test-pipeline.yaml
index 641f366d06031..d518fb9ccecfa 100644
--- a/.buildkite/test-pipeline.yaml
+++ b/.buildkite/test-pipeline.yaml
@@ -25,19 +25,24 @@ steps:
 - label: Distributed Comm Ops Test
   command: pytest -v -s test_comm_ops.py
   working_dir: "/vllm-workspace/tests/distributed"
-  num_gpus: 2 # only support 1 or 2 for now.
+  num_gpus: 2
 
 - label: Distributed Tests
   working_dir: "/vllm-workspace/tests/distributed"
-  num_gpus: 2 # only support 1 or 2 for now.
+  num_gpus: 2
   commands:
-  - pytest -v -s test_pynccl.py
   - pytest -v -s test_pynccl_library.py
   - TEST_DIST_MODEL=facebook/opt-125m pytest -v -s test_basic_distributed_correctness.py
   - TEST_DIST_MODEL=meta-llama/Llama-2-7b-hf pytest -v -s test_basic_distributed_correctness.py
   - TEST_DIST_MODEL=facebook/opt-125m pytest -v -s test_chunked_prefill_distributed.py
   - TEST_DIST_MODEL=meta-llama/Llama-2-7b-hf pytest -v -s test_chunked_prefill_distributed.py
 
+- label: Distributed Tests (Multiple Groups)
+  working_dir: "/vllm-workspace/tests/distributed"
+  num_gpus: 4
+  commands:
+  - pytest -v -s test_pynccl.py
+
 - label: Engine Test
   command: pytest -v -s engine tokenization test_sequence.py test_config.py test_logger.py
 
diff --git a/.buildkite/test-template.j2 b/.buildkite/test-template.j2
index 5c9515840bb03..2cb21cacd065b 100644
--- a/.buildkite/test-template.j2
+++ b/.buildkite/test-template.j2
@@ -45,6 +45,9 @@ steps:
     plugins:
       - kubernetes:
           podSpec:
+            {% if step.num_gpus %}
+            priorityClassName: gpu-priority-cls-{{ step.num_gpus }}
+            {% endif %}
             volumes:
               - name: dshm
                 emptyDir:
diff --git a/tests/distributed/test_pynccl.py b/tests/distributed/test_pynccl.py
index 6d7d4a5806bd0..e71d839648c83 100644
--- a/tests/distributed/test_pynccl.py
+++ b/tests/distributed/test_pynccl.py
@@ -58,6 +58,34 @@ def test_pynccl():
     distributed_run(worker_fn, 2)
 
 
+@worker_fn_wrapper
+def multiple_tp_worker_fn():
+    device = torch.device(f"cuda:{torch.distributed.get_rank()}")
+    groups = [
+        torch.distributed.new_group(ranks=[0, 1], backend="gloo"),
+        torch.distributed.new_group(ranks=[2, 3], backend="gloo")
+    ]
+    group = groups[0] if torch.distributed.get_rank() in [0, 1] else groups[1]
+    comm = NCCLCommunicator(group=group, device=device)
+    tensor = torch.ones(16, 1024, 1024, dtype=torch.float32, device=device)
+    # each group reduces independently: ranks 0/1 twice, ranks 2/3 once
+    if torch.distributed.get_rank() in [0, 1]:
+        comm.all_reduce(tensor)
+        comm.all_reduce(tensor)
+        result = tensor.mean().cpu().item()
+        assert result == 4
+    else:
+        comm.all_reduce(tensor)
+        result = tensor.mean().cpu().item()
+        assert result == 2
+
+
+@pytest.mark.skipif(torch.cuda.device_count() < 4,
+                    reason="Need at least 4 GPUs to run the test.")
+def test_pynccl_multiple_tp():
+    distributed_run(multiple_tp_worker_fn, 4)
+
+
 @worker_fn_wrapper
 def worker_fn_with_cudagraph():
     with torch.no_grad():
diff --git a/vllm/distributed/device_communicators/pynccl.py b/vllm/distributed/device_communicators/pynccl.py
index f21fcd262d810..758994352e3de 100644
--- a/vllm/distributed/device_communicators/pynccl.py
+++ b/vllm/distributed/device_communicators/pynccl.py
@@ -232,6 +232,7 @@ def __init__(
         assert dist.get_backend(group) != dist.Backend.NCCL, (
             "NCCLCommunicator should be attached to a non-NCCL group.")
         self.group = group
+        # note: this rank is the rank in the group
         self.rank = dist.get_rank(group)
         self.world_size = dist.get_world_size(group)
         if self.rank == 0:
@@ -239,7 +240,9 @@ def __init__(
         else:
             self.unique_id = NcclUniqueId()
         tensor = torch.ByteTensor(list(self.unique_id.internal))
-        dist.broadcast(tensor, src=0, group=group)
+        ranks = dist.get_process_group_ranks(group)
+        # arg `src` in `broadcast` is the global rank
+        dist.broadcast(tensor, src=ranks[0], group=group)
         byte_list = tensor.tolist()
         for i, byte in enumerate(byte_list):
             self.unique_id.internal[i] = byte

From 7038e8b80303bf6128acbe508dec910183a1be56 Mon Sep 17 00:00:00 2001
From: alexm-nm <59768536+alexm-nm@users.noreply.github.com>
Date: Thu, 2 May 2024 12:56:22 -0400
Subject: [PATCH 34/65] [Kernel] Support running GPTQ 8-bit models in Marlin
 (#4533)

---
 csrc/ops.h                                    |   4 +-
 csrc/quantization/gptq_marlin/gptq_marlin.cu  | 552 ++++++++++++------
 csrc/quantization/gptq_marlin/gptq_marlin.cuh |   8 +-
 .../gptq_marlin/gptq_marlin_repack.cu         | 152 +++--
 tests/models/test_gptq_marlin.py              |  13 +-
 vllm/_custom_ops.py                           |  14 +-
 .../layers/quantization/gptq_marlin.py        | 134 ++---
 7 files changed, 553 insertions(+), 324 deletions(-)

diff --git a/csrc/ops.h b/csrc/ops.h
index 04b97d1784cd2..8ae052427052f 100644
--- a/csrc/ops.h
+++ b/csrc/ops.h
@@ -132,6 +132,7 @@ torch::Tensor gptq_marlin_gemm(
   torch::Tensor &g_idx,
   torch::Tensor &perm,
   torch::Tensor &workspace,
+  int64_t num_bits,
   int64_t size_m,
   int64_t size_n,
   int64_t size_k,
@@ -141,7 +142,8 @@ torch::Tensor gptq_marlin_repack(
   torch::Tensor &b_q_weight,
   torch::Tensor &perm,
   int64_t size_k,
-  int64_t size_n);
+  int64_t size_n,
+  int64_t num_bits);
 #endif
 
 void squeezellm_gemm(
diff --git a/csrc/quantization/gptq_marlin/gptq_marlin.cu b/csrc/quantization/gptq_marlin/gptq_marlin.cu
index 9902f55167d89..fd0837f0cb39c 100644
--- a/csrc/quantization/gptq_marlin/gptq_marlin.cu
+++ b/csrc/quantization/gptq_marlin/gptq_marlin.cu
@@ -32,7 +32,8 @@ __global__ void permute_cols_kernel(int4 const *__restrict__ a_int4_ptr,
                                     int4 *__restrict__ out_int4_ptr, int size_m,
                                     int size_k, int block_rows) {}
 
-template <const int threads,         // number of threads in a threadblock
+template <const int num_bits,        // number of bits used for weights
+          const int threads,         // number of threads in a threadblock
           const int thread_m_blocks, // number of 16x16 blocks in the m
                                      // dimension (batchsize) of the threadblock
           const int thread_n_blocks, // same for n dimension (output)
@@ -62,8 +63,8 @@ Marlin(const int4 *__restrict__ A, // fp16 input matrix of shape mxk
 torch::Tensor gptq_marlin_gemm(torch::Tensor &a, torch::Tensor &b_q_weight,
                                torch::Tensor &b_scales, torch::Tensor &g_idx,
                                torch::Tensor &perm, torch::Tensor &workspace,
-                               int64_t size_m, int64_t size_n, int64_t size_k,
-                               bool is_k_full) {
+                               int64_t num_bits, int64_t size_m, int64_t size_n,
+                               int64_t size_k, bool is_k_full) {
   TORCH_CHECK_NOT_IMPLEMENTED(false,
                               "marlin_gemm(..) requires CUDA_ARCH >= 8.0");
   return torch::empty({1, 1});
@@ -114,11 +115,21 @@ template <int lut> __device__ inline int lop3(int a, int b, int c) {
   return res;
 }
 
+// PTX prmt.b32: pick result bytes from `a`/`start_byte` as selected by `mask`
+template <int start_byte, int mask>
+__device__ inline uint32_t prmt(uint32_t a) {
+  uint32_t res;
+  asm volatile("prmt.b32 %0, %1, %2, %3;\n"
+               : "=r"(res)
+               : "r"(a), "n"(start_byte), "n"(mask));
+  return res;
+}
+
 // Efficiently dequantize an int32 value into a full B-fragment of 4 fp16
 // values. We mostly follow the strategy in the link below, with some small
 // changes:
 // https://github.com/NVIDIA/FasterTransformer/blob/main/src/fastertransformer/cutlass_extensions/include/cutlass_extensions/interleaved_numeric_conversion.h
-__device__ inline FragB dequant(int q) {
+__device__ inline FragB dequant_4bit(int q) {
   const int LO = 0x000f000f;
   const int HI = 0x00f000f0;
   const int EX = 0x64006400;
@@ -139,6 +150,24 @@ __device__ inline FragB dequant(int q) {
   return frag_b;
 }
 
+__device__ inline FragB dequant_8bit(int q) { // 4 packed bytes -> 4 fp16
+  static constexpr uint32_t mask_for_elt_01 = 0x5250;
+  static constexpr uint32_t mask_for_elt_23 = 0x5351;
+  static constexpr uint32_t start_byte_for_fp16 = 0x64646464;
+  // prmt pairs each picked byte XX of q with 0x64: 0x64XX == fp16(1024 + XX)
+  uint32_t lo = prmt<start_byte_for_fp16, mask_for_elt_01>(q); // q bytes 0, 2
+  uint32_t hi = prmt<start_byte_for_fp16, mask_for_elt_23>(q); // q bytes 1, 3
+  // 0x6480 == fp16(1152) == 1024 + 128, so the subtraction yields XX - 128
+  static constexpr uint32_t I8s_TO_F16s_MAGIC_NUM = 0x64806480;
+
+  FragB frag_b;
+  frag_b[0] = __hsub2(*reinterpret_cast<half2 *>(&lo),
+                      *reinterpret_cast<const half2 *>(&I8s_TO_F16s_MAGIC_NUM));
+  frag_b[1] = __hsub2(*reinterpret_cast<half2 *>(&hi),
+                      *reinterpret_cast<const half2 *>(&I8s_TO_F16s_MAGIC_NUM));
+  return frag_b;
+}
+
 // Multiply dequantized values by the corresponding quantization scale; used
 // only for grouped quantization.
 __device__ inline void scale(FragB &frag_b, FragS &frag_s, int i) {
@@ -162,6 +191,13 @@ __device__ inline void scale4(FragB &frag_b, FragS &frag_s_1, FragS &frag_s_2,
   frag_b[1] = __hmul2(frag_b[1], s_val_3_4);
 }
 
+// Multiply the fp32 values c[0], c[1] in place by the two fp16 scales in `s`
+__device__ inline void scale_float(float *c, FragS &s) {
+  __half *s_ptr = reinterpret_cast<__half *>(&s);
+  c[0] = __fmul_rn(c[0], __half2float(s_ptr[0]));
+  c[1] = __fmul_rn(c[1], __half2float(s_ptr[1]));
+}
+
 // Wait until barrier reaches `count`, then lock for current threadblock.
 __device__ inline void barrier_acquire(int *lock, int count) {
   if (threadIdx.x == 0) {
@@ -250,7 +286,8 @@ __global__ void permute_cols_kernel(int4 const *__restrict__ a_int4_ptr,
   }
 }
 
-template <const int threads,         // number of threads in a threadblock
+template <const int num_bits,        // number of bits used for weights
+          const int threads,         // number of threads in a threadblock
           const int thread_m_blocks, // number of 16x16 blocks in the m
                                      // dimension (batchsize) of the threadblock
           const int thread_n_blocks, // same for n dimension (output)
@@ -286,6 +323,8 @@ Marlin(const int4 *__restrict__ A, // fp16 input matrix of shape mxk
   // configurations, while requiring as few slow global cross-threadblock
   // reductions as possible.
 
+  constexpr int pack_factor = 32 / num_bits;
+
   // For larger GEMMs we run multiple batchsize 64 versions in parallel for a
   // better partitioning with less reductions
   int parallel = 1;
@@ -385,21 +424,25 @@ Marlin(const int4 *__restrict__ A, // fp16 input matrix of shape mxk
   constexpr int a_sh_wr_iters = div_ceil(a_sh_stage, a_sh_wr_delta);
 
   // B sizes/strides
-  int b_gl_stride = 16 * prob_n / 32;
-  constexpr int b_sh_stride = 32 * thread_n_blocks / 4;
+  int b_gl_stride = 16 * prob_n / (pack_factor * 4);
+  constexpr int b_sh_stride = ((thread_n_blocks * 16) * 16 / pack_factor) / 4;
+  constexpr int b_thread_vecs = num_bits == 4 ? 1 : 2;
+  constexpr int b_sh_stride_threads = b_sh_stride / b_thread_vecs;
+
   int b_gl_rd_delta_o = b_gl_stride * thread_k_blocks;
-  int b_gl_rd_delta_i = b_gl_stride * (threads / b_sh_stride);
-  constexpr int b_sh_wr_delta = threads;
-  constexpr int b_sh_rd_delta = threads;
+  int b_gl_rd_delta_i = b_gl_stride * (threads / b_sh_stride_threads);
+  constexpr int b_sh_wr_delta = threads * b_thread_vecs;
+  constexpr int b_sh_rd_delta = threads * b_thread_vecs;
   constexpr int b_sh_stage = b_sh_stride * thread_k_blocks;
   constexpr int b_sh_wr_iters = b_sh_stage / b_sh_wr_delta;
 
   // Scale sizes/strides without act_order
   int s_gl_stride = prob_n / 8;
   constexpr int s_sh_stride = 16 * thread_n_blocks / 8;
-  constexpr int s_tb_groups = !has_act_order && group_blocks < thread_k_blocks
-                                  ? thread_k_blocks / group_blocks
-                                  : 1;
+  constexpr int s_tb_groups =
+      !has_act_order && group_blocks != -1 && group_blocks < thread_k_blocks
+          ? thread_k_blocks / group_blocks
+          : 1;
   constexpr int s_sh_stage = s_tb_groups * s_sh_stride;
   int s_gl_rd_delta = s_gl_stride;
 
@@ -425,12 +468,12 @@ Marlin(const int4 *__restrict__ A, // fp16 input matrix of shape mxk
       a_sh_stride * ((threadIdx.x % 32) % 16) + (threadIdx.x % 32) / 16;
   a_sh_rd += 2 * ((threadIdx.x / 32) / (thread_n_blocks / 4));
 
-  int b_gl_rd =
-      b_gl_stride * (threadIdx.x / b_sh_stride) + (threadIdx.x % b_sh_stride);
+  int b_gl_rd = b_gl_stride * (threadIdx.x / b_sh_stride_threads) +
+                (threadIdx.x % b_sh_stride_threads) * b_thread_vecs;
   b_gl_rd += b_sh_stride * slice_col;
   b_gl_rd += b_gl_rd_delta_o * slice_row;
-  int b_sh_wr = threadIdx.x;
-  int b_sh_rd = threadIdx.x;
+  int b_sh_wr = threadIdx.x * b_thread_vecs;
+  int b_sh_rd = threadIdx.x * b_thread_vecs;
 
   // For act_order
   constexpr int k_iter_size = tb_k / b_sh_wr_iters;
@@ -442,8 +485,12 @@ Marlin(const int4 *__restrict__ A, // fp16 input matrix of shape mxk
   // No act_order
   int s_gl_rd;
   if constexpr (!has_act_order) {
-    s_gl_rd = s_gl_stride * ((thread_k_blocks * slice_row) / group_blocks) +
-              s_sh_stride * slice_col + threadIdx.x;
+    if constexpr (group_blocks == -1) {
+      s_gl_rd = s_sh_stride * slice_col + threadIdx.x;
+    } else {
+      s_gl_rd = s_gl_stride * ((thread_k_blocks * slice_row) / group_blocks) +
+                s_sh_stride * slice_col + threadIdx.x;
+    }
   }
   int s_sh_wr = threadIdx.x;
   bool s_sh_wr_pred = threadIdx.x < s_sh_stride;
@@ -511,7 +558,7 @@ Marlin(const int4 *__restrict__ A, // fp16 input matrix of shape mxk
 
   // Register storage for double buffer of shared memory reads.
   FragA frag_a[2][thread_m_blocks];
-  I4 frag_b_quant[2];
+  I4 frag_b_quant[2][b_thread_vecs];
   FragC frag_c[thread_m_blocks][4][2];
   FragS frag_s[2][4];        // No act-order
   FragS act_frag_s[2][4][4]; // For act-order
@@ -575,7 +622,11 @@ Marlin(const int4 *__restrict__ A, // fp16 input matrix of shape mxk
       int4 *sh_b_stage = sh_b + b_sh_stage * pipe;
 #pragma unroll
       for (int i = 0; i < b_sh_wr_iters; i++) {
-        cp_async4_stream(&sh_b_stage[b_sh_wr_delta * i + b_sh_wr], B_ptr[i]);
+#pragma unroll
+        for (int j = 0; j < b_thread_vecs; j++) {
+          cp_async4(&sh_b_stage[b_sh_wr_delta * i + b_sh_wr + j], B_ptr[i] + j);
+        }
+
         B_ptr[i] += b_gl_rd_delta_o;
       }
 
@@ -602,15 +653,15 @@ Marlin(const int4 *__restrict__ A, // fp16 input matrix of shape mxk
             // Only fetch scales if this tile starts a new group
             if (pipe % (group_blocks / thread_k_blocks) == 0) {
               if (s_sh_wr_pred) {
-                cp_async4_stream(&sh_s_stage[s_sh_wr], &scales_ptr[s_gl_rd]);
+                cp_async4(&sh_s_stage[s_sh_wr], &scales_ptr[s_gl_rd]);
               }
               s_gl_rd += s_gl_rd_delta;
             }
           } else {
             for (int i = 0; i < s_tb_groups; i++) {
               if (s_sh_wr_pred) {
-                cp_async4_stream(&sh_s_stage[i * s_sh_stride + s_sh_wr],
-                                 &scales_ptr[s_gl_rd]);
+                cp_async4(&sh_s_stage[i * s_sh_stride + s_sh_wr],
+                          &scales_ptr[s_gl_rd]);
               }
               s_gl_rd += s_gl_rd_delta;
             }
@@ -641,14 +692,24 @@ Marlin(const int4 *__restrict__ A, // fp16 input matrix of shape mxk
     for (int i = 0; i < thread_m_blocks; i++)
       ldsm4(frag_a[k % 2][i], &sh_a_stage[a_sh_rd_trans[k % b_sh_wr_iters][i]]);
     int4 *sh_b_stage = sh_b + b_sh_stage * pipe;
-    frag_b_quant[k % 2] = *reinterpret_cast<I4 *>(
-        &sh_b_stage[b_sh_rd_delta * (k % b_sh_wr_iters) + b_sh_rd]);
+
+#pragma unroll
+    for (int i = 0; i < b_thread_vecs; i++) {
+      frag_b_quant[k % 2][i] = *reinterpret_cast<I4 *>(
+          &sh_b_stage[b_sh_rd_delta * (k % b_sh_wr_iters) + b_sh_rd + i]);
+    }
   };
 
   bool is_same_group[stages];
   int same_group_id[stages];
 
   auto init_same_group = [&](int pipe) {
+    if constexpr (!has_act_order) {
+      is_same_group[pipe] = false;
+      same_group_id[pipe] = 0;
+      return;
+    }
+
     int4 *sh_g_idx_stage = sh_g_idx + g_idx_stage * pipe;
     int *sh_g_idx_int_ptr = reinterpret_cast<int *>(sh_g_idx_stage);
 
@@ -767,10 +828,23 @@ Marlin(const int4 *__restrict__ A, // fp16 input matrix of shape mxk
 // dequantization and matmul operations.
 #pragma unroll
     for (int j = 0; j < 4; j++) {
-      int b_quant = frag_b_quant[k % 2][j];
-      int b_quant_shift = b_quant >> 8;
+      FragB frag_b0;
+      FragB frag_b1;
+      if constexpr (num_bits == 4) {
+        int b_quant = frag_b_quant[k % 2][0][j];
+        int b_quant_shift = b_quant >> 8;
+
+        frag_b0 = dequant_4bit(b_quant);
+        frag_b1 = dequant_4bit(b_quant_shift);
 
-      FragB frag_b0 = dequant(b_quant);
+      } else {
+        int *frag_b_quant_ptr = reinterpret_cast<int *>(frag_b_quant[k % 2]);
+        int b_quant_0 = frag_b_quant_ptr[j * 2 + 0];
+        int b_quant_1 = frag_b_quant_ptr[j * 2 + 1];
+
+        frag_b0 = dequant_8bit(b_quant_0);
+        frag_b1 = dequant_8bit(b_quant_1);
+      }
 
       // Apply scale to frag_b0
       if constexpr (has_act_order) {
@@ -782,8 +856,6 @@ Marlin(const int4 *__restrict__ A, // fp16 input matrix of shape mxk
         }
       }
 
-      FragB frag_b1 = dequant(b_quant_shift);
-
       // Apply scale to frag_b1
       if constexpr (has_act_order) {
         scale4(frag_b1, act_frag_s[k % 2][0][j], act_frag_s[k % 2][1][j],
@@ -808,13 +880,13 @@ Marlin(const int4 *__restrict__ A, // fp16 input matrix of shape mxk
   // multiple warps that accumulate their partial sums of the same output
   // location; which we have to reduce over in the end. We do in shared memory.
   auto thread_block_reduce = [&]() {
-    constexpr int red_off = threads / b_sh_stride / 2;
+    constexpr int red_off = threads / b_sh_stride_threads / 2;
     if (red_off >= 1) {
-      int red_idx = threadIdx.x / b_sh_stride;
-      constexpr int red_sh_stride = b_sh_stride * 4 * 2;
-      constexpr int red_sh_delta = b_sh_stride;
-      int red_sh_rd = red_sh_stride * (threadIdx.x / b_sh_stride) +
-                      (threadIdx.x % b_sh_stride);
+      int red_idx = threadIdx.x / b_sh_stride_threads;
+      constexpr int red_sh_stride = b_sh_stride_threads * 4 * 2;
+      constexpr int red_sh_delta = b_sh_stride_threads;
+      int red_sh_rd = red_sh_stride * (threadIdx.x / b_sh_stride_threads) +
+                      (threadIdx.x % b_sh_stride_threads);
 
       // Parallel logarithmic shared memory reduction. We make sure to avoid any
       // unnecessary read or write iterations, e.g., for two warps we write only
@@ -861,7 +933,7 @@ Marlin(const int4 *__restrict__ A, // fp16 input matrix of shape mxk
   };
 
   // Since multiple threadblocks may process parts of the same column slice, we
-  // finally have to globally reduce over the results. As the striped portioning
+  // finally have to globally reduce over the results. As the striped partitioning
   // minimizes the number of such reductions and our outputs are usually rather
   // small, we perform this reduction serially in L2 cache.
   auto global_reduce = [&](bool first = false, bool last = false) {
@@ -951,13 +1023,15 @@ Marlin(const int4 *__restrict__ A, // fp16 input matrix of shape mxk
     auto write = [&](int idx, float c0, float c1, FragS &s) {
       half2 res = __halves2half2(__float2half(c0), __float2half(c1));
 
-      // For per-column quantization we finally apply the scale here
-      if constexpr (!has_act_order && group_blocks == -1) {
+      // For per-column quantization we finally apply the scale here (only for
+      // 4-bit)
+      if constexpr (!has_act_order && group_blocks == -1 && num_bits == 4) {
         res = __hmul2(res, s[0]);
       }
 
       ((half2 *)sh)[idx] = res;
     };
+
     if (threadIdx.x / 32 < thread_n_blocks / 4) {
 #pragma unroll
       for (int i = 0; i < thread_m_blocks; i++) {
@@ -1023,6 +1097,7 @@ Marlin(const int4 *__restrict__ A, // fp16 input matrix of shape mxk
     // ensure all shared memory accesses are static. Note that both pipelines
     // have even length meaning that the next iteration will always start at
     // index 0.
+
 #pragma unroll
     for (int pipe = 0; pipe < stages;) {
 #pragma unroll
@@ -1070,23 +1145,63 @@ Marlin(const int4 *__restrict__ A, // fp16 input matrix of shape mxk
       // For per-column scales, we only fetch them here in the final step before
       // write-out
       if constexpr (!has_act_order && group_blocks == -1) {
-        if (last) {
+        if constexpr (num_bits == 8) {
           if (s_sh_wr_pred) {
-            cp_async4_stream(&sh_s[s_sh_wr], &scales_ptr[s_gl_rd]);
+            cp_async4(&sh_s[s_sh_wr], &scales_ptr[s_gl_rd]);
           }
           cp_async_fence();
+        } else {
+          if (last) {
+            if (s_sh_wr_pred) {
+              cp_async4(&sh_s[s_sh_wr], &scales_ptr[s_gl_rd]);
+            }
+            cp_async_fence();
+          }
         }
       }
 
       thread_block_reduce();
       if constexpr (!has_act_order && group_blocks == -1) {
-        if (last) {
+        if constexpr (num_bits == 8) {
           cp_async_wait<0>();
           __syncthreads();
           if (threadIdx.x / 32 < thread_n_blocks / 4) {
             reinterpret_cast<int4 *>(&frag_s)[0] = sh_s[s_sh_rd + 0];
             reinterpret_cast<int4 *>(&frag_s)[1] = sh_s[s_sh_rd + 4];
           }
+
+        } else {
+          if (last) {
+            cp_async_wait<0>();
+            __syncthreads();
+            if (threadIdx.x / 32 < thread_n_blocks / 4) {
+              reinterpret_cast<int4 *>(&frag_s)[0] = sh_s[s_sh_rd + 0];
+              reinterpret_cast<int4 *>(&frag_s)[1] = sh_s[s_sh_rd + 4];
+            }
+          }
+        }
+      }
+
+      // For 8-bit channelwise, we apply the scale before the global reduction
+      // that converts the fp32 results to fp16 (so that we avoid possible
+      // overflow in fp16)
+      if constexpr (!has_act_order && group_blocks == -1 && num_bits == 8) {
+        if (threadIdx.x / 32 < thread_n_blocks / 4) {
+#pragma unroll
+          for (int i = 0; i < thread_m_blocks; i++) {
+#pragma unroll
+            for (int j = 0; j < 4; j++) {
+              scale_float(reinterpret_cast<float *>(&frag_c[i][j][0][0]),
+                          frag_s[j / 2][2 * (j % 2) + 0]);
+              scale_float(reinterpret_cast<float *>(&frag_c[i][j][0][2]),
+                          frag_s[j / 2][2 * (j % 2) + 0]);
+
+              scale_float(reinterpret_cast<float *>(&frag_c[i][j][1][0]),
+                          frag_s[j / 2][2 * (j % 2) + 1]);
+              scale_float(reinterpret_cast<float *>(&frag_c[i][j][1][2]),
+                          frag_s[j / 2][2 * (j % 2) + 1]);
+            }
+          }
         }
       }
 
@@ -1125,28 +1240,25 @@ Marlin(const int4 *__restrict__ A, // fp16 input matrix of shape mxk
           s_gl_rd = s_sh_stride * slice_col + threadIdx.x;
         }
 
-        // if (blockIdx.x == 0 && threadIdx.x == 0) {
-        //   printf("Move\n");
-        // }
         start_pipes();
       }
     }
   }
 }
 
-#define __CALL_IF(THREAD_M_BLOCKS, THREAD_N_BLOCKS, THREAD_K_BLOCKS,           \
+#define __CALL_IF(NUM_BITS, THREAD_M_BLOCKS, THREAD_N_BLOCKS, THREAD_K_BLOCKS, \
                   HAS_ACT_ORDER, GROUP_BLOCKS, NUM_THREADS)                    \
-  else if (thread_m_blocks == THREAD_M_BLOCKS &&                               \
+  else if (num_bits == NUM_BITS && thread_m_blocks == THREAD_M_BLOCKS &&       \
            thread_n_blocks == THREAD_N_BLOCKS &&                               \
            thread_k_blocks == THREAD_K_BLOCKS &&                               \
            has_act_order == HAS_ACT_ORDER && group_blocks == GROUP_BLOCKS &&   \
            num_threads == NUM_THREADS) {                                       \
     cudaFuncSetAttribute(                                                      \
-        Marlin<NUM_THREADS, THREAD_M_BLOCKS, THREAD_N_BLOCKS, THREAD_K_BLOCKS, \
-               pipe_stages, HAS_ACT_ORDER, GROUP_BLOCKS>,                      \
+        Marlin<NUM_BITS, NUM_THREADS, THREAD_M_BLOCKS, THREAD_N_BLOCKS,        \
+               THREAD_K_BLOCKS, pipe_stages, HAS_ACT_ORDER, GROUP_BLOCKS>,     \
         cudaFuncAttributeMaxDynamicSharedMemorySize, max_shared_mem);          \
-    Marlin<NUM_THREADS, THREAD_M_BLOCKS, THREAD_N_BLOCKS, THREAD_K_BLOCKS,     \
-           pipe_stages, HAS_ACT_ORDER, GROUP_BLOCKS>                           \
+    Marlin<NUM_BITS, NUM_THREADS, THREAD_M_BLOCKS, THREAD_N_BLOCKS,            \
+           THREAD_K_BLOCKS, pipe_stages, HAS_ACT_ORDER, GROUP_BLOCKS>          \
         <<<blocks, NUM_THREADS, max_shared_mem, stream>>>(                     \
             A_ptr, B_ptr, C_ptr, s_ptr, g_idx_ptr, num_groups, prob_m, prob_n, \
             prob_k, locks);                                                    \
@@ -1158,28 +1270,92 @@ typedef struct {
   int num_threads;
 } thread_config_t;
 
-thread_config_t small_batch_thread_configs[] = {
+typedef struct {
+  int max_m_blocks;       // cap on 16-row M blocks handled per invocation
+  thread_config_t tb_cfg; // thread_k / thread_n / num_threads choice
+} exec_config_t;
+
+thread_config_t thread_configs[] = {
     // Ordered by priority
 
     // thread_k, thread_n, num_threads
-    {128, 128, 256}, // Default
-    {128, 64, 128},  // Reduce N 2X, same K
-    {64, 256, 256},  // Reduce K 2X, increase N 2X
-    {64, 128, 128},  // Reduce K 2X, same N
+    {64, 256, 256}, // Default (max cache usage)
+    {64, 128, 128}, // Reduce N, reduce warps
+    {128, 64, 128}, // Reduce N more, but increase K
+
 };
 
-thread_config_t large_batch_thread_configs[] = {
-    // Ordered by priority
+int get_scales_cache_size(thread_config_t const &th_config, int prob_m,
+                          int prob_n, int prob_k, int num_bits, int group_size,
+                          bool has_act_order, bool is_k_full) {
+  bool cache_scales_chunk = has_act_order && !is_k_full;
 
-    // thread_k, thread_n, num_threads
-    {64, 256, 256}, // Default
-    {128, 64, 128}, // Reduce N 2X, same K
-    {64, 128, 128}, // Reduce N 2X, same K
-                    // {128, 64, 128},  // Reduce N 4X, increase K 2X
-};
+  int tb_n = th_config.thread_n;
+  int tb_k = th_config.thread_k;
+  // NOTE: prob_m, prob_n, prob_k and num_bits are not read by this function
+  // Get max scale groups per thread-block
+  int tb_groups;
+  if (group_size == -1) {
+    tb_groups = 1;
+  } else if (group_size == 0) {
+    tb_groups = div_ceil(tb_k, 32); // Worst case is 32 group size
+  } else {
+    tb_groups = div_ceil(tb_k, group_size);
+  }
+
+  if (cache_scales_chunk) {
+    int load_groups =
+        tb_groups * pipe_stages * 2;    // Chunk size is 2x pipeline over dim K
+    load_groups = max(load_groups, 32); // We load at least 32 scale groups
+    return load_groups * tb_n * 2; // x2: presumably bytes per fp16 scale
+
+  } else {
+    int tb_scales = tb_groups * tb_n * 2; // x2: presumably bytes per fp16 scale
+
+    return tb_scales * pipe_stages;
+  }
+}
+
+bool is_valid_cache_size(thread_config_t const &th_config, int max_m_blocks,
+                         int prob_m, int prob_n, int prob_k, int num_bits,
+                         int scales_cache_size, int max_shared_mem) {
+  int pack_factor = 32 / num_bits; // quantized values per 32-bit word
+  // True if all A+B pipeline stages fit in shared memory left after scales
+  // Get B size
+  int tb_k = th_config.thread_k;
+  int tb_n = th_config.thread_n;
+
+  int b_size = (tb_k * tb_n / pack_factor) * 4; // 4 bytes per packed word
+
+  // Get A size
+  int m_blocks = div_ceil(prob_m, 16);
+  int tb_max_m = 16;
 
-bool is_valid_config(thread_config_t const &th_config, int prob_m, int prob_n,
-                     int prob_k) {
+  while (true) { // ends with tb_max_m = 16 * min(m_blocks, max_m_blocks)
+    if (m_blocks >= max_m_blocks) {
+      tb_max_m *= max_m_blocks;
+      break;
+    }
+
+    max_m_blocks--;
+    if (max_m_blocks == 0) {
+      TORCH_CHECK(false, "Unexpected m_blocks = ", m_blocks);
+    }
+  }
+
+  int a_size = (tb_max_m * tb_k) * 2; // x2: presumably bytes per fp16 element
+
+  float pipe_size = (a_size + b_size) * pipe_stages;
+
+  TORCH_CHECK(max_shared_mem / 2 > scales_cache_size); // Sanity
+
+  return pipe_size < 0.95f * (max_shared_mem - scales_cache_size); // 5% slack
+}
+
+bool is_valid_config(thread_config_t const &th_config, int max_m_blocks,
+                     int prob_m, int prob_n, int prob_k, int num_bits,
+                     int group_size, bool has_act_order, bool is_k_full,
+                     int max_shared_mem) {
   // Sanity
   if (th_config.thread_k == -1 || th_config.thread_n == -1 ||
       th_config.num_threads == -1) {
@@ -1201,62 +1377,79 @@ bool is_valid_config(thread_config_t const &th_config, int prob_m, int prob_n,
     return false;
   }
 
+  //  Determine cache for scales
+  int scales_cache_size =
+      get_scales_cache_size(th_config, prob_m, prob_n, prob_k, num_bits,
+                            group_size, has_act_order, is_k_full);
+
+  // Check that pipeline fits into cache
+  if (!is_valid_cache_size(th_config, max_m_blocks, prob_m, prob_n, prob_k,
+                           num_bits, scales_cache_size, max_shared_mem)) {
+    return false;
+  }
+
   return true;
 }
 
-thread_config_t determine_thread_config(int prob_m, int prob_n, int prob_k) {
-
-  // TODO: Enable if needed after some more testing
-  if (prob_m <= 0) {
-    for (auto th_config : small_batch_thread_configs) {
-      if (is_valid_config(th_config, prob_m, prob_n, prob_k)) {
-        return th_config;
+exec_config_t determine_thread_config(int prob_m, int prob_n, int prob_k,
+                                      int num_bits, int group_size,
+                                      bool has_act_order, bool is_k_full,
+                                      int max_shared_mem) {
+  int max_m_blocks = 4;
+  while (max_m_blocks > 0) {
+    for (auto th_config : thread_configs) {
+      if (is_valid_config(th_config, max_m_blocks, prob_m, prob_n, prob_k,
+                          num_bits, group_size, has_act_order, is_k_full,
+                          max_shared_mem)) {
+        return exec_config_t{max_m_blocks, th_config};
       }
     }
 
-  } else {
-    for (auto th_config : large_batch_thread_configs) {
-      if (is_valid_config(th_config, prob_m, prob_n, prob_k)) {
-        return th_config;
-      }
-    }
+    printf("WARNING: Marlin kernel is reducing max_m_blocks due to small SM "
+           "GPU cache. This may "
+           "hurt performance. Consider upgrading your GPU.\n");
+
+    max_m_blocks--; // Process less M blocks per invocation to reduce cache
+                    // usage
   }
 
-  return thread_config_t{-1, -1, -1};
+  return exec_config_t{0, {-1, -1, -1}};
 }
 
-#define CALL_IF(N_BLOCKS, K_BLOCKS, NUM_THREADS)                               \
-  __CALL_IF(1, N_BLOCKS, K_BLOCKS, true, 0, NUM_THREADS)                       \
-  __CALL_IF(2, N_BLOCKS, K_BLOCKS, true, 0, NUM_THREADS)                       \
-  __CALL_IF(3, N_BLOCKS, K_BLOCKS, true, 0, NUM_THREADS)                       \
-  __CALL_IF(4, N_BLOCKS, K_BLOCKS, true, 0, NUM_THREADS)                       \
+#define CALL_IF(NUM_BITS, N_BLOCKS, K_BLOCKS, NUM_THREADS)                     \
+  __CALL_IF(NUM_BITS, 1, N_BLOCKS, K_BLOCKS, true, 0, NUM_THREADS)             \
+  __CALL_IF(NUM_BITS, 2, N_BLOCKS, K_BLOCKS, true, 0, NUM_THREADS)             \
+  __CALL_IF(NUM_BITS, 3, N_BLOCKS, K_BLOCKS, true, 0, NUM_THREADS)             \
+  __CALL_IF(NUM_BITS, 4, N_BLOCKS, K_BLOCKS, true, 0, NUM_THREADS)             \
                                                                                \
-  __CALL_IF(1, N_BLOCKS, K_BLOCKS, false, -1, NUM_THREADS)                     \
-  __CALL_IF(1, N_BLOCKS, K_BLOCKS, false, 2, NUM_THREADS)                      \
-  __CALL_IF(1, N_BLOCKS, K_BLOCKS, false, 4, NUM_THREADS)                      \
-  __CALL_IF(1, N_BLOCKS, K_BLOCKS, false, 8, NUM_THREADS)                      \
+  __CALL_IF(NUM_BITS, 1, N_BLOCKS, K_BLOCKS, false, -1, NUM_THREADS)           \
+  __CALL_IF(NUM_BITS, 1, N_BLOCKS, K_BLOCKS, false, 2, NUM_THREADS)            \
+  __CALL_IF(NUM_BITS, 1, N_BLOCKS, K_BLOCKS, false, 4, NUM_THREADS)            \
+  __CALL_IF(NUM_BITS, 1, N_BLOCKS, K_BLOCKS, false, 8, NUM_THREADS)            \
                                                                                \
-  __CALL_IF(2, N_BLOCKS, K_BLOCKS, false, -1, NUM_THREADS)                     \
-  __CALL_IF(2, N_BLOCKS, K_BLOCKS, false, 2, NUM_THREADS)                      \
-  __CALL_IF(2, N_BLOCKS, K_BLOCKS, false, 4, NUM_THREADS)                      \
-  __CALL_IF(2, N_BLOCKS, K_BLOCKS, false, 8, NUM_THREADS)                      \
+  __CALL_IF(NUM_BITS, 2, N_BLOCKS, K_BLOCKS, false, -1, NUM_THREADS)           \
+  __CALL_IF(NUM_BITS, 2, N_BLOCKS, K_BLOCKS, false, 2, NUM_THREADS)            \
+  __CALL_IF(NUM_BITS, 2, N_BLOCKS, K_BLOCKS, false, 4, NUM_THREADS)            \
+  __CALL_IF(NUM_BITS, 2, N_BLOCKS, K_BLOCKS, false, 8, NUM_THREADS)            \
                                                                                \
-  __CALL_IF(3, N_BLOCKS, K_BLOCKS, false, -1, NUM_THREADS)                     \
-  __CALL_IF(3, N_BLOCKS, K_BLOCKS, false, 2, NUM_THREADS)                      \
-  __CALL_IF(3, N_BLOCKS, K_BLOCKS, false, 4, NUM_THREADS)                      \
-  __CALL_IF(3, N_BLOCKS, K_BLOCKS, false, 8, NUM_THREADS)                      \
+  __CALL_IF(NUM_BITS, 3, N_BLOCKS, K_BLOCKS, false, -1, NUM_THREADS)           \
+  __CALL_IF(NUM_BITS, 3, N_BLOCKS, K_BLOCKS, false, 2, NUM_THREADS)            \
+  __CALL_IF(NUM_BITS, 3, N_BLOCKS, K_BLOCKS, false, 4, NUM_THREADS)            \
+  __CALL_IF(NUM_BITS, 3, N_BLOCKS, K_BLOCKS, false, 8, NUM_THREADS)            \
                                                                                \
-  __CALL_IF(4, N_BLOCKS, K_BLOCKS, false, -1, NUM_THREADS)                     \
-  __CALL_IF(4, N_BLOCKS, K_BLOCKS, false, 2, NUM_THREADS)                      \
-  __CALL_IF(4, N_BLOCKS, K_BLOCKS, false, 4, NUM_THREADS)                      \
-  __CALL_IF(4, N_BLOCKS, K_BLOCKS, false, 8, NUM_THREADS)
-
-void marlin_cuda(const void *A, const void *B, void *C, void *s, void *g_idx,
-                 void *perm, void *a_tmp, int prob_m, int prob_n, int prob_k,
-                 void *workspace, bool has_act_order, bool is_k_full,
-                 int num_groups, int group_size, int dev = 0,
-                 cudaStream_t stream = 0, int thread_k = -1, int thread_n = -1,
-                 int sms = -1, int max_par = 16) {
+  __CALL_IF(NUM_BITS, 4, N_BLOCKS, K_BLOCKS, false, -1, NUM_THREADS)           \
+  __CALL_IF(NUM_BITS, 4, N_BLOCKS, K_BLOCKS, false, 2, NUM_THREADS)            \
+  __CALL_IF(NUM_BITS, 4, N_BLOCKS, K_BLOCKS, false, 4, NUM_THREADS)            \
+  __CALL_IF(NUM_BITS, 4, N_BLOCKS, K_BLOCKS, false, 8, NUM_THREADS)
+
+void marlin_mm_f16i4(const void *A, const void *B, void *C, void *s,
+                     void *g_idx, void *perm, void *a_tmp, int prob_m,
+                     int prob_n, int prob_k, void *workspace, int num_bits,
+                     bool has_act_order, bool is_k_full, int num_groups,
+                     int group_size, int dev, cudaStream_t stream, int thread_k,
+                     int thread_n, int sms, int max_par) {
+  TORCH_CHECK(num_bits == 4 || num_bits == 8,
+              "num_bits must be 4 or 8. Got = ", num_bits);
   TORCH_CHECK(prob_m > 0 && prob_n > 0 && prob_k > 0, "Invalid MNK = [", prob_m,
               ", ", prob_n, ", ", prob_k, "]");
 
@@ -1274,25 +1467,34 @@ void marlin_cuda(const void *A, const void *B, void *C, void *s, void *g_idx,
   TORCH_CHECK(max_shared_mem > 0);
 
   // Set thread config
-  thread_config_t th_config;
+  exec_config_t exec_cfg;
   if (thread_k != -1 && thread_n != -1) {
     // User-defined config
-    th_config = thread_config_t{thread_k, thread_n, default_threads};
+    exec_cfg =
+        exec_config_t{4, thread_config_t{thread_k, thread_n, default_threads}};
   } else {
     // Auto config
-    th_config = determine_thread_config(prob_m, prob_n, prob_k);
+    exec_cfg =
+        determine_thread_config(prob_m, prob_n, prob_k, num_bits, group_size,
+                                has_act_order, is_k_full, max_shared_mem);
   }
 
-  TORCH_CHECK(is_valid_config(th_config, prob_m, prob_n, prob_k),
-              "Invalid thread config: thread_k = " + str(th_config.thread_k) +
-                  ", thread_n = " + str(th_config.thread_n) +
-                  ", num_threads = " + str(th_config.num_threads) +
-                  " for MKN = [" + str(prob_m) + ", " + str(prob_k) + ", " +
-                  str(prob_n) + "]");
-
-  int num_threads = th_config.num_threads;
-  thread_k = th_config.thread_k;
-  thread_n = th_config.thread_n;
+  TORCH_CHECK(exec_cfg.max_m_blocks > 0 &&
+                  is_valid_config(exec_cfg.tb_cfg, exec_cfg.max_m_blocks,
+                                  prob_m, prob_n, prob_k, num_bits, group_size,
+                                  has_act_order, is_k_full, max_shared_mem),
+              "Invalid thread config: max_m_blocks = ", exec_cfg.max_m_blocks,
+              ", thread_k = ", exec_cfg.tb_cfg.thread_k,
+              ", thread_n = ", exec_cfg.tb_cfg.thread_n,
+              ", num_threads = ", exec_cfg.tb_cfg.num_threads, " for MKN = [",
+              prob_m, ", ", prob_k, ", ", prob_n, "] and num_bits = ", num_bits,
+              ", group_size = ", group_size,
+              ", has_act_order = ", has_act_order, ", is_k_full = ", is_k_full,
+              ", max_shared_mem = ", max_shared_mem);
+
+  int num_threads = exec_cfg.tb_cfg.num_threads;
+  thread_k = exec_cfg.tb_cfg.thread_k;
+  thread_n = exec_cfg.tb_cfg.thread_n;
 
   int thread_k_blocks = thread_k / 16;
   int thread_n_blocks = thread_n / 16;
@@ -1352,28 +1554,32 @@ void marlin_cuda(const void *A, const void *B, void *C, void *s, void *g_idx,
   }
 
   // Main loop
-  for (int i = 0; i < tot_m_blocks; i += 4) {
+  for (int i = 0; i < tot_m_blocks; i += exec_cfg.max_m_blocks) {
     int thread_m_blocks = tot_m_blocks - i;
     prob_m = tot_m - 16 * i;
     int par = 1;
-    if (thread_m_blocks > 4) {
+    if (thread_m_blocks > exec_cfg.max_m_blocks) {
       // Note that parallel > 1 currently only works for inputs without any
       // padding
-      par = (16 * thread_m_blocks - pad) / 64;
+      par = (16 * thread_m_blocks - pad) / (16 * exec_cfg.max_m_blocks);
       if (par > max_par)
         par = max_par;
-      prob_m = 64 * par;
-      i += 4 * (par - 1);
-      thread_m_blocks = 4;
+      prob_m = (16 * exec_cfg.max_m_blocks) * par;
+      i += exec_cfg.max_m_blocks * (par - 1);
+      thread_m_blocks = exec_cfg.max_m_blocks;
     }
 
     // Define kernel configurations
     if (false) {
     }
-    CALL_IF(16, 4, 256)
-    CALL_IF(8, 8, 256)
-    CALL_IF(8, 4, 128)
-    CALL_IF(4, 8, 128)
+    CALL_IF(4, 32, 2, 256)
+    CALL_IF(4, 16, 4, 256)
+    CALL_IF(4, 8, 4, 128)
+    CALL_IF(4, 4, 8, 128)
+    CALL_IF(8, 32, 2, 256)
+    CALL_IF(8, 16, 4, 256)
+    CALL_IF(8, 8, 4, 128)
+    CALL_IF(8, 4, 8, 128)
     else {
       TORCH_CHECK(false, "Unsupported shapes: MNK = [" + str(prob_m) + ", " +
                              str(prob_n) + ", " + str(prob_k) + "]" +
@@ -1395,33 +1601,32 @@ void marlin_cuda(const void *A, const void *B, void *C, void *s, void *g_idx,
 torch::Tensor gptq_marlin_gemm(torch::Tensor &a, torch::Tensor &b_q_weight,
                                torch::Tensor &b_scales, torch::Tensor &g_idx,
                                torch::Tensor &perm, torch::Tensor &workspace,
-                               int64_t size_m, int64_t size_n, int64_t size_k,
-                               bool is_k_full) {
+                               int64_t num_bits, int64_t size_m, int64_t size_n,
+                               int64_t size_k, bool is_k_full) {
+  // Verify num_bits
+  TORCH_CHECK(num_bits == 4 || num_bits == 8,
+              "num_bits must be 4 or 8. Got = ", num_bits);
+  int pack_factor = 32 / num_bits;
+
   // Verify A
-  TORCH_CHECK(a.size(0) == size_m,
-              "Shape mismatch: a.size(0) = " + str(a.size(0)) +
-                  ", size_m = " + str(size_m));
-  TORCH_CHECK(a.size(1) == size_k,
-              "Shape mismatch: a.size(1) = " + str(a.size(1)) +
-                  ", size_k = " + str(size_k));
+  TORCH_CHECK(a.size(0) == size_m, "Shape mismatch: a.size(0) = ", a.size(0),
+              ", size_m = ", size_m);
+  TORCH_CHECK(a.size(1) == size_k, "Shape mismatch: a.size(1) = ", a.size(1),
+              ", size_k = ", size_k);
 
   // Verify B
-  TORCH_CHECK(size_k % gptq_marlin::tile_size == 0,
-              "size_k = " + str(size_k) + " is not divisible by tile_size = " +
-                  str(gptq_marlin::tile_size));
+  TORCH_CHECK(size_k % gptq_marlin::tile_size == 0, "size_k = ", size_k,
+              " is not divisible by tile_size = ", gptq_marlin::tile_size);
   TORCH_CHECK((size_k / gptq_marlin::tile_size) == b_q_weight.size(0),
-              "Shape mismatch: b_q_weight.size(0) = " +
-                  str(b_q_weight.size(0)) + ", size_k = " + str(size_k) +
-                  ", tile_size = " + str(gptq_marlin::tile_size));
-  TORCH_CHECK(
-      b_q_weight.size(1) % gptq_marlin::tile_size == 0,
-      "b_q_weight.size(1) = " + str(b_q_weight.size(1)) +
-          " is not divisible by tile_size = " + str(gptq_marlin::tile_size));
-  int actual_size_n = (b_q_weight.size(1) / gptq_marlin::tile_size) *
-                      gptq_marlin::pack_factor_4bit;
-  TORCH_CHECK(size_n == actual_size_n,
-              "size_n = " + str(size_n) +
-                  ", actual_size_n = " + str(actual_size_n));
+              "Shape mismatch: b_q_weight.size(0) = ", b_q_weight.size(0),
+              ", size_k = ", size_k, ", tile_size = ", gptq_marlin::tile_size);
+  TORCH_CHECK(b_q_weight.size(1) % gptq_marlin::tile_size == 0,
+              "b_q_weight.size(1) = ", b_q_weight.size(1),
+              " is not divisible by tile_size = ", gptq_marlin::tile_size);
+  int actual_size_n =
+      (b_q_weight.size(1) / gptq_marlin::tile_size) * pack_factor;
+  TORCH_CHECK(size_n == actual_size_n, "size_n = ", size_n,
+              ", actual_size_n = ", actual_size_n);
 
   // Verify device and strides
   TORCH_CHECK(a.device().is_cuda(), "A is not on GPU");
@@ -1457,9 +1662,9 @@ torch::Tensor gptq_marlin_gemm(torch::Tensor &a, torch::Tensor &b_q_weight,
   // Verify g_idx and perm
   TORCH_CHECK((g_idx.size(0) == 0 && perm.size(0) == 0) ||
                   (g_idx.size(0) == size_k && perm.size(0) == size_k),
-              "Unexpected g_idx.size(0) = " + str(g_idx.size(0)) +
-                  " and perm.size(0) = " + str(perm.size(0)) +
-                  ", where size_k = " + str(size_k));
+              "Unexpected g_idx.size(0) = ", g_idx.size(0),
+              " and perm.size(0) = ", perm.size(0),
+              ", where size_k = ", size_k);
 
   // Detect groupsize and act_order
   int num_groups = -1;
@@ -1475,9 +1680,8 @@ torch::Tensor gptq_marlin_gemm(torch::Tensor &a, torch::Tensor &b_q_weight,
   if (has_act_order) {
     if (is_k_full) {
       TORCH_CHECK(num_groups > 1, "For act_order, num_groups must be > 1");
-      TORCH_CHECK(size_k % num_groups == 0,
-                  "size_k = " + str(size_k) +
-                      ", is not divisible by num_groups = " + str(num_groups));
+      TORCH_CHECK(size_k % num_groups == 0, "size_k = ", size_k,
+                  ", is not divisible by num_groups = ", num_groups);
       group_size = size_k / num_groups;
     } else {
       group_size = 0;
@@ -1485,10 +1689,9 @@ torch::Tensor gptq_marlin_gemm(torch::Tensor &a, torch::Tensor &b_q_weight,
 
   } else {
     if (num_groups > 1) {
-      TORCH_CHECK(size_k % num_groups == 0,
-                  "size_k = " + str(size_k) +
-                      ", is not divisible by b_scales.size(0) = " +
-                      str(b_scales.size(0)));
+      TORCH_CHECK(
+          size_k % num_groups == 0, "size_k = ", size_k,
+          ", is not divisible by b_scales.size(0) = ", b_scales.size(0));
       group_size = size_k / num_groups;
     } else {
       group_size = -1;
@@ -1496,23 +1699,22 @@ torch::Tensor gptq_marlin_gemm(torch::Tensor &a, torch::Tensor &b_q_weight,
   }
 
   // Verify workspace size
-  TORCH_CHECK(size_n % gptq_marlin::min_thread_n == 0,
-              "size_n = " + str(size_n) +
-                  ", is not divisible by min_thread_n = " +
-                  str(gptq_marlin::min_thread_n));
+  TORCH_CHECK(
+      size_n % gptq_marlin::min_thread_n == 0, "size_n = ", size_n,
+      ", is not divisible by min_thread_n = ", gptq_marlin::min_thread_n);
   int min_workspace_size =
       (size_n / gptq_marlin::min_thread_n) * gptq_marlin::max_par;
   TORCH_CHECK(workspace.numel() >= min_workspace_size,
-              "workspace.numel = " + str(workspace.numel()) +
-                  " is below min_workspace_size = " + str(min_workspace_size));
+              "workspace.numel = ", workspace.numel(),
+              " is below min_workspace_size = ", min_workspace_size);
 
   int dev = a.get_device();
-  gptq_marlin::marlin_cuda(
+  gptq_marlin::marlin_mm_f16i4(
       a.data_ptr(), b_q_weight.data_ptr(), c.data_ptr(), b_scales.data_ptr(),
       g_idx.data_ptr(), perm.data_ptr(), a_tmp.data_ptr(), size_m, size_n,
-      size_k, workspace.data_ptr(), has_act_order, is_k_full, num_groups,
-      group_size, dev, at::cuda::getCurrentCUDAStream(dev), thread_k, thread_n,
-      sms, gptq_marlin::max_par);
+      size_k, workspace.data_ptr(), num_bits, has_act_order, is_k_full,
+      num_groups, group_size, dev, at::cuda::getCurrentCUDAStream(dev),
+      thread_k, thread_n, sms, gptq_marlin::max_par);
 
   return c;
 }
diff --git a/csrc/quantization/gptq_marlin/gptq_marlin.cuh b/csrc/quantization/gptq_marlin/gptq_marlin.cuh
index 8cfce6b2575d5..35ea48aaba310 100644
--- a/csrc/quantization/gptq_marlin/gptq_marlin.cuh
+++ b/csrc/quantization/gptq_marlin/gptq_marlin.cuh
@@ -24,8 +24,6 @@ static constexpr int min_thread_k = 64;
 static constexpr int tile_size = 16;
 static constexpr int max_par   = 16;
 
-static constexpr int pack_factor_4bit = 8; // We have 8 4-bit vals inside a 32 bit
-
 template <typename T, int n>
 struct Vec {
   T             elems[n];
@@ -51,13 +49,11 @@ __device__ inline void cp_async4_pred(void* smem_ptr, const void* glob_ptr, bool
                "r"(smem), "l"(glob_ptr), "n"(BYTES));
 }
 
-__device__ inline void cp_async4_stream(void* smem_ptr, const void* glob_ptr) {
+__device__ inline void cp_async4(void* smem_ptr, const void* glob_ptr) {
   const int BYTES = 16;
   uint32_t  smem  = static_cast<uint32_t>(__cvta_generic_to_shared(smem_ptr));
   asm volatile("{\n"
-               "   .reg .b64 p;\n"
-               "   createpolicy.fractional.L2::evict_first.b64 p, 1.0;"
-               "   cp.async.cg.shared.global.L2::cache_hint [%0], [%1], %2, p;\n"
+               "   cp.async.cg.shared.global [%0], [%1], %2;\n"
                "}\n" ::"r"(smem),
                "l"(glob_ptr), "n"(BYTES));
 }
diff --git a/csrc/quantization/gptq_marlin/gptq_marlin_repack.cu b/csrc/quantization/gptq_marlin/gptq_marlin_repack.cu
index fa45ce68a0c77..0d3da6240dbca 100644
--- a/csrc/quantization/gptq_marlin/gptq_marlin_repack.cu
+++ b/csrc/quantization/gptq_marlin/gptq_marlin_repack.cu
@@ -11,7 +11,7 @@ static constexpr int tile_n_size = tile_k_size * 4;
 
 #if defined(__CUDA_ARCH__) && __CUDA_ARCH__ < 800
 
-template <int const num_threads, bool const has_perm>
+template <int const num_threads, int const num_bits, bool const has_perm>
 __global__ void
 marlin_repack_kernel(uint32_t const *__restrict__ b_q_weight_ptr,
                      uint32_t const *__restrict__ perm_ptr,
@@ -20,7 +20,8 @@ marlin_repack_kernel(uint32_t const *__restrict__ b_q_weight_ptr,
 } // namespace gptq_marlin
 
 torch::Tensor gptq_marlin_repack(torch::Tensor &b_q_weight, torch::Tensor &perm,
-                                 int64_t size_k, int64_t size_n) {
+                                 int64_t size_k, int64_t size_n,
+                                 int64_t num_bits) {
   TORCH_CHECK_NOT_IMPLEMENTED(
       false, "marlin_repack_from_gptq(..) requires CUDA_ARCH >= 8.0");
   return torch::empty({1, 1});
@@ -28,11 +29,13 @@ torch::Tensor gptq_marlin_repack(torch::Tensor &b_q_weight, torch::Tensor &perm,
 
 #else
 
-template <int const num_threads, bool const has_perm>
+template <int const num_threads, int const num_bits, bool const has_perm>
 __global__ void
 marlin_repack_kernel(uint32_t const *__restrict__ b_q_weight_ptr,
                      uint32_t const *__restrict__ perm_ptr,
                      uint32_t *__restrict__ out_ptr, int size_k, int size_n) {
+  constexpr int pack_factor = 32 / num_bits;
+
   int k_tiles = size_k / tile_k_size;
   int n_tiles = size_n / tile_n_size;
   int block_k_tiles = div_ceil(k_tiles, gridDim.x);
@@ -64,9 +67,10 @@ marlin_repack_kernel(uint32_t const *__restrict__ b_q_weight_ptr,
     sh_pipe_ptr += perm_size;
   }
 
+  constexpr int tile_ints = tile_k_size / pack_factor;
+
   constexpr int stage_n_threads = tile_n_size / 4;
-  constexpr int stage_k_threads =
-      has_perm ? tile_k_size : tile_k_size / pack_factor_4bit;
+  constexpr int stage_k_threads = has_perm ? tile_k_size : tile_ints;
   constexpr int stage_size = stage_k_threads * stage_n_threads;
 
   auto load_perm_to_shared = [&](int k_tile_id) {
@@ -99,9 +103,9 @@ marlin_repack_kernel(uint32_t const *__restrict__ b_q_weight_ptr,
             reinterpret_cast<uint32_t const *>(sh_perm_ptr);
 
         int src_k = sh_perm_int_ptr[k_id];
-        int src_k_packed = src_k / pack_factor_4bit;
+        int src_k_packed = src_k / pack_factor;
 
-        cp_async4_stream(
+        cp_async4(
             &sh_ptr[k_id * stage_n_threads + n_id],
             reinterpret_cast<int4 const *>(&(
                 b_q_weight_ptr[src_k_packed * size_n + first_n + (n_id * 4)])));
@@ -113,12 +117,12 @@ marlin_repack_kernel(uint32_t const *__restrict__ b_q_weight_ptr,
         int n_id = threadIdx.x % stage_n_threads;
 
         int first_k = k_tile_id * tile_k_size;
-        int first_k_packed = first_k / pack_factor_4bit;
+        int first_k_packed = first_k / pack_factor;
 
-        cp_async4_stream(&sh_ptr[k_id * stage_n_threads + n_id],
-                         reinterpret_cast<int4 const *>(
-                             &(b_q_weight_ptr[(first_k_packed + k_id) * size_n +
-                                              first_n + (n_id * 4)])));
+        cp_async4(&sh_ptr[k_id * stage_n_threads + n_id],
+                  reinterpret_cast<int4 const *>(
+                      &(b_q_weight_ptr[(first_k_packed + k_id) * size_n +
+                                       first_n + (n_id * 4)])));
       }
     }
 
@@ -145,26 +149,27 @@ marlin_repack_kernel(uint32_t const *__restrict__ b_q_weight_ptr,
     int cur_n = warp_id * 16 + tc_col;
 
     constexpr int sh_stride = 64;
+    constexpr uint32_t mask = (1 << num_bits) - 1;
 
     int4 *sh_stage_ptr = sh_pipe_ptr + stage_size * pipe;
     uint32_t *sh_stage_int_ptr = reinterpret_cast<uint32_t *>(sh_stage_ptr);
 
     uint32_t *sh_perm_int_ptr = reinterpret_cast<uint32_t *>(sh_perm_ptr);
 
-    uint32_t vals[pack_factor_4bit];
+    uint32_t vals[8];
 
     if constexpr (has_perm) {
       for (int i = 0; i < 4; i++) {
         int k_idx = tc_row + tc_offsets[i];
 
         uint32_t src_k = sh_perm_int_ptr[k_idx];
-        uint32_t src_k_pos = src_k % pack_factor_4bit;
+        uint32_t src_k_pos = src_k % pack_factor;
 
         uint32_t b1_val = sh_stage_int_ptr[k_idx * sh_stride + cur_n];
-        uint32_t b1_cur_val = (b1_val >> (src_k_pos * 4)) & 0xf;
+        uint32_t b1_cur_val = (b1_val >> (src_k_pos * num_bits)) & mask;
 
         uint32_t b2_val = sh_stage_int_ptr[k_idx * sh_stride + cur_n + 8];
-        uint32_t b2_cur_val = (b2_val >> (src_k_pos * 4)) & 0xf;
+        uint32_t b2_cur_val = (b2_val >> (src_k_pos * num_bits)) & mask;
 
         vals[i] = b1_cur_val;
         vals[4 + i] = b2_cur_val;
@@ -172,41 +177,56 @@ marlin_repack_kernel(uint32_t const *__restrict__ b_q_weight_ptr,
 
     } else {
 
-      uint32_t b1_val_1 = sh_stage_int_ptr[cur_n];
-      uint32_t b1_val_2 = sh_stage_int_ptr[sh_stride + cur_n];
-
-      uint32_t b2_val_1 = sh_stage_int_ptr[cur_n + 8];
-      uint32_t b2_val_2 = sh_stage_int_ptr[sh_stride + cur_n + 8];
+      uint32_t b1_vals[tile_ints];
+      uint32_t b2_vals[tile_ints];
 
 #pragma unroll
-      for (int i = 0; i < 2; i++) {
-        int cur_elem = tc_row + tc_offsets[i];
-        vals[i] = (b1_val_1 >> (cur_elem * 4)) & 0xf;
-        vals[4 + i] = (b2_val_1 >> (cur_elem * 4)) & 0xf;
+      for (int i = 0; i < tile_ints; i++) {
+        b1_vals[i] = sh_stage_int_ptr[cur_n + sh_stride * i];
+        b2_vals[i] = sh_stage_int_ptr[cur_n + 8 + sh_stride * i];
       }
 
 #pragma unroll
-      for (int i = 2; i < 4; i++) {
-        int cur_elem = tc_row + tc_offsets[i] - 8;
-        vals[i] = (b1_val_2 >> (cur_elem * 4)) & 0xf;
-        vals[4 + i] = (b2_val_2 >> (cur_elem * 4)) & 0xf;
+      for (int i = 0; i < 4; i++) {
+        int cur_elem = tc_row + tc_offsets[i];
+        int cur_int = cur_elem / pack_factor;
+        int cur_pos = cur_elem % pack_factor;
+
+        vals[i] = (b1_vals[cur_int] >> (cur_pos * num_bits)) & mask;
+        vals[4 + i] = (b2_vals[cur_int] >> (cur_pos * num_bits)) & mask;
       }
     }
 
+    constexpr int tile_size = tile_k_size * tile_n_size / pack_factor;
+    int out_offset = (k_tile_id * n_tiles + n_tile_id) * tile_size;
+
     // Result of:
     // https://github.com/NVIDIA/FasterTransformer/blob/main/src/fastertransformer/cutlass_extensions/include/cutlass_extensions/interleaved_numeric_conversion.h
-    constexpr int pack_idx[pack_factor_4bit] = {0, 2, 4, 6, 1, 3, 5, 7};
+    if constexpr (num_bits == 4) {
+      constexpr int pack_idx[8] = {0, 2, 4, 6, 1, 3, 5, 7};
 
-    uint32_t res = 0;
+      uint32_t res = 0;
 #pragma unroll
-    for (int i = 0; i < pack_factor_4bit; i++) {
-      res |= vals[pack_idx[i]] << (i * 4);
-    }
+      for (int i = 0; i < 8; i++) {
+        res |= vals[pack_idx[i]] << (i * 4);
+      }
 
-    constexpr int tile_size = tile_k_size * tile_n_size / pack_factor_4bit;
-    int out_offset = (k_tile_id * n_tiles + n_tile_id) * tile_size;
+      out_ptr[out_offset + th_id * 4 + warp_id] = res;
 
-    out_ptr[out_offset + th_id * 4 + warp_id] = res;
+    } else {
+      constexpr int pack_idx[4] = {0, 2, 1, 3};
+
+      uint32_t res1 = 0;
+      uint32_t res2 = 0;
+#pragma unroll
+      for (int i = 0; i < 4; i++) {
+        res1 |= vals[pack_idx[i]] << (i * 8);
+        res2 |= vals[4 + pack_idx[i]] << (i * 8);
+      }
+
+      out_ptr[out_offset + th_id * 8 + (warp_id * 2) + 0] = res1;
+      out_ptr[out_offset + th_id * 8 + (warp_id * 2) + 1] = res2;
+    }
   };
 
   auto start_pipes = [&](int k_tile_id, int n_tile_id) {
@@ -242,19 +262,35 @@ marlin_repack_kernel(uint32_t const *__restrict__ b_q_weight_ptr,
 
 } // namespace gptq_marlin
 
+#define CALL_IF(NUM_BITS, HAS_PERM)                                            \
+  else if (num_bits == NUM_BITS && has_perm == HAS_PERM) {                     \
+    cudaFuncSetAttribute(                                                      \
+        gptq_marlin::marlin_repack_kernel<gptq_marlin::repack_threads,         \
+                                          NUM_BITS, HAS_PERM>,                 \
+        cudaFuncAttributeMaxDynamicSharedMemorySize, max_shared_mem);          \
+    gptq_marlin::marlin_repack_kernel<gptq_marlin::repack_threads, NUM_BITS,   \
+                                      HAS_PERM>                                \
+        <<<blocks, gptq_marlin::repack_threads, max_shared_mem, stream>>>(     \
+            b_q_weight_ptr, perm_ptr, out_ptr, size_k, size_n);                \
+  }
+
 torch::Tensor gptq_marlin_repack(torch::Tensor &b_q_weight, torch::Tensor &perm,
-                                 int64_t size_k, int64_t size_n) {
+                                 int64_t size_k, int64_t size_n,
+                                 int64_t num_bits) {
   // Verify compatibility with marlin tile of 16x64
   TORCH_CHECK(size_k % gptq_marlin::tile_k_size == 0, "size_k = ", size_k,
               " is not divisible by tile_k_size = ", gptq_marlin::tile_k_size);
   TORCH_CHECK(size_n % gptq_marlin::tile_n_size == 0, "size_n = ", size_n,
               " is not divisible by tile_n_size = ", gptq_marlin::tile_n_size);
 
+  TORCH_CHECK(num_bits == 4 || num_bits == 8,
+              "num_bits must be 4 or 8. Got = ", num_bits);
+  int const pack_factor = 32 / num_bits;
+
   // Verify B
-  TORCH_CHECK((size_k / gptq_marlin::pack_factor_4bit) == b_q_weight.size(0),
+  TORCH_CHECK((size_k / pack_factor) == b_q_weight.size(0),
               "Shape mismatch: b_q_weight.size(0) = ", b_q_weight.size(0),
-              ", size_k = ", size_k,
-              ", pack_factor_4bit = ", gptq_marlin::pack_factor_4bit);
+              ", size_k = ", size_k, ", pack_factor = ", pack_factor);
   TORCH_CHECK(b_q_weight.size(1) == size_n,
               "b_q_weight.size(1) = ", b_q_weight.size(1),
               " is not size_n = ", size_n);
@@ -273,10 +309,10 @@ torch::Tensor gptq_marlin_repack(torch::Tensor &b_q_weight, torch::Tensor &perm,
   auto options = torch::TensorOptions()
                      .dtype(b_q_weight.dtype())
                      .device(b_q_weight.device());
-  torch::Tensor out = torch::empty(
-      {size_k / gptq_marlin::tile_size,
-       size_n * gptq_marlin::tile_size / gptq_marlin::pack_factor_4bit},
-      options);
+  torch::Tensor out =
+      torch::empty({size_k / gptq_marlin::tile_size,
+                    size_n * gptq_marlin::tile_size / pack_factor},
+                   options);
 
   // Detect if there is act_order
   bool has_perm = perm.size(0) != 0;
@@ -299,23 +335,15 @@ torch::Tensor gptq_marlin_repack(torch::Tensor &b_q_weight, torch::Tensor &perm,
                          cudaDevAttrMaxSharedMemoryPerBlockOptin, dev);
   TORCH_CHECK(max_shared_mem > 0);
 
-  if (has_perm) {
-    cudaFuncSetAttribute(
-        gptq_marlin::marlin_repack_kernel<gptq_marlin::repack_threads, true>,
-        cudaFuncAttributeMaxDynamicSharedMemorySize,
-        max_shared_mem);
-    gptq_marlin::marlin_repack_kernel<gptq_marlin::repack_threads, true>
-        <<<blocks, gptq_marlin::repack_threads, max_shared_mem,
-           stream>>>(b_q_weight_ptr, perm_ptr, out_ptr, size_k, size_n);
-
-  } else {
-    cudaFuncSetAttribute(
-        gptq_marlin::marlin_repack_kernel<gptq_marlin::repack_threads, false>,
-        cudaFuncAttributeMaxDynamicSharedMemorySize,
-        max_shared_mem);
-    gptq_marlin::marlin_repack_kernel<gptq_marlin::repack_threads, false>
-        <<<blocks, gptq_marlin::repack_threads, max_shared_mem,
-           stream>>>(b_q_weight_ptr, perm_ptr, out_ptr, size_k, size_n);
+  if (false) {
+  }
+  CALL_IF(4, false)
+  CALL_IF(4, true)
+  CALL_IF(8, false)
+  CALL_IF(8, true)
+  else {
+    TORCH_CHECK(false, "Unsupported repack config: num_bits = ", num_bits,
+                ", has_perm = ", has_perm);
   }
 
   return out;
diff --git a/tests/models/test_gptq_marlin.py b/tests/models/test_gptq_marlin.py
index dc027697ffd4d..4d73843f970c4 100644
--- a/tests/models/test_gptq_marlin.py
+++ b/tests/models/test_gptq_marlin.py
@@ -39,6 +39,13 @@
     ("TheBloke/TinyLlama-1.1B-Chat-v1.0-GPTQ", "gptq-4bit-64g-actorder_True"),
     # act_order==True, group_size=32
     ("TheBloke/TinyLlama-1.1B-Chat-v1.0-GPTQ", "gptq-4bit-32g-actorder_True"),
+
+    # 8-bit, act_order==True, group_size=channelwise
+    ("TheBloke/TinyLlama-1.1B-Chat-v1.0-GPTQ", "gptq-8bit--1g-actorder_True"),
+    # 8-bit, act_order==True, group_size=128
+    ("TheBloke/TinyLlama-1.1B-Chat-v1.0-GPTQ", "gptq-8bit-128g-actorder_True"),
+    # 8-bit, act_order==True, group_size=32
+    ("TheBloke/TinyLlama-1.1B-Chat-v1.0-GPTQ", "gptq-8bit-32g-actorder_True"),
 ]
 
 
@@ -65,8 +72,7 @@ def test_models(
                                     dtype=dtype,
                                     quantization="marlin",
                                     max_model_len=MAX_MODEL_LEN,
-                                    tensor_parallel_size=1,
-                                    disable_custom_all_reduce=True)
+                                    tensor_parallel_size=1)
 
     gptq_marlin_outputs = gptq_marlin_model.generate_greedy_logprobs(
         example_prompts, max_tokens, num_logprobs)
@@ -78,8 +84,7 @@ def test_models(
                              dtype=dtype,
                              quantization="gptq",
                              max_model_len=MAX_MODEL_LEN,
-                             tensor_parallel_size=1,
-                             disable_custom_all_reduce=True)
+                             tensor_parallel_size=1)
     gptq_outputs = gptq_model.generate_greedy_logprobs(example_prompts,
                                                        max_tokens,
                                                        num_logprobs)
diff --git a/vllm/_custom_ops.py b/vllm/_custom_ops.py
index 4af8b09b1e16c..3faed5ea85307 100644
--- a/vllm/_custom_ops.py
+++ b/vllm/_custom_ops.py
@@ -169,18 +169,20 @@ def aqlm_dequant(codes: torch.Tensor, codebooks: torch.Tensor,
 
 # gptq_marlin
 def gptq_marlin_repack(b_q_weight: torch.Tensor, perm: torch.Tensor,
-                       size_k: int, size_n: int) -> torch.Tensor:
-    return vllm_ops.gptq_marlin_repack(b_q_weight, perm, size_k, size_n)
+                       size_k: int, size_n: int,
+                       num_bits: int) -> torch.Tensor:
+    return vllm_ops.gptq_marlin_repack(b_q_weight, perm, size_k, size_n,
+                                       num_bits)
 
 
 def gptq_marlin_gemm(a: torch.Tensor, b_q_weight: torch.Tensor,
                      b_scales: torch.Tensor, g_idx: torch.Tensor,
-                     perm: torch.Tensor, workspace: torch.Tensor, size_m: int,
-                     size_n: int, size_k: int,
+                     perm: torch.Tensor, workspace: torch.Tensor,
+                     num_bits: int, size_m: int, size_n: int, size_k: int,
                      is_k_full: bool) -> torch.Tensor:
     return vllm_ops.gptq_marlin_gemm(a, b_q_weight, b_scales, g_idx, perm,
-                                     workspace, size_m, size_n, size_k,
-                                     is_k_full)
+                                     workspace, num_bits, size_m, size_n,
+                                     size_k, is_k_full)
 
 
 # fp8
diff --git a/vllm/model_executor/layers/quantization/gptq_marlin.py b/vllm/model_executor/layers/quantization/gptq_marlin.py
index efbffa0878c4b..e2464008a875f 100644
--- a/vllm/model_executor/layers/quantization/gptq_marlin.py
+++ b/vllm/model_executor/layers/quantization/gptq_marlin.py
@@ -2,7 +2,6 @@
 from enum import Enum
 from typing import Any, Dict, List, Optional
 
-import numpy
 import torch
 from torch.nn.parameter import Parameter
 
@@ -17,41 +16,13 @@
 GPTQ_MARLIN_MIN_THREAD_K = 128
 GPTQ_MARLIN_MAX_PARALLEL = 16
 
-GPTQ_MARLIN_SUPPORTED_NUM_BITS = [4]
+GPTQ_MARLIN_SUPPORTED_NUM_BITS = [4, 8]
 GPTQ_MARLIN_SUPPORTED_GROUP_SIZES = [-1, 32, 64, 128]
 GPTQ_MARLIN_SUPPORTED_SYM = [True]
 
 
-# Precompute permutations for Marlin weight and scale shuffling
-#
-# Marlin works on [16,64] tiles. The goal of the permutations
-# is to reorder the weight data so that it is compatible
-# with the tensor-core format that is described here:
-# https://docs.nvidia.com/cuda/parallel-thread-execution/index.html#matrix-fragments-for-mma-m16n8k16-with-floating-point-type # noqa: E501
-#
-# As a result of this reordering, the vector loads inside the
-# kernel will get the data as it is needed for tensor-core
-# (without the need to use ldmatrix instructions)
-def _get_perms():
-    perm = []
-    for i in range(32):
-        perm1 = []
-        col = i // 4
-        for block in [0, 1]:
-            for row in [
-                    2 * (i % 4),
-                    2 * (i % 4) + 1,
-                    2 * (i % 4 + 4),
-                    2 * (i % 4 + 4) + 1,
-            ]:
-                perm1.append(16 * row + col + 8 * block)
-        for j in range(4):
-            perm.extend([p + 256 * j for p in perm1])
-
-    perm = numpy.array(perm)
-    interleave = numpy.array([0, 2, 4, 6, 1, 3, 5, 7])
-    perm = perm.reshape((-1, 8))[:, interleave].ravel()  # type: ignore
-    perm = torch.from_numpy(perm)
+# Permutations for Marlin scale shuffling
+def get_scale_perms(num_bits):
     scale_perm = []
     for i in range(8):
         scale_perm.extend([i + 8 * j for j in range(8)])
@@ -59,23 +30,21 @@ def _get_perms():
     for i in range(4):
         scale_perm_single.extend(
             [2 * i + j for j in [0, 1, 8, 9, 16, 17, 24, 25]])
-    return perm, scale_perm, scale_perm_single
-
-
-_perm, _scale_perm, _scale_perm_single = _get_perms()
+    return scale_perm, scale_perm_single
 
 
 def get_pack_factor(num_bits):
-    assert num_bits in GPTQ_MARLIN_SUPPORTED_NUM_BITS, (
-        f"Unsupported num_bits = {num_bits}")
+    assert (num_bits in GPTQ_MARLIN_SUPPORTED_NUM_BITS
+            ), f"Unsupported num_bits = {num_bits}"
     return 32 // num_bits
 
 
-def marlin_permute_scales(s, size_k, size_n, group_size):
+def marlin_permute_scales(s, size_k, size_n, group_size, num_bits):
+    scale_perm, scale_perm_single = get_scale_perms(num_bits)
     if group_size < size_k and group_size != -1:
-        s = s.reshape((-1, len(_scale_perm)))[:, _scale_perm]
+        s = s.reshape((-1, len(scale_perm)))[:, scale_perm]
     else:
-        s = s.reshape((-1, len(_scale_perm_single)))[:, _scale_perm_single]
+        s = s.reshape((-1, len(scale_perm_single)))[:, scale_perm_single]
     s = s.reshape((-1, size_n)).contiguous()
 
     return s
@@ -279,13 +248,15 @@ def create_weights(
             requires_grad=False,
         )
         set_weight_attrs(
-            qweight, {
+            qweight,
+            {
                 **extra_weight_attrs,
                 "input_dim": 0,
                 "output_dim": 1,
                 "packed_dim": 0,
                 "pack_factor": self.quant_config.pack_factor,
-            })
+            },
+        )
 
         # Activation order
         g_idx = Parameter(
@@ -296,10 +267,13 @@ def create_weights(
             requires_grad=False,
         )
         # Ignore warning from fused linear layers such as QKVParallelLinear.
-        set_weight_attrs(g_idx, {
-            **extra_weight_attrs, "input_dim": 0,
-            "ignore_warning": True
-        })
+        set_weight_attrs(
+            g_idx,
+            {
+                **extra_weight_attrs, "input_dim": 0,
+                "ignore_warning": True
+            },
+        )
 
         g_idx_sort_indices = Parameter(
             torch.empty(
@@ -320,29 +294,34 @@ def create_weights(
             requires_grad=False,
         )
         set_weight_attrs(
-            scales, {
+            scales,
+            {
                 **extra_weight_attrs,
                 "input_dim": scales_and_zp_input_dim,
                 "output_dim": 1,
-            })
+            },
+        )
 
         # Quantized zero-points
         qzeros = Parameter(
-            torch.empty(scales_and_zp_size,
-                        output_size_per_partition //
-                        self.quant_config.pack_factor,
-                        dtype=torch.int32,
-                        device="meta"),
+            torch.empty(
+                scales_and_zp_size,
+                output_size_per_partition // self.quant_config.pack_factor,
+                dtype=torch.int32,
+                device="meta",
+            ),
             requires_grad=False,
         )
         set_weight_attrs(
-            qzeros, {
+            qzeros,
+            {
                 **extra_weight_attrs,
                 "input_dim": scales_and_zp_input_dim,
                 "output_dim": 1,
                 "packed_dim": 1,
                 "pack_factor": self.quant_config.pack_factor,
-            })
+            },
+        )
 
         # Allocate marlin workspace
         max_workspace_size = (
@@ -405,13 +384,14 @@ def replace_tensor(name, new_t):
 
             else:
                 # Reset g_idx related tensors
-                layer.g_idx = Parameter(torch.empty(0,
-                                                    dtype=torch.int,
-                                                    device=cur_device),
-                                        requires_grad=False)
-                layer.g_idx_sort_indices = Parameter(torch.empty(
-                    0, dtype=torch.int, device=cur_device),
-                                                     requires_grad=False)
+                layer.g_idx = Parameter(
+                    torch.empty(0, dtype=torch.int, device=cur_device),
+                    requires_grad=False,
+                )
+                layer.g_idx_sort_indices = Parameter(
+                    torch.empty(0, dtype=torch.int, device=cur_device),
+                    requires_grad=False,
+                )
 
             # Repack weights
             marlin_qweight = ops.gptq_marlin_repack(
@@ -419,6 +399,7 @@ def replace_tensor(name, new_t):
                 layer.g_idx_sort_indices,
                 part_size_k,
                 part_size_n,
+                self.quant_config.weight_bits,
             )
             replace_tensor("qweight", marlin_qweight)
 
@@ -428,15 +409,28 @@ def replace_tensor(name, new_t):
             if self.quant_config.desc_act:
                 scales_size_k = full_size_k
 
-            marlin_scales = marlin_permute_scales(layer.scales, scales_size_k,
-                                                  scales_size_n,
-                                                  self.quant_config.group_size)
+            marlin_scales = marlin_permute_scales(
+                layer.scales,
+                scales_size_k,
+                scales_size_n,
+                self.quant_config.group_size,
+                self.quant_config.weight_bits,
+            )
             replace_tensor("scales", marlin_scales)
 
-        output = ops.gptq_marlin_gemm(reshaped_x, layer.qweight, layer.scales,
-                                      layer.g_idx, layer.g_idx_sort_indices,
-                                      layer.workspace, size_m, part_size_n,
-                                      part_size_k, layer.is_k_full)
+        output = ops.gptq_marlin_gemm(
+            reshaped_x,
+            layer.qweight,
+            layer.scales,
+            layer.g_idx,
+            layer.g_idx_sort_indices,
+            layer.workspace,
+            self.quant_config.weight_bits,
+            size_m,
+            part_size_n,
+            part_size_k,
+            layer.is_k_full,
+        )
 
         if bias is not None:
             output.add_(bias)  # In-place add

From fb087af52e3834d98250a455355a3ef329663168 Mon Sep 17 00:00:00 2001
From: SangBin Cho <rkooo567@gmail.com>
Date: Fri, 3 May 2024 02:47:41 +0900
Subject: [PATCH 35/65] [mypy][7/N] Cover all directories (#4555)

---
 .github/workflows/mypy.yaml | 2 ++
 format.sh                   | 2 ++
 2 files changed, 4 insertions(+)

diff --git a/.github/workflows/mypy.yaml b/.github/workflows/mypy.yaml
index 5b2bad1476dc3..a20753d8a7702 100644
--- a/.github/workflows/mypy.yaml
+++ b/.github/workflows/mypy.yaml
@@ -45,4 +45,6 @@ jobs:
         mypy vllm/spec_decode --config-file pyproject.toml
         mypy vllm/model_executor  --config-file pyproject.toml
         mypy vllm/lora --config-file pyproject.toml
+        mypy vllm/logging --config-file pyproject.toml
+        mypy vllm/model_executor --config-file pyproject.toml
 
diff --git a/format.sh b/format.sh
index 49149afe41d04..233e6af0c9479 100755
--- a/format.sh
+++ b/format.sh
@@ -107,6 +107,8 @@ mypy vllm/worker --config-file pyproject.toml
 mypy vllm/spec_decode --config-file pyproject.toml
 mypy vllm/model_executor  --config-file pyproject.toml
 mypy vllm/lora --config-file pyproject.toml
+mypy vllm/logging --config-file pyproject.toml
+mypy vllm/model_executor --config-file pyproject.toml
 
 
 CODESPELL_EXCLUDES=(

From 5ad60b0cbd0a396eb3f1fda6bbf2c95aff6d5ecf Mon Sep 17 00:00:00 2001
From: Hu Dong <itechbear@gmail.com>
Date: Fri, 3 May 2024 01:50:25 +0800
Subject: [PATCH 36/65] [Misc] Exclude the `tests` directory from being
 packaged (#4552)

---
 setup.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/setup.py b/setup.py
index d534cec437261..801d8d50db1dc 100644
--- a/setup.py
+++ b/setup.py
@@ -404,7 +404,7 @@ def _read_requirements(filename: str) -> List[str]:
         "Topic :: Scientific/Engineering :: Artificial Intelligence",
     ],
     packages=find_packages(exclude=("benchmarks", "csrc", "docs", "examples",
-                                    "tests")),
+                                    "tests*")),
     python_requires=">=3.8",
     install_requires=get_requirements(),
     ext_modules=ext_modules,

From 1ff0c73a79b0c2788b12bd83523b74c01d414480 Mon Sep 17 00:00:00 2001
From: Mark McLoughlin <markmc@redhat.com>
Date: Thu, 2 May 2024 18:52:51 +0100
Subject: [PATCH 37/65] [BugFix] Include target-device specific
 requirements.txt in sdist (#4559)

---
 MANIFEST.in | 3 +++
 1 file changed, 3 insertions(+)

diff --git a/MANIFEST.in b/MANIFEST.in
index d385f194c6c0f..82be639ef4d73 100644
--- a/MANIFEST.in
+++ b/MANIFEST.in
@@ -1,6 +1,9 @@
 include LICENSE
 include requirements-common.txt
 include requirements-cuda.txt
+include requirements-rocm.txt
+include requirements-neuron.txt
+include requirements-cpu.txt
 include CMakeLists.txt
 
 recursive-include cmake *

From 5b8a7c1cb0f1bb81266bae98944c055a8abb1a68 Mon Sep 17 00:00:00 2001
From: youkaichao <youkaichao@gmail.com>
Date: Thu, 2 May 2024 11:13:25 -0700
Subject: [PATCH 38/65] [Misc] centralize all usage of environment variables
 (#4548)

---
 vllm/attention/backends/rocm_flash_attn.py    |   5 +-
 vllm/attention/selector.py                    |   6 +-
 vllm/config.py                                |   5 -
 .../device_communicators/custom_all_reduce.py |   8 +-
 vllm/distributed/parallel_state.py            |   4 +-
 vllm/distributed/utils.py                     |   7 +-
 vllm/engine/async_llm_engine.py               |   5 +-
 vllm/entrypoints/openai/api_server.py         |   4 +-
 vllm/envs.py                                  | 160 ++++++++++++++++++
 vllm/executor/cpu_executor.py                 |   5 +-
 vllm/executor/multiproc_worker_utils.py       |   5 +-
 vllm/executor/ray_gpu_executor.py             |   8 +-
 vllm/logger.py                                |   6 +-
 vllm/model_executor/model_loader/loader.py    |   7 +-
 .../model_executor/model_loader/tensorizer.py |  12 +-
 vllm/transformers_utils/tokenizer.py          |   2 +-
 vllm/usage/usage_lib.py                       |  16 +-
 vllm/utils.py                                 |  19 ++-
 18 files changed, 220 insertions(+), 64 deletions(-)
 create mode 100644 vllm/envs.py

diff --git a/vllm/attention/backends/rocm_flash_attn.py b/vllm/attention/backends/rocm_flash_attn.py
index 934acea0a3d60..b7d15de772556 100644
--- a/vllm/attention/backends/rocm_flash_attn.py
+++ b/vllm/attention/backends/rocm_flash_attn.py
@@ -1,10 +1,10 @@
 """Attention layer ROCm GPUs."""
-import os
 from dataclasses import dataclass
 from typing import Dict, List, Optional, Tuple, Type
 
 import torch
 
+import vllm.envs as envs
 from vllm.attention.backends.abstract import (AttentionBackend, AttentionImpl,
                                               AttentionMetadata,
                                               AttentionMetadataPerStage)
@@ -156,8 +156,7 @@ def __init__(
 
         self.use_naive_attn = False
         # NOTE: Allow for switching between Triton and CK. Defaulting to triton.
-        self.use_triton_flash_attn = (os.environ.get(
-            "VLLM_USE_TRITON_FLASH_ATTN", "True").lower() in ("true", "1"))
+        self.use_triton_flash_attn = envs.VLLM_USE_TRITON_FLASH_ATTN
         if self.use_triton_flash_attn:
             from vllm.attention.ops.triton_flash_attention import (  # noqa: F401
                 triton_attention)
diff --git a/vllm/attention/selector.py b/vllm/attention/selector.py
index 7cc17f21dcd0e..7ae8c31fae1ac 100644
--- a/vllm/attention/selector.py
+++ b/vllm/attention/selector.py
@@ -1,18 +1,16 @@
 import enum
-import os
 from functools import lru_cache
 from typing import Type
 
 import torch
 
+import vllm.envs as envs
 from vllm.attention.backends.abstract import AttentionBackend
 from vllm.logger import init_logger
 from vllm.utils import is_cpu, is_hip
 
 logger = init_logger(__name__)
 
-VLLM_ATTENTION_BACKEND = "VLLM_ATTENTION_BACKEND"
-
 
 class _Backend(enum.Enum):
     FLASH_ATTN = enum.auto()
@@ -79,7 +77,7 @@ def _which_attn_to_use(dtype: torch.dtype) -> _Backend:
             "package is not found. Please install it for better performance.")
         return _Backend.XFORMERS
 
-    backend_by_env_var = os.getenv(VLLM_ATTENTION_BACKEND)
+    backend_by_env_var = envs.VLLM_ATTENTION_BACKEND
     if backend_by_env_var is not None:
         return _Backend[backend_by_env_var]
 
diff --git a/vllm/config.py b/vllm/config.py
index 257d49b6e804f..aaa2f60739d55 100644
--- a/vllm/config.py
+++ b/vllm/config.py
@@ -1,6 +1,5 @@
 import enum
 import json
-import os
 from dataclasses import dataclass, field, fields
 from typing import TYPE_CHECKING, ClassVar, List, Optional, Union
 
@@ -24,10 +23,6 @@
 
 logger = init_logger(__name__)
 
-# If true, will load models from ModelScope instead of Hugging Face Hub.
-VLLM_USE_MODELSCOPE = os.environ.get("VLLM_USE_MODELSCOPE",
-                                     "False").lower() == "true"
-
 _GB = 1 << 30
 
 
diff --git a/vllm/distributed/device_communicators/custom_all_reduce.py b/vllm/distributed/device_communicators/custom_all_reduce.py
index ec4533326e841..cc5f8166877ce 100644
--- a/vllm/distributed/device_communicators/custom_all_reduce.py
+++ b/vllm/distributed/device_communicators/custom_all_reduce.py
@@ -1,10 +1,10 @@
-import os
 from contextlib import contextmanager
 from typing import Any, List, Optional
 
 import torch
 import torch.distributed as dist
 
+import vllm.envs as envs
 from vllm.logger import init_logger
 
 try:
@@ -54,9 +54,9 @@ def init_custom_ar() -> None:
         return
     # test nvlink first, this will filter out most of the cases
     # where custom allreduce is not supported
-    if "CUDA_VISIBLE_DEVICES" in os.environ:
-        device_ids = list(
-            map(int, os.environ["CUDA_VISIBLE_DEVICES"].split(",")))
+    cuda_visible_devices = envs.CUDA_VISIBLE_DEVICES
+    if cuda_visible_devices:
+        device_ids = list(map(int, cuda_visible_devices.split(",")))
     else:
         device_ids = list(range(num_dev))
     # this checks hardware and driver support for NVLink
diff --git a/vllm/distributed/parallel_state.py b/vllm/distributed/parallel_state.py
index 6ca6fc5b5f9fe..a82a1254693df 100644
--- a/vllm/distributed/parallel_state.py
+++ b/vllm/distributed/parallel_state.py
@@ -4,11 +4,11 @@
 # Copyright (c) 2022, NVIDIA CORPORATION. All rights reserved.
 """Tensor and pipeline parallel groups."""
 import contextlib
-import os
 from typing import Optional
 
 import torch
 
+import vllm.envs as envs
 from vllm.logger import init_logger
 
 logger = init_logger(__name__)
@@ -80,7 +80,7 @@ def init_distributed_environment(
         # local_rank is not available in torch ProcessGroup,
         # see https://github.com/pytorch/pytorch/issues/122816
         if local_rank == -1 and distributed_init_method == "env://":
-            local_rank = int(os.environ['LOCAL_RANK'])
+            local_rank = envs.LOCAL_RANK
         global _LOCAL_RANK
         _LOCAL_RANK = local_rank
 
diff --git a/vllm/distributed/utils.py b/vllm/distributed/utils.py
index 9a13b94c3ada1..1965d4c1d3cbc 100644
--- a/vllm/distributed/utils.py
+++ b/vllm/distributed/utils.py
@@ -9,6 +9,7 @@
 import torch
 import torch.distributed as dist
 
+import vllm.envs as envs
 from vllm.logger import init_logger
 
 from .parallel_state import get_cpu_world_group, get_local_rank
@@ -102,11 +103,13 @@ def gpu_p2p_access_check(i: int, j: int) -> bool:
     is_distributed = dist.is_initialized()
 
     num_dev = torch.cuda.device_count()
-    cuda_visible_devices = os.environ.get("CUDA_VISIBLE_DEVICES", None)
+    cuda_visible_devices = envs.CUDA_VISIBLE_DEVICES
     if cuda_visible_devices is None:
         cuda_visible_devices = ",".join(str(i) for i in range(num_dev))
+    VLLM_CONFIG_ROOT = envs.VLLM_CONFIG_ROOT
     path = os.path.expanduser(
-        f"~/.config/vllm/gpu_p2p_access_cache_for_{cuda_visible_devices}.json")
+        f"{VLLM_CONFIG_ROOT}/vllm/gpu_p2p_access_cache_for_{cuda_visible_devices}.json"
+    )
     os.makedirs(os.path.dirname(path), exist_ok=True)
     if (not is_distributed or get_local_rank() == 0) \
         and (not os.path.exists(path)):
diff --git a/vllm/engine/async_llm_engine.py b/vllm/engine/async_llm_engine.py
index 5591893d267a2..cf5053bba1d48 100644
--- a/vllm/engine/async_llm_engine.py
+++ b/vllm/engine/async_llm_engine.py
@@ -1,5 +1,4 @@
 import asyncio
-import os
 import time
 from functools import partial
 from typing import (Any, AsyncIterator, Callable, Dict, Iterable, List,
@@ -7,6 +6,7 @@
 
 from transformers import PreTrainedTokenizer
 
+import vllm.envs as envs
 from vllm.config import DecodingConfig, ModelConfig
 from vllm.core.scheduler import SchedulerOutputs
 from vllm.engine.arg_utils import AsyncEngineArgs
@@ -20,8 +20,7 @@
 from vllm.usage.usage_lib import UsageContext
 
 logger = init_logger(__name__)
-ENGINE_ITERATION_TIMEOUT_S = int(
-    os.environ.get("VLLM_ENGINE_ITERATION_TIMEOUT_S", "60"))
+ENGINE_ITERATION_TIMEOUT_S = envs.VLLM_ENGINE_ITERATION_TIMEOUT_S
 
 
 class AsyncEngineDeadError(RuntimeError):
diff --git a/vllm/entrypoints/openai/api_server.py b/vllm/entrypoints/openai/api_server.py
index 40103f70a31a3..8b3c5ea9de9c0 100644
--- a/vllm/entrypoints/openai/api_server.py
+++ b/vllm/entrypoints/openai/api_server.py
@@ -1,7 +1,6 @@
 import asyncio
 import importlib
 import inspect
-import os
 import re
 from contextlib import asynccontextmanager
 from http import HTTPStatus
@@ -16,6 +15,7 @@
 from starlette.routing import Mount
 
 import vllm
+import vllm.envs as envs
 from vllm.engine.arg_utils import AsyncEngineArgs
 from vllm.engine.async_llm_engine import AsyncLLMEngine
 from vllm.entrypoints.openai.cli_args import make_arg_parser
@@ -129,7 +129,7 @@ async def create_completion(request: CompletionRequest, raw_request: Request):
         allow_headers=args.allowed_headers,
     )
 
-    if token := os.environ.get("VLLM_API_KEY") or args.api_key:
+    if token := envs.VLLM_API_KEY or args.api_key:
 
         @app.middleware("http")
         async def authentication(request: Request, call_next):
diff --git a/vllm/envs.py b/vllm/envs.py
new file mode 100644
index 0000000000000..26ed731caa5ff
--- /dev/null
+++ b/vllm/envs.py
@@ -0,0 +1,160 @@
+import os
+from typing import TYPE_CHECKING, Any, Callable, Dict, Optional
+
+if TYPE_CHECKING:  # static-only stubs; real values come from __getattr__
+    VLLM_HOST_IP: str = ""
+    VLLM_USE_MODELSCOPE: bool = False
+    VLLM_INSTANCE_ID: Optional[str] = None
+    VLLM_NCCL_SO_PATH: Optional[str] = None
+    LD_LIBRARY_PATH: Optional[str] = None
+    VLLM_USE_TRITON_FLASH_ATTN: bool = True
+    LOCAL_RANK: int = 0
+    CUDA_VISIBLE_DEVICES: Optional[str] = None
+    VLLM_ENGINE_ITERATION_TIMEOUT_S: int = 60
+    VLLM_API_KEY: Optional[str] = None
+    S3_ACCESS_KEY_ID: Optional[str] = None
+    S3_SECRET_ACCESS_KEY: Optional[str] = None
+    S3_ENDPOINT_URL: Optional[str] = None
+    VLLM_CONFIG_ROOT: str = ""
+    VLLM_USAGE_STATS_SERVER: str = "https://stats.vllm.ai"
+    VLLM_NO_USAGE_STATS: bool = False
+    VLLM_DO_NOT_TRACK: bool = False
+    VLLM_USAGE_SOURCE: str = "production"
+    VLLM_CONFIGURE_LOGGING: int = 1
+    VLLM_LOGGING_CONFIG_PATH: Optional[str] = None
+    VLLM_TRACE_FUNCTION: int = 0
+    VLLM_ATTENTION_BACKEND: Optional[str] = None
+    VLLM_CPU_KVCACHE_SPACE: int = 0
+    VLLM_USE_RAY_COMPILED_DAG: bool = False
+    VLLM_WORKER_MULTIPROC_METHOD: str = "spawn"
+
+environment_variables: Dict[str, Callable[[], Any]] = {
+    # used in distributed environment to determine the master address
+    'VLLM_HOST_IP':
+    lambda: os.getenv('VLLM_HOST_IP', "") or os.getenv("HOST_IP", ""),
+
+    # If true, will load models from ModelScope instead of Hugging Face Hub.
+    # note that the value is true or false, not numbers
+    "VLLM_USE_MODELSCOPE":
+    lambda: os.environ.get("VLLM_USE_MODELSCOPE", "False").lower() == "true",
+
+    # Instance id represents an instance of the VLLM. All processes in the same
+    # instance should have the same instance id.
+    "VLLM_INSTANCE_ID":
+    lambda: os.environ.get("VLLM_INSTANCE_ID", None),
+
+    # path to cudatoolkit home directory, under which should be bin, include,
+    # and lib directories.
+    "CUDA_HOME":
+    lambda: os.environ.get("CUDA_HOME", None),
+
+    # Path to the NCCL library file. It is needed because nccl>=2.19 brought
+    # by PyTorch contains a bug: https://github.com/NVIDIA/nccl/issues/1234
+    "VLLM_NCCL_SO_PATH":
+    lambda: os.environ.get("VLLM_NCCL_SO_PATH", None),
+
+    # when `VLLM_NCCL_SO_PATH` is not set, vllm will try to find the nccl
+    # library file in the locations specified by `LD_LIBRARY_PATH`
+    "LD_LIBRARY_PATH":
+    lambda: os.environ.get("LD_LIBRARY_PATH", None),
+
+    # flag to control if vllm should use triton flash attention
+    "VLLM_USE_TRITON_FLASH_ATTN":
+    lambda: (os.environ.get("VLLM_USE_TRITON_FLASH_ATTN", "True").lower() in
+             ("true", "1")),
+
+    # local rank of the process in the distributed setting, used to determine
+    # the GPU device id
+    "LOCAL_RANK":
+    lambda: int(os.environ.get("LOCAL_RANK", "0")),
+
+    # used to control the visible devices in the distributed setting
+    "CUDA_VISIBLE_DEVICES":
+    lambda: os.environ.get("CUDA_VISIBLE_DEVICES", None),
+
+    # timeout for each iteration in the engine
+    "VLLM_ENGINE_ITERATION_TIMEOUT_S":
+    lambda: int(os.environ.get("VLLM_ENGINE_ITERATION_TIMEOUT_S", "60")),
+
+    # API key for VLLM API server
+    "VLLM_API_KEY":
+    lambda: os.environ.get("VLLM_API_KEY", None),
+
+    # S3 access information, used for tensorizer to load model from S3
+    "S3_ACCESS_KEY_ID":
+    lambda: os.environ.get("S3_ACCESS_KEY_ID", None),
+    "S3_SECRET_ACCESS_KEY":
+    lambda: os.environ.get("S3_SECRET_ACCESS_KEY", None),
+    "S3_ENDPOINT_URL":
+    lambda: os.environ.get("S3_ENDPOINT_URL", None),
+
+    # Root directory for VLLM configuration files
+    # Note that this not only affects how vllm finds its configuration files
+    # during runtime, but also affects how vllm installs its configuration
+    # files during **installation**.
+    "VLLM_CONFIG_ROOT":
+    lambda: os.environ.get("VLLM_CONFIG_ROOT", None) or os.getenv(
+        "XDG_CONFIG_HOME", None) or os.path.expanduser("~/.config"),
+
+    # Usage stats collection
+    "VLLM_USAGE_STATS_SERVER":
+    lambda: os.environ.get("VLLM_USAGE_STATS_SERVER", "https://stats.vllm.ai"),
+    "VLLM_NO_USAGE_STATS":
+    lambda: os.environ.get("VLLM_NO_USAGE_STATS", "0") == "1",
+    "VLLM_DO_NOT_TRACK":
+    lambda: (os.environ.get("VLLM_DO_NOT_TRACK", None) or os.environ.get(
+        "DO_NOT_TRACK", None) or "0") == "1",
+    "VLLM_USAGE_SOURCE":
+    lambda: os.environ.get("VLLM_USAGE_SOURCE", "production"),
+
+    # Logging configuration
+    # If set to 0, vllm will not configure logging
+    # If set to 1, vllm will configure logging using the default configuration
+    #    or the configuration file specified by VLLM_LOGGING_CONFIG_PATH
+    "VLLM_CONFIGURE_LOGGING":
+    lambda: int(os.getenv("VLLM_CONFIGURE_LOGGING", "1")),
+    "VLLM_LOGGING_CONFIG_PATH":
+    lambda: os.getenv("VLLM_LOGGING_CONFIG_PATH"),
+
+    # Trace function calls
+    # If set to 1, vllm will trace function calls
+    # Useful for debugging
+    "VLLM_TRACE_FUNCTION":
+    lambda: int(os.getenv("VLLM_TRACE_FUNCTION", "0")),
+
+    # Backend for attention computation
+    # Available options:
+    # - "TORCH_SDPA": use torch.nn.MultiheadAttention
+    # - "FLASH_ATTN": use FlashAttention
+    # - "XFORMERS": use XFormers
+    # - "ROCM_FLASH": use ROCmFlashAttention
+    "VLLM_ATTENTION_BACKEND":
+    lambda: os.getenv("VLLM_ATTENTION_BACKEND", None),
+
+    # CPU key-value cache space
+    # default is 0 (cpu_executor special-cases 0; presumably 4GB - confirm)
+    "VLLM_CPU_KVCACHE_SPACE":
+    lambda: int(os.getenv("VLLM_CPU_KVCACHE_SPACE", "0")),
+
+    # If the env var is set, it uses the Ray's compiled DAG API
+    # which optimizes the control plane overhead.
+    # Run vLLM with VLLM_USE_RAY_COMPILED_DAG=1 to enable it.
+    "VLLM_USE_RAY_COMPILED_DAG":
+    lambda: bool(os.getenv("VLLM_USE_RAY_COMPILED_DAG", 0)),
+
+    # Use dedicated multiprocess context for workers.
+    # Both spawn and fork work
+    "VLLM_WORKER_MULTIPROC_METHOD":
+    lambda: os.getenv("VLLM_WORKER_MULTIPROC_METHOD", "spawn"),
+}
+
+
+def __getattr__(name):
+    """PEP 562 hook: evaluate the getter registered for ``name`` lazily."""
+    if name not in environment_variables:
+        raise AttributeError(f"module {__name__!r} has no attribute {name!r}")
+    return environment_variables[name]()
+
+
+def __dir__():
+    return list(environment_variables)  # names served by __getattr__
diff --git a/vllm/executor/cpu_executor.py b/vllm/executor/cpu_executor.py
index da1b500cddaf6..733eef828adc4 100644
--- a/vllm/executor/cpu_executor.py
+++ b/vllm/executor/cpu_executor.py
@@ -1,8 +1,8 @@
-import os
 from typing import Dict, List, Set, Tuple
 
 import torch
 
+import vllm.envs as envs
 from vllm.config import CacheConfig, ModelConfig, SchedulerConfig
 from vllm.executor.executor_base import ExecutorAsyncBase, ExecutorBase
 from vllm.logger import init_logger
@@ -152,8 +152,7 @@ def _verify_and_get_cache_config(config: CacheConfig) -> CacheConfig:
         logger.warning("Prefix caching is not supported on CPU, disable it.")
         config.enable_prefix_caching = False
 
-    kv_cache_space_str = os.getenv("VLLM_CPU_KVCACHE_SPACE", "0")
-    kv_cache_space = int(kv_cache_space_str)
+    kv_cache_space = envs.VLLM_CPU_KVCACHE_SPACE
 
     if kv_cache_space >= 0:
         if kv_cache_space == 0:
diff --git a/vllm/executor/multiproc_worker_utils.py b/vllm/executor/multiproc_worker_utils.py
index 0c04796bc38e3..62887533f5c27 100644
--- a/vllm/executor/multiproc_worker_utils.py
+++ b/vllm/executor/multiproc_worker_utils.py
@@ -12,6 +12,7 @@
 from typing import (Any, Callable, Dict, Generic, List, Optional, TextIO,
                     TypeVar, Union)
 
+import vllm.envs as envs
 from vllm.logger import init_logger
 
 logger = init_logger(__name__)
@@ -26,9 +27,7 @@
 
 JOIN_TIMEOUT_S = 2
 
-# Use dedicated multiprocess context for workers.
-# Both spawn and fork work
-mp_method = os.getenv("VLLM_WORKER_MULTIPROC_METHOD", "spawn")
+mp_method = envs.VLLM_WORKER_MULTIPROC_METHOD
 mp = multiprocessing.get_context(mp_method)
 
 
diff --git a/vllm/executor/ray_gpu_executor.py b/vllm/executor/ray_gpu_executor.py
index 16d239b9ab580..4684b857ccd39 100644
--- a/vllm/executor/ray_gpu_executor.py
+++ b/vllm/executor/ray_gpu_executor.py
@@ -5,6 +5,7 @@
 from itertools import islice, repeat
 from typing import TYPE_CHECKING, Any, Dict, List, Optional, Tuple
 
+import vllm.envs as envs
 from vllm.executor.distributed_gpu_executor import (  # yapf: disable
     DistributedGPUExecutor, DistributedGPUExecutorAsync)
 from vllm.executor.ray_utils import RayWorkerWrapper, ray
@@ -21,10 +22,7 @@
 
 logger = init_logger(__name__)
 
-# If the env var is set, it uses the Ray's compiled DAG API
-# which optimizes the control plane overhead.
-# Run vLLM with VLLM_USE_RAY_COMPILED_DAG=1 to enable it.
-USE_RAY_COMPILED_DAG = bool(os.getenv("VLLM_USE_RAY_COMPILED_DAG", 0))
+USE_RAY_COMPILED_DAG = envs.VLLM_USE_RAY_COMPILED_DAG
 
 
 class RayGPUExecutor(DistributedGPUExecutor):
@@ -145,7 +143,7 @@ def _init_workers_ray(self, placement_group: "PlacementGroup",
             "VLLM_INSTANCE_ID":
             VLLM_INSTANCE_ID,
             "VLLM_TRACE_FUNCTION":
-            os.getenv("VLLM_TRACE_FUNCTION", "0"),
+            str(envs.VLLM_TRACE_FUNCTION),
         }, ) for (node_id, _) in worker_node_and_gpu_ids]
         self._run_workers("update_environment_variables",
                           all_args=all_args_to_update_environment_variables)
diff --git a/vllm/logger.py b/vllm/logger.py
index 40c29da2b70ce..153cdfb373bb4 100644
--- a/vllm/logger.py
+++ b/vllm/logger.py
@@ -10,8 +10,10 @@
 from os import path
 from typing import Dict, Optional
 
-VLLM_CONFIGURE_LOGGING = int(os.getenv("VLLM_CONFIGURE_LOGGING", "1"))
-VLLM_LOGGING_CONFIG_PATH = os.getenv("VLLM_LOGGING_CONFIG_PATH")
+import vllm.envs as envs
+
+VLLM_CONFIGURE_LOGGING = envs.VLLM_CONFIGURE_LOGGING
+VLLM_LOGGING_CONFIG_PATH = envs.VLLM_LOGGING_CONFIG_PATH
 
 _FORMAT = "%(levelname)s %(asctime)s %(filename)s:%(lineno)d] %(message)s"
 _DATE_FORMAT = "%m-%d %H:%M:%S"
diff --git a/vllm/model_executor/model_loader/loader.py b/vllm/model_executor/model_loader/loader.py
index 70e64167f8698..bafa2de62e5df 100644
--- a/vllm/model_executor/model_loader/loader.py
+++ b/vllm/model_executor/model_loader/loader.py
@@ -9,9 +9,10 @@
 import torch
 from torch import nn
 
-from vllm.config import (VLLM_USE_MODELSCOPE, DeviceConfig, LoadConfig,
-                         LoadFormat, LoRAConfig, ModelConfig, ParallelConfig,
-                         SchedulerConfig, VisionLanguageConfig)
+from vllm.config import (DeviceConfig, LoadConfig, LoadFormat, LoRAConfig,
+                         ModelConfig, ParallelConfig, SchedulerConfig,
+                         VisionLanguageConfig)
+from vllm.envs import VLLM_USE_MODELSCOPE
 from vllm.logger import init_logger
 from vllm.model_executor.layers.quantization.base_config import (
     QuantizationConfig)
diff --git a/vllm/model_executor/model_loader/tensorizer.py b/vllm/model_executor/model_loader/tensorizer.py
index 0ce9fa95aa7e5..af433b86e604d 100644
--- a/vllm/model_executor/model_loader/tensorizer.py
+++ b/vllm/model_executor/model_loader/tensorizer.py
@@ -11,6 +11,7 @@
 from torch import nn
 from transformers import PretrainedConfig
 
+import vllm.envs as envs
 from vllm.config import ModelConfig, ParallelConfig
 from vllm.logger import init_logger
 from vllm.model_executor.layers.quantization.base_config import (
@@ -142,13 +143,10 @@ class TensorizerArgs:
 
     def __post_init__(self):
         self.file_obj = self.tensorizer_uri
-        self.s3_access_key_id = (self.s3_access_key_id
-                                 or os.environ.get("S3_ACCESS_KEY_ID")) or None
-        self.s3_secret_access_key = (
-            self.s3_secret_access_key
-            or os.environ.get("S3_SECRET_ACCESS_KEY")) or None
-        self.s3_endpoint = (self.s3_endpoint
-                            or os.environ.get("S3_ENDPOINT_URL")) or None
+        self.s3_access_key_id = self.s3_access_key_id or envs.S3_ACCESS_KEY_ID
+        self.s3_secret_access_key = (self.s3_secret_access_key
+                                     or envs.S3_SECRET_ACCESS_KEY)
+        self.s3_endpoint = self.s3_endpoint or envs.S3_ENDPOINT_URL
         self.stream_params = {
             "s3_access_key_id": self.s3_access_key_id,
             "s3_secret_access_key": self.s3_secret_access_key,
diff --git a/vllm/transformers_utils/tokenizer.py b/vllm/transformers_utils/tokenizer.py
index 9066db5a9e7f1..f5684dbf1271c 100644
--- a/vllm/transformers_utils/tokenizer.py
+++ b/vllm/transformers_utils/tokenizer.py
@@ -5,7 +5,7 @@
 from transformers import (AutoTokenizer, PreTrainedTokenizer,
                           PreTrainedTokenizerFast)
 
-from vllm.config import VLLM_USE_MODELSCOPE
+from vllm.envs import VLLM_USE_MODELSCOPE
 from vllm.logger import init_logger
 from vllm.lora.request import LoRARequest
 from vllm.transformers_utils.tokenizers import BaichuanTokenizer
diff --git a/vllm/usage/usage_lib.py b/vllm/usage/usage_lib.py
index b2672f7f1da61..9029a5b16af72 100644
--- a/vllm/usage/usage_lib.py
+++ b/vllm/usage/usage_lib.py
@@ -15,20 +15,22 @@
 import requests
 import torch
 
-_config_home = os.getenv("XDG_CONFIG_HOME", os.path.expanduser("~/.config"))
+import vllm.envs as envs
+
+_config_home = envs.VLLM_CONFIG_ROOT
 _USAGE_STATS_JSON_PATH = os.path.join(_config_home, "vllm/usage_stats.json")
 _USAGE_STATS_DO_NOT_TRACK_PATH = os.path.join(_config_home,
                                               "vllm/do_not_track")
 _USAGE_STATS_ENABLED = None
-_USAGE_STATS_SERVER = os.environ.get("VLLM_USAGE_STATS_SERVER",
-                                     "https://stats.vllm.ai")
+_USAGE_STATS_SERVER = envs.VLLM_USAGE_STATS_SERVER
 
 
 def is_usage_stats_enabled():
     """Determine whether or not we can send usage stats to the server.
     The logic is as follows:
     - By default, it should be enabled.
-    - Two environment variables can disable it:
+    - Three environment variables can disable it:
+        - VLLM_DO_NOT_TRACK=1
         - DO_NOT_TRACK=1
         - VLLM_NO_USAGE_STATS=1
     - A file in the home directory can disable it if it exists:
@@ -36,8 +38,8 @@ def is_usage_stats_enabled():
     """
     global _USAGE_STATS_ENABLED
     if _USAGE_STATS_ENABLED is None:
-        do_not_track = os.environ.get("DO_NOT_TRACK", "0") == "1"
-        no_usage_stats = os.environ.get("VLLM_NO_USAGE_STATS", "0") == "1"
+        do_not_track = envs.VLLM_DO_NOT_TRACK
+        no_usage_stats = envs.VLLM_NO_USAGE_STATS
         do_not_track_file = os.path.exists(_USAGE_STATS_DO_NOT_TRACK_PATH)
 
         _USAGE_STATS_ENABLED = not (do_not_track or no_usage_stats
@@ -167,7 +169,7 @@ def _report_usage_once(self, model_architecture: str,
 
         # Metadata
         self.log_time = _get_current_timestamp_ns()
-        self.source = os.environ.get("VLLM_USAGE_SOURCE", "production")
+        self.source = envs.VLLM_USAGE_SOURCE
 
         data = vars(self)
         if extra_kvs:
diff --git a/vllm/utils.py b/vllm/utils.py
index 88447878f1706..ce55253ce2199 100644
--- a/vllm/utils.py
+++ b/vllm/utils.py
@@ -21,6 +21,7 @@
 import torch
 from packaging.version import Version, parse
 
+import vllm.envs as envs
 from vllm.logger import enable_trace_function_call, init_logger
 
 T = TypeVar("T")
@@ -174,7 +175,7 @@ def get_vllm_instance_id():
     Instance id represents an instance of the VLLM. All processes in the same
     instance should have the same instance id.
     """
-    return os.environ.get("VLLM_INSTANCE_ID", f"vllm-instance-{random_uuid()}")
+    return envs.VLLM_INSTANCE_ID or f"vllm-instance-{random_uuid()}"
 
 
 @lru_cache(maxsize=None)
@@ -243,7 +244,7 @@ async def consumer():
 
 
 def get_ip() -> str:
-    host_ip = os.environ.get("HOST_IP")
+    host_ip = envs.VLLM_HOST_IP
     if host_ip:
         return host_ip
 
@@ -269,7 +270,8 @@ def get_ip() -> str:
 
     warnings.warn(
         "Failed to get the IP address, using 0.0.0.0 by default."
-        "The value can be set by the environment variable HOST_IP.",
+        "The value can be set by the environment variable"
+        " VLLM_HOST_IP or HOST_IP.",
         stacklevel=2)
     return "0.0.0.0"
 
@@ -314,7 +316,7 @@ def cdiv(a: int, b: int) -> int:
 
 @lru_cache(maxsize=None)
 def get_nvcc_cuda_version() -> Optional[Version]:
-    cuda_home = os.environ.get('CUDA_HOME')
+    cuda_home = envs.CUDA_HOME
     if not cuda_home:
         cuda_home = '/usr/local/cuda'
         if os.path.isfile(cuda_home + '/bin/nvcc'):
@@ -581,7 +583,7 @@ def find_library(lib_name: str) -> str:
     # libcuda.so.1 (libc6,x86-64) => /lib/x86_64-linux-gnu/libcuda.so.1
     locs = [line.split()[-1] for line in libs.splitlines() if lib_name in line]
     # `LD_LIBRARY_PATH` searches the library in the user-defined paths
-    env_ld_library_path = os.getenv("LD_LIBRARY_PATH")
+    env_ld_library_path = envs.LD_LIBRARY_PATH
     if not locs and env_ld_library_path:
         locs = [
             os.path.join(dir, lib_name)
@@ -594,14 +596,15 @@ def find_library(lib_name: str) -> str:
 
 
 def find_nccl_library():
-    so_file = os.environ.get("VLLM_NCCL_SO_PATH", "")
+    so_file = envs.VLLM_NCCL_SO_PATH
+    VLLM_CONFIG_ROOT = envs.VLLM_CONFIG_ROOT
 
     # check if we have vllm-managed nccl
     vllm_nccl_path = None
     if torch.version.cuda is not None:
         cuda_major = torch.version.cuda.split(".")[0]
         path = os.path.expanduser(
-            f"~/.config/vllm/nccl/cu{cuda_major}/libnccl.so.*")
+            f"{VLLM_CONFIG_ROOT}/vllm/nccl/cu{cuda_major}/libnccl.so.*")
         files = glob.glob(path)
         vllm_nccl_path = files[0] if files else None
 
@@ -626,7 +629,7 @@ def enable_trace_function_call_for_thread() -> None:
     if enabled via the VLLM_TRACE_FUNCTION environment variable
     """
 
-    if int(os.getenv("VLLM_TRACE_FUNCTION", "0")):
+    if envs.VLLM_TRACE_FUNCTION:
         tmp_dir = tempfile.gettempdir()
         filename = (f"VLLM_TRACE_FUNCTION_for_process_{os.getpid()}"
                     f"_thread_{threading.get_ident()}_"

From 32881f3f3106e17d2fd52d8ac00217a0f0b2476a Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Micha=C5=82=20Moskal?= <michal@moskal.me>
Date: Thu, 2 May 2024 11:23:37 -0700
Subject: [PATCH 39/65] [kernel] fix sliding window in prefix prefill Triton
 kernel (#4405)

Co-authored-by: SangBin Cho <rkooo567@gmail.com>
---
 tests/kernels/test_prefix_prefill.py       | 34 ++++++++--
 vllm/attention/backends/flash_attn.py      |  1 +
 vllm/attention/backends/rocm_flash_attn.py |  1 +
 vllm/attention/backends/xformers.py        |  1 +
 vllm/attention/ops/paged_attn.py           |  2 +
 vllm/attention/ops/prefix_prefill.py       | 75 ++++++++++++++++------
 6 files changed, 91 insertions(+), 23 deletions(-)

diff --git a/tests/kernels/test_prefix_prefill.py b/tests/kernels/test_prefix_prefill.py
index ad31b0a7c2a19..8ab1167384c45 100644
--- a/tests/kernels/test_prefix_prefill.py
+++ b/tests/kernels/test_prefix_prefill.py
@@ -15,6 +15,7 @@
 CUDA_DEVICES = [
     f"cuda:{i}" for i in range(1 if torch.cuda.device_count() == 1 else 2)
 ]
+SLIDING_WINDOW = [0, 16, 64, 128, 256, 512, 2048]
 
 
 @pytest.mark.parametrize("num_heads", NUM_HEADS)
@@ -22,11 +23,13 @@
 @pytest.mark.parametrize("head_size", HEAD_SIZES)
 @pytest.mark.parametrize("dtype", DTYPES)
 @pytest.mark.parametrize("device", CUDA_DEVICES)
+@pytest.mark.parametrize("sliding_window", SLIDING_WINDOW)
 @torch.inference_mode()
 def test_contexted_kv_attention(
     num_heads: int,
     num_queries_per_kv: int,
     head_size: int,
+    sliding_window: int,
     dtype: torch.dtype,
     device: str,
 ) -> None:
@@ -123,12 +126,32 @@ def test_contexted_kv_attention(
 
     # Warm up the Triton kernel by calling it once before actually measuring
     # generation time
-    context_attention_fwd(query, k, v, output, k_cache, v_cache, block_table,
-                          b_start_loc, b_seq_len, b_ctx_len, max_input_len)
+    context_attention_fwd(query,
+                          k,
+                          v,
+                          output,
+                          k_cache,
+                          v_cache,
+                          block_table,
+                          b_start_loc,
+                          b_seq_len,
+                          b_ctx_len,
+                          max_input_len,
+                          sliding_window=sliding_window)
     torch.cuda.synchronize()
     start_time = time.time()
-    context_attention_fwd(query, k, v, output, k_cache, v_cache, block_table,
-                          b_start_loc, b_seq_len, b_ctx_len, max_input_len)
+    context_attention_fwd(query,
+                          k,
+                          v,
+                          output,
+                          k_cache,
+                          v_cache,
+                          block_table,
+                          b_start_loc,
+                          b_seq_len,
+                          b_ctx_len,
+                          max_input_len,
+                          sliding_window=sliding_window)
     torch.cuda.synchronize()
     end_time = time.time()
     print(f"triton Time: {(end_time - start_time)*1000:.2f} ms")
@@ -156,6 +179,9 @@ def test_contexted_kv_attention(
 
     attn_bias = BlockDiagonalCausalFromBottomRightMask.from_seqlens(
         subquery_lens, seq_lens)
+    if sliding_window > 0:
+        attn_bias = attn_bias.make_local_attention_from_bottomright(
+            sliding_window)
     output_ref = xops.memory_efficient_attention_forward(
         query,
         key,
diff --git a/vllm/attention/backends/flash_attn.py b/vllm/attention/backends/flash_attn.py
index 12e8c4404b94e..10b8c19b7499e 100644
--- a/vllm/attention/backends/flash_attn.py
+++ b/vllm/attention/backends/flash_attn.py
@@ -249,6 +249,7 @@ def forward(
                     prefill_meta.context_lens,
                     prefill_meta.max_subquery_len,
                     self.alibi_slopes,
+                    self.sliding_window[0],
                 )
         if decode_meta := attn_metadata.decode_metadata:
             # Decoding run.
diff --git a/vllm/attention/backends/rocm_flash_attn.py b/vllm/attention/backends/rocm_flash_attn.py
index b7d15de772556..3bc436315c3de 100644
--- a/vllm/attention/backends/rocm_flash_attn.py
+++ b/vllm/attention/backends/rocm_flash_attn.py
@@ -307,6 +307,7 @@ def forward(
                     prefill_meta.context_lens,
                     prefill_meta.max_subquery_len,
                     self.alibi_slopes,
+                    self.sliding_window[0],
                 )
 
         if decode_meta := attn_metadata.decode_metadata:
diff --git a/vllm/attention/backends/xformers.py b/vllm/attention/backends/xformers.py
index 572a4dc79a719..dc64ac0bf985d 100644
--- a/vllm/attention/backends/xformers.py
+++ b/vllm/attention/backends/xformers.py
@@ -246,6 +246,7 @@ def forward(
                     prefill_meta.context_lens,
                     prefill_meta.max_subquery_len,
                     self.alibi_slopes,
+                    self.sliding_window,
                 )
                 assert output[:num_prefill_tokens].shape == out.shape
                 output[:num_prefill_tokens] = out
diff --git a/vllm/attention/ops/paged_attn.py b/vllm/attention/ops/paged_attn.py
index cd0690a4ba957..c20b94ac8315b 100644
--- a/vllm/attention/ops/paged_attn.py
+++ b/vllm/attention/ops/paged_attn.py
@@ -172,6 +172,7 @@ def forward_prefix(
         context_lens: torch.Tensor,
         max_subquery_len: int,
         alibi_slopes: Optional[torch.Tensor],
+        sliding_window: Optional[int],
     ) -> torch.Tensor:
         output = torch.empty_like(query)
         context_attention_fwd(
@@ -188,6 +189,7 @@ def forward_prefix(
             context_lens,
             max_subquery_len,
             alibi_slopes,
+            sliding_window,
         )
         return output
 
diff --git a/vllm/attention/ops/prefix_prefill.py b/vllm/attention/ops/prefix_prefill.py
index 4896cf3909c6e..79878b26c5294 100644
--- a/vllm/attention/ops/prefix_prefill.py
+++ b/vllm/attention/ops/prefix_prefill.py
@@ -50,6 +50,7 @@ def _fwd_kernel(
         BLOCK_DMODEL: tl.constexpr,  # head size
         BLOCK_DMODEL_PADDED: tl.constexpr,  # head size padded to a power of 2
         BLOCK_N: tl.constexpr,
+        SLIDING_WINDOW: tl.constexpr,
     ):
         cur_batch = tl.program_id(0)
         cur_head = tl.program_id(1)
@@ -62,42 +63,53 @@ def _fwd_kernel(
         cur_batch_in_all_start_index = tl.load(B_Start_Loc + cur_batch)
         cur_batch_query_len = cur_batch_seq_len - cur_batch_ctx_len
 
+        # start position inside of the query
+        # generally, N goes over kv, while M goes over query_len
         block_start_loc = BLOCK_M * start_m
 
         # initialize offsets
+        # [N]; starts at 0
         offs_n = tl.arange(0, BLOCK_N)
+        # [D]; starts at 0
         offs_d = tl.arange(0, BLOCK_DMODEL_PADDED)
+        # [M]; starts at current position in query
         offs_m = start_m * BLOCK_M + tl.arange(0, BLOCK_M)
+        # [M,D]
         off_q = (
             (cur_batch_in_all_start_index + offs_m[:, None]) * stride_qbs +
             cur_head * stride_qh + offs_d[None, :] * stride_qd)
 
         dim_mask = tl.where(
-            tl.arange(0, BLOCK_DMODEL_PADDED) < BLOCK_DMODEL, 1, 0).to(tl.int1)
+            tl.arange(0, BLOCK_DMODEL_PADDED) < BLOCK_DMODEL, 1,
+            0).to(tl.int1)  # [D]
 
         q = tl.load(Q + off_q,
                     mask=dim_mask[None, :] &
                     (offs_m[:, None] < cur_batch_query_len),
-                    other=0.0)
+                    other=0.0)  # [M,D]
 
-        # # initialize pointer to m and l
-        m_i = tl.zeros([BLOCK_M], dtype=tl.float32) - float("inf")
-        l_i = tl.zeros([BLOCK_M], dtype=tl.float32)
-        acc = tl.zeros([BLOCK_M, BLOCK_DMODEL_PADDED], dtype=tl.float32)
+        # initialize running max m_i, running sum l_i and accumulator acc
+        m_i = tl.zeros([BLOCK_M], dtype=tl.float32) - float("inf")  # [M]
+        l_i = tl.zeros([BLOCK_M], dtype=tl.float32)  # [M]
+        acc = tl.zeros([BLOCK_M, BLOCK_DMODEL_PADDED],
+                       dtype=tl.float32)  # [M,D]
 
+        # compute query against context (no causal mask here)
         for start_n in range(0, cur_batch_ctx_len, BLOCK_N):
             start_n = tl.multiple_of(start_n, BLOCK_N)
             # -- compute qk ----
             bn = tl.load(B_Loc + cur_batch * stride_b_loc_b +
                          ((start_n + offs_n) // block_size) * stride_b_loc_s,
                          mask=(start_n + offs_n) < cur_batch_ctx_len,
-                         other=0)
+                         other=0)  # [N]
+            # [D,N]
             off_k = (bn[None, :] * stride_k_cache_bs +
                      cur_kv_head * stride_k_cache_h +
                      (offs_d[:, None] // x) * stride_k_cache_d +
                      ((start_n + offs_n[None, :]) % block_size) *
                      stride_k_cache_bl +
                      (offs_d[:, None] % x) * stride_k_cache_x)
+            # [N,D]
             off_v = (
                 bn[:, None] * stride_v_cache_bs +
                 cur_kv_head * stride_v_cache_h +
@@ -106,23 +118,39 @@ def _fwd_kernel(
             k = tl.load(K_cache + off_k,
                         mask=dim_mask[:, None] &
                         ((start_n + offs_n[None, :]) < cur_batch_ctx_len),
-                        other=0.0)
+                        other=0.0)  # [D,N]
 
-            qk = tl.zeros([BLOCK_M, BLOCK_N], dtype=tl.float32)
+            qk = tl.zeros([BLOCK_M, BLOCK_N], dtype=tl.float32)  # [M,N]
             qk += tl.dot(q, k)
             qk = tl.where((start_n + offs_n[None, :]) < cur_batch_ctx_len, qk,
                           float("-inf"))
             qk *= sm_scale
+            if SLIDING_WINDOW > 0:
+                # (cur_batch_ctx_len + offs_m[:, None]) are the positions of
+                # Q entries in sequence
+                # (start_n + offs_n[None, :]) are the positions of
+                # KV entries in sequence
+                # So the condition makes sure each entry in Q only attends
+                # to KV entries not more than SLIDING_WINDOW away.
+                #
+                # We can't use -inf here, because the
+                # sliding window may lead to the entire row being masked.
+                # This then makes m_ij contain -inf, which causes NaNs in
+                # exp().
+                qk = tl.where((cur_batch_ctx_len + offs_m[:, None]) -
+                              (start_n + offs_n[None, :]) < SLIDING_WINDOW, qk,
+                              -10000)
 
             # -- compute m_ij, p, l_ij
-            m_ij = tl.max(qk, 1)
-            p = tl.exp(qk - m_ij[:, None])
-            l_ij = tl.sum(p, 1)
+            m_ij = tl.max(qk, 1)  # [M]
+            p = tl.exp(qk - m_ij[:, None])  # [M,N]
+            l_ij = tl.sum(p, 1)  # [M]
             # -- update m_i and l_i
-            m_i_new = tl.maximum(m_i, m_ij)
-            alpha = tl.exp(m_i - m_i_new)
-            beta = tl.exp(m_ij - m_i_new)
-            l_i_new = alpha * l_i + beta * l_ij
+            m_i_new = tl.maximum(m_i, m_ij)  # [M]
+            alpha = tl.exp(m_i - m_i_new)  # [M]
+            beta = tl.exp(m_ij - m_i_new)  # [M]
+            l_i_new = alpha * l_i + beta * l_ij  # [M]
+
             # -- update output accumulator --
             # scale p
             p_scale = beta / l_i_new
@@ -134,7 +162,7 @@ def _fwd_kernel(
             v = tl.load(V_cache + off_v,
                         mask=dim_mask[None, :] &
                         ((start_n + offs_n[:, None]) < cur_batch_ctx_len),
-                        other=0.0)
+                        other=0.0)  # [N,D]
 
             p = p.to(v.dtype)
             acc += tl.dot(p, v)
@@ -149,8 +177,10 @@ def _fwd_kernel(
         k_ptrs = K + off_k
         v_ptrs = V + off_v
 
+        # block_mask is 0 when we're already past the current query length
         block_mask = tl.where(block_start_loc < cur_batch_query_len, 1, 0)
 
+        # compute query against itself (with causal mask)
         for start_n in range(0, block_mask * (start_m + 1) * BLOCK_M, BLOCK_N):
             start_n = tl.multiple_of(start_n, BLOCK_N)
             # -- compute qk ----
@@ -163,8 +193,13 @@ def _fwd_kernel(
             qk = tl.zeros([BLOCK_M, BLOCK_N], dtype=tl.float32)
             qk += tl.dot(q, k)
             qk *= sm_scale
+            # apply causal mask
             qk = tl.where(offs_m[:, None] >= (start_n + offs_n[None, :]), qk,
                           float("-inf"))
+            if SLIDING_WINDOW > 0:
+                qk = tl.where(
+                    offs_m[:, None] -
+                    (start_n + offs_n[None, :]) < SLIDING_WINDOW, qk, -10000)
 
             # -- compute m_ij, p, l_ij
             m_ij = tl.max(qk, 1)
@@ -636,7 +671,8 @@ def context_attention_fwd(q,
                               b_seq_len,
                               b_ctx_len,
                               max_input_len,
-                              alibi_slopes=None):
+                              alibi_slopes=None,
+                              sliding_window=None):
 
         cap = torch.cuda.get_device_capability()
         BLOCK = 128 if cap[0] >= 8 else 64
@@ -644,7 +680,7 @@ def context_attention_fwd(q,
         Lq, Lk, Lv = q.shape[-1], k.shape[-1], v.shape[-1]
         assert Lq == Lk and Lk == Lv
         # round up Lk to a power of 2 - this is required for Triton block size
-        Lk_padded = 2**((Lk - 1).bit_length())
+        Lk_padded = triton.next_power_of_2(Lk)
 
         sm_scale = 1.0 / (Lq**0.5)
         batch, head = b_seq_len.shape[0], q.shape[1]
@@ -749,6 +785,7 @@ def context_attention_fwd(q,
             BLOCK_DMODEL=Lk,
             BLOCK_DMODEL_PADDED=Lk_padded,
             BLOCK_N=BLOCK,
+            SLIDING_WINDOW=sliding_window if sliding_window is not None else 0,
             num_warps=num_warps,
             num_stages=1,
         )

From 9b5c9f9484858279a937498ebf9239a9df67f61f Mon Sep 17 00:00:00 2001
From: Alexei-V-Ivanov-AMD
 <156011006+Alexei-V-Ivanov-AMD@users.noreply.github.com>
Date: Thu, 2 May 2024 14:29:07 -0500
Subject: [PATCH 40/65] [CI/Build] AMD CI pipeline with extended set of tests.
 (#4267)

Co-authored-by: simon-mo <simon.mo@hey.com>
---
 .buildkite/run-amd-test.sh    | 58 +++++++++++++++--------------------
 .buildkite/run-benchmarks.sh  |  5 +++
 .buildkite/test-pipeline.yaml | 15 ++++++++-
 .buildkite/test-template.j2   | 21 ++++++++++---
 Dockerfile.rocm               | 13 ++++----
 5 files changed, 67 insertions(+), 45 deletions(-)

diff --git a/.buildkite/run-amd-test.sh b/.buildkite/run-amd-test.sh
index 38aff57a410dc..c04e05a994894 100644
--- a/.buildkite/run-amd-test.sh
+++ b/.buildkite/run-amd-test.sh
@@ -1,10 +1,11 @@
-# This script build the ROCm docker image and run the API server inside the container.
-# It serves a sanity check for compilation and basic model usage.
+# This script builds the ROCm docker image and runs tests inside it.
 set -ex
 
 # Print ROCm version
+echo "--- ROCm info"
 rocminfo
 
+echo "--- Resetting GPUs"
 
 echo "reset" > /opt/amdgpu/etc/gpu_state
 
@@ -16,37 +17,28 @@ while true; do
         fi
 done
 
+echo "--- Building container"
+sha=$(git rev-parse --short HEAD)
+container_name=rocm_${sha}
+docker build \
+        -t ${container_name} \
+        -f Dockerfile.rocm \
+        --progress plain \
+        .
+
+remove_docker_container() {
+   docker rm -f ${container_name} || docker image rm -f ${container_name} || true
+}
+trap remove_docker_container EXIT
 
+echo "--- Running container"
 
-# Try building the docker image
-docker build -t rocm -f Dockerfile.rocm .
-
-# Setup cleanup
-remove_docker_container() { docker rm -f rocm || true; }
-trap remove_docker_container EXIT
-remove_docker_container
-
-# Run the image
-export HIP_VISIBLE_DEVICES=1
-docker run --device /dev/kfd --device /dev/dri --network host -e HIP_VISIBLE_DEVICES --name rocm rocm python3 -m vllm.entrypoints.api_server &
-
-# Wait for the server to start
-wait_for_server_to_start() {
-    timeout=300
-    counter=0
-
-    while [ "$(curl -s -o /dev/null -w ''%{http_code}'' localhost:8000/health)" != "200" ]; do
-        sleep 1
-        counter=$((counter + 1))
-        if [ $counter -ge $timeout ]; then
-            echo "Timeout after $timeout seconds"
-            break
-        fi
-    done
-}
-wait_for_server_to_start
+docker run \
+        --device /dev/kfd --device /dev/dri \
+        --network host \
+        --rm \
+        -e HF_TOKEN \
+        --name ${container_name} \
+        ${container_name} \
+        /bin/bash -c $(echo $1 | sed "s/^'//" | sed "s/'$//")
 
-# Test a simple prompt
-curl -X POST -H "Content-Type: application/json" \
-    localhost:8000/generate \
-    -d '{"prompt": "San Francisco is a"}'
diff --git a/.buildkite/run-benchmarks.sh b/.buildkite/run-benchmarks.sh
index f6a542afe1a3d..7fbad1c4bd950 100644
--- a/.buildkite/run-benchmarks.sh
+++ b/.buildkite/run-benchmarks.sh
@@ -53,6 +53,11 @@ echo '```' >> benchmark_results.md
 tail -n 20 benchmark_serving.txt >> benchmark_results.md # last 20 lines
 echo '```' >> benchmark_results.md
 
+# if the agent binary is not found, skip uploading the results, exit 0
+if [ ! -f /workspace/buildkite-agent ]; then
+    exit 0
+fi
+
 # upload the results to buildkite
 /workspace/buildkite-agent annotate --style "info" --context "benchmark-results" < benchmark_results.md
 
diff --git a/.buildkite/test-pipeline.yaml b/.buildkite/test-pipeline.yaml
index d518fb9ccecfa..e49a5650c44ea 100644
--- a/.buildkite/test-pipeline.yaml
+++ b/.buildkite/test-pipeline.yaml
@@ -20,6 +20,7 @@ steps:
   - VLLM_TEST_ENABLE_ARTIFICIAL_PREEMPT=1 pytest -v -s basic_correctness/test_preemption.py
 
 - label: Core Test
+  mirror_hardwares: [amd]
   command: pytest -v -s core
 
 - label: Distributed Comm Ops Test
@@ -29,7 +30,10 @@ steps:
 
 - label: Distributed Tests
   working_dir: "/vllm-workspace/tests/distributed"
-  num_gpus: 2
+
+  num_gpus: 2 # only support 1 or 2 for now.
+  mirror_hardwares: [amd]
+
   commands:
   - pytest -v -s test_pynccl_library.py
   - TEST_DIST_MODEL=facebook/opt-125m pytest -v -s test_basic_distributed_correctness.py
@@ -44,6 +48,7 @@ steps:
   - pytest -v -s test_pynccl.py
 
 - label: Engine Test
+  mirror_hardwares: [amd]
   command: pytest -v -s engine tokenization test_sequence.py test_config.py test_logger.py
 
 - label: Entrypoints Test
@@ -54,6 +59,7 @@ steps:
 
 - label: Examples Test
   working_dir: "/vllm-workspace/examples"
+  mirror_hardwares: [amd]
   commands:
     # install aws cli for llava_example.py
     - pip install awscli
@@ -67,16 +73,19 @@ steps:
   parallelism: 4
 
 - label: Models Test
+  mirror_hardwares: [amd]
   commands:
     - bash ../.buildkite/download-images.sh
     - pytest -v -s models --ignore=models/test_llava.py --ignore=models/test_mistral.py
 
 - label: Llava Test
+  mirror_hardwares: [amd]
   commands:
     - bash ../.buildkite/download-images.sh
     - pytest -v -s models/test_llava.py
 
 - label: Prefix Caching Test
+  mirror_hardwares: [amd]
   commands:
     - pytest -v -s prefix_caching
 
@@ -84,12 +93,15 @@ steps:
   command: pytest -v -s samplers
 
 - label: LogitsProcessor Test
+  mirror_hardwares: [amd]
   command: pytest -v -s test_logits_processor.py
 
 - label: Worker Test
+  mirror_hardwares: [amd]
   command: pytest -v -s worker
 
 - label: Speculative decoding tests
+  mirror_hardwares: [amd]
   command: pytest -v -s spec_decode
 
 - label: LoRA Test %N
@@ -107,6 +119,7 @@ steps:
 
 - label: Benchmarks
   working_dir: "/vllm-workspace/.buildkite"
+  mirror_hardwares: [amd]
   commands:
   - pip install aiohttp
   - bash run-benchmarks.sh
diff --git a/.buildkite/test-template.j2 b/.buildkite/test-template.j2
index 2cb21cacd065b..ea02b6b1e9c9e 100644
--- a/.buildkite/test-template.j2
+++ b/.buildkite/test-template.j2
@@ -16,18 +16,29 @@ steps:
           limit: 5
   - wait
 
-  - label: "AMD Test"
-    agents:
-      queue: amd
-    command: bash .buildkite/run-amd-test.sh
+  - group: "AMD Tests"
+    depends_on: ~
+    steps:
+    {% for step in steps %}
+    {% if step.mirror_hardwares and "amd" in step.mirror_hardwares %}
+      - label: "AMD: {{ step.label }}"
+        agents:
+          queue: amd
+        command: bash .buildkite/run-amd-test.sh "'cd {{ (step.working_dir or default_working_dir) | safe  }} && {{ step.command  or (step.commands | join(' && ')) | safe }}'"
+        env:
+          DOCKER_BUILDKIT: "1"
+    {% endif %}
+    {% endfor %}
 
   - label: "Neuron Test"
+    depends_on: ~
     agents:
       queue: neuron
     command: bash .buildkite/run-neuron-test.sh
     soft_fail: true
 
-  - label: "CPU Test"
+  - label: "Intel Test"
+    depends_on: ~
     command: bash .buildkite/run-cpu-test.sh
 
   {% for step in steps %}
diff --git a/Dockerfile.rocm b/Dockerfile.rocm
index 3f84b949481d1..d04bb9915e2ab 100644
--- a/Dockerfile.rocm
+++ b/Dockerfile.rocm
@@ -46,7 +46,7 @@ RUN apt-get update && apt-get install -y \
 
 ### Mount Point ###
 # When launching the container, mount the code directory to /app
-ARG APP_MOUNT=/app
+ARG APP_MOUNT=/vllm-workspace
 VOLUME [ ${APP_MOUNT} ]
 WORKDIR ${APP_MOUNT}
 
@@ -89,15 +89,16 @@ RUN if [ "$BUILD_TRITON" = "1" ]; then \
     && cd ../..; \
     fi
 
-COPY ./ /app/vllm
+WORKDIR /vllm-workspace
+COPY . .
 
 RUN python3 -m pip install --upgrade pip numba
 
-RUN cd /app \
-    && cd vllm \
-    && pip install -U -r requirements-rocm.txt \
-    && patch /opt/rocm/include/hip/amd_detail/amd_hip_bf16.h /app/vllm/rocm_patch/rocm_bf16.patch \
+RUN --mount=type=cache,target=/root/.cache/pip \
+    pip install -U -r requirements-rocm.txt \
+    && patch /opt/rocm/include/hip/amd_detail/amd_hip_bf16.h ./rocm_patch/rocm_bf16.patch \
     && python3 setup.py install \
+    && cp build/lib.linux-x86_64-cpython-39/vllm/_C.cpython-39-x86_64-linux-gnu.so vllm/ \
     && cd ..
 
 RUN python3 -m pip install --upgrade pip

From 0f8a91401c89ac0a8018def3756829611b57727f Mon Sep 17 00:00:00 2001
From: SangBin Cho <rkooo567@gmail.com>
Date: Fri, 3 May 2024 06:31:20 +0900
Subject: [PATCH 41/65] [Core] Ignore infeasible swap requests. (#4557)

---
 tests/basic_correctness/test_preemption.py   | 85 ++++++++++++++++++++
 tests/core/test_block_manager.py             |  2 +-
 tests/core/test_chunked_prefill_scheduler.py |  5 +-
 tests/core/test_scheduler.py                 | 30 ++++++-
 vllm/core/block/cpu_gpu_block_allocator.py   | 19 ++---
 vllm/core/block/interfaces.py                | 21 +++--
 vllm/core/block/naive_block.py               |  6 +-
 vllm/core/block/prefix_caching_block.py      |  3 +
 vllm/core/block_manager_v1.py                | 19 ++++-
 vllm/core/block_manager_v2.py                |  4 +-
 vllm/core/interfaces.py                      |  2 +-
 vllm/core/scheduler.py                       | 33 +++++---
 12 files changed, 187 insertions(+), 42 deletions(-)

diff --git a/tests/basic_correctness/test_preemption.py b/tests/basic_correctness/test_preemption.py
index 1adfc7dddd6fa..ffb0717b3bfdb 100644
--- a/tests/basic_correctness/test_preemption.py
+++ b/tests/basic_correctness/test_preemption.py
@@ -7,6 +7,7 @@
 """
 import pytest
 
+from vllm import SamplingParams
 from vllm.core.scheduler import (ARTIFICIAL_PREEMPTION_MAX_CNT,
                                  ENABLE_ARTIFICIAL_PREEMPT)
 
@@ -136,3 +137,87 @@ def test_swap(
             assert hf_output_ids[j] == vllm_output_ids[j], (
                 f"Test{i} output{j}:\nHF: {hf_output_ids}\n"
                 f"vLLM: {vllm_output_ids}")
+
+
+@pytest.mark.parametrize("model", MODELS)
+@pytest.mark.parametrize("dtype", ["float"])
+@pytest.mark.parametrize("max_tokens", [96])
+@pytest.mark.parametrize("beam_width", [4])
+def test_swap_infeasible(
+    vllm_runner,
+    example_prompts,
+    model: str,
+    dtype: str,
+    max_tokens: int,
+    beam_width: int,
+) -> None:
+    """Verify infeasible swap request will be ignored."""
+    BLOCK_SIZE = 16
+    prefill_blocks = 2
+    decode_blocks = max_tokens // BLOCK_SIZE
+    example_prompts = example_prompts[:1]
+
+    vllm_model = vllm_runner(
+        model,
+        dtype=dtype,
+        swap_space=10,
+        block_size=BLOCK_SIZE,
+        # Since beam search has more than 1 sequence, prefill + decode blocks
+        # are not enough to finish.
+        num_gpu_blocks_override=prefill_blocks + decode_blocks,
+        max_model_len=(prefill_blocks + decode_blocks) * BLOCK_SIZE,
+    )
+    sampling_params = SamplingParams(n=beam_width,
+                                     use_beam_search=True,
+                                     temperature=0.0,
+                                     max_tokens=max_tokens,
+                                     ignore_eos=True)
+    req_outputs = vllm_model.model.generate(
+        example_prompts,
+        sampling_params=sampling_params,
+    )
+    assert (vllm_model.model.llm_engine.scheduler.artificial_preempt_cnt <
+            ARTIFICIAL_PREEMPTION_MAX_CNT)
+    del vllm_model
+    # Verify the request is ignored and does not hang.
+    assert req_outputs[0].outputs[0].finish_reason == "length"
+
+
+@pytest.mark.parametrize("model", MODELS)
+@pytest.mark.parametrize("dtype", ["float"])
+@pytest.mark.parametrize("max_tokens", [96])
+def test_preemption_infeasible(
+    vllm_runner,
+    example_prompts,
+    model: str,
+    dtype: str,
+    max_tokens: int,
+) -> None:
+    """Verify infeasible preemption request will be ignored."""
+    BLOCK_SIZE = 16
+    prefill_blocks = 2
+    decode_blocks = max_tokens // BLOCK_SIZE
+    vllm_model = vllm_runner(
+        model,
+        dtype=dtype,
+        block_size=BLOCK_SIZE,
+        # Not enough gpu blocks to complete a single sequence.
+        # preemption should happen, and the sequence should be
+        # ignored instead of hanging forever.
+        num_gpu_blocks_override=prefill_blocks + decode_blocks // 2,
+        max_model_len=((prefill_blocks + decode_blocks // 2) * BLOCK_SIZE),
+    )
+    sampling_params = SamplingParams(max_tokens=max_tokens, ignore_eos=True)
+    req_outputs = vllm_model.model.generate(
+        example_prompts,
+        sampling_params=sampling_params,
+    )
+
+    assert (vllm_model.model.llm_engine.scheduler.artificial_preempt_cnt <
+            ARTIFICIAL_PREEMPTION_MAX_CNT)
+    del vllm_model
+    # Verify the request is ignored and does not hang.
+    for req_output in req_outputs:
+        outputs = req_output.outputs
+        assert len(outputs) == 1
+        assert outputs[0].finish_reason == "length"
diff --git a/tests/core/test_block_manager.py b/tests/core/test_block_manager.py
index 62984ef4caabb..9f9a6180add78 100644
--- a/tests/core/test_block_manager.py
+++ b/tests/core/test_block_manager.py
@@ -224,7 +224,7 @@ def test_swap():
 
     # Swap seq group from CPU -> GPU.
     cpu_blocks = block_manager.get_block_table(prompt)
-    assert block_manager.can_swap_in(seq_group)
+    assert block_manager.can_swap_in(seq_group) == AllocStatus.OK
     before_cpu_blocks = block_manager.get_num_free_cpu_blocks()
     before_gpu_blocks = block_manager.get_num_free_gpu_blocks()
     mapping = block_manager.swap_in(seq_group)
diff --git a/tests/core/test_chunked_prefill_scheduler.py b/tests/core/test_chunked_prefill_scheduler.py
index cce396bf4953c..92498c0014666 100644
--- a/tests/core/test_chunked_prefill_scheduler.py
+++ b/tests/core/test_chunked_prefill_scheduler.py
@@ -4,6 +4,7 @@
 import pytest  # noqa
 
 from vllm.config import CacheConfig, SchedulerConfig
+from vllm.core.interfaces import AllocStatus
 from vllm.core.scheduler import Scheduler
 from vllm.sequence import Logprob, SequenceGroup
 
@@ -410,7 +411,7 @@ def cannot_append_second_group(seq_group, num_lookahead_slots):
 
     # Add 1 more task. Swap is not possible, so prefill is running.
     scheduler.block_manager.can_swap_in = MagicMock()
-    scheduler.block_manager.can_swap_in.return_value = False
+    scheduler.block_manager.can_swap_in.return_value = AllocStatus.LATER
 
     _, seq_group2 = create_dummy_prompt("2", prompt_length=60)
     scheduler.add_seq_group(seq_group2)
@@ -423,7 +424,7 @@ def cannot_append_second_group(seq_group, num_lookahead_slots):
     assert out.scheduled_seq_groups[0].seq_group == seq_group2
 
     # Now although swap is possible, running prefill is prioritized.
-    scheduler.block_manager.can_swap_in.return_value = True
+    scheduler.block_manager.can_swap_in.return_value = AllocStatus.OK
     _, out = schedule_and_update_computed_tokens(scheduler)
     assert len(out.scheduled_seq_groups) == 1
     # 3 decodes. It is swapped in.
diff --git a/tests/core/test_scheduler.py b/tests/core/test_scheduler.py
index ab471d206618b..1358dffec8104 100644
--- a/tests/core/test_scheduler.py
+++ b/tests/core/test_scheduler.py
@@ -791,7 +791,7 @@ def test_schedule_swapped_cannot_swap_in():
 
     # The last request should be swapped out.
     scheduler.block_manager.can_swap_in = MagicMock()
-    scheduler.block_manager.can_swap_in.return_value = False
+    scheduler.block_manager.can_swap_in.return_value = AllocStatus.LATER
     # Since we cannot swap in, none of the requests are swapped in.
     budget = create_token_budget()
     remaining_swapped, output = scheduler._schedule_swapped(
@@ -803,6 +803,34 @@ def test_schedule_swapped_cannot_swap_in():
     assert len(output.prefill_seq_groups) == 0
 
 
+def test_infeasible_swap():
+    scheduler = initialize_scheduler()
+    swapped = deque()
+    policy = PolicyFactory.get_policy(policy_name="fcfs")
+    curr_loras = None
+    blocks_to_swap_out = {}
+    for _ in range(2):
+        _, seq_group = create_dummy_prompt("1", prompt_length=60, best_of=2)
+        scheduler._allocate_and_set_running(seq_group)
+        append_new_token_seq_group(60, seq_group, 1)
+        scheduler._swap_out(seq_group, blocks_to_swap_out)
+        swapped.append(seq_group)
+
+    # Both requests were swapped out above; mock the swap-in check.
+    scheduler.block_manager.can_swap_in = MagicMock()
+    scheduler.block_manager.can_swap_in.return_value = AllocStatus.NEVER
+    # Since swap-in can never succeed, both requests are marked infeasible.
+    budget = create_token_budget()
+    remaining_swapped, output = scheduler._schedule_swapped(
+        swapped, budget, curr_loras, policy)
+    assert len(remaining_swapped) == 0
+    assert len(output.infeasible_seq_groups) == 2
+    assert budget.num_batched_tokens == 0
+    assert budget.num_curr_seqs == 0
+    assert len(output.decode_seq_groups) == 0
+    assert len(output.prefill_seq_groups) == 0
+
+
 def test_schedule_swapped_blocks_to_copy():
     scheduler = initialize_scheduler()
     swapped = deque()
diff --git a/vllm/core/block/cpu_gpu_block_allocator.py b/vllm/core/block/cpu_gpu_block_allocator.py
index d25d22cf52838..5b25e1bcdada0 100644
--- a/vllm/core/block/cpu_gpu_block_allocator.py
+++ b/vllm/core/block/cpu_gpu_block_allocator.py
@@ -110,9 +110,8 @@ def __init__(
             for block_id in allocator.all_block_ids:
                 self._block_ids_to_allocator[block_id] = allocator
 
-    def allocate_mutable(self,
-                         prev_block: Optional[Block],
-                         device: Optional[Device] = None) -> Block:
+    def allocate_mutable(self, prev_block: Optional[Block],
+                         device: Device) -> Block:
         """Allocates a new mutable block on the specified device.
 
         Args:
@@ -123,13 +122,10 @@ def allocate_mutable(self,
         Returns:
             Block: The newly allocated mutable block.
         """
-        assert device is not None
         return self._allocators[device].allocate_mutable(prev_block)
 
-    def allocate_immutable(self,
-                           prev_block: Optional[Block],
-                           token_ids: List[int],
-                           device: Optional[Device] = None) -> Block:
+    def allocate_immutable(self, prev_block: Optional[Block],
+                           token_ids: List[int], device: Device) -> Block:
         """Allocates a new immutable block with the provided token IDs on the
         specified device.
 
@@ -144,7 +140,6 @@ def allocate_immutable(self,
             Block: The newly allocated immutable block containing the provided
                 token IDs.
         """
-        assert device is not None
         return self._allocators[device].allocate_immutable(
             prev_block, token_ids)
 
@@ -175,7 +170,7 @@ def fork(self, last_block: Block) -> List[Block]:
         allocator = self._block_ids_to_allocator[block_id]
         return allocator.fork(last_block)
 
-    def get_num_free_blocks(self, device: Optional[Device] = None) -> int:
+    def get_num_free_blocks(self, device: Device) -> int:
         """Returns the number of free blocks available on the specified device.
 
         Args:
@@ -185,9 +180,11 @@ def get_num_free_blocks(self, device: Optional[Device] = None) -> int:
         Returns:
             int: The number of free blocks available on the specified device.
         """
-        assert device is not None
         return self._allocators[device].get_num_free_blocks()
 
+    def get_num_total_blocks(self, device: Device) -> int:
+        return self._allocators[device].get_num_total_blocks()
+
     def clear_copy_on_writes(self) -> Dict[int, List[int]]:
         """Clears the copy-on-write (CoW) state and returns the mapping of
             source to destination block IDs.
diff --git a/vllm/core/block/interfaces.py b/vllm/core/block/interfaces.py
index 08d2f87301d92..634c4016ca19c 100644
--- a/vllm/core/block/interfaces.py
+++ b/vllm/core/block/interfaces.py
@@ -108,6 +108,10 @@ def free(self, block: Block) -> None:
     def fork(self, last_block: Block) -> List[Block]:
         pass
 
+    @abstractmethod
+    def get_num_total_blocks(self) -> int:
+        pass
+
     @abstractmethod
     def get_num_free_blocks(self) -> int:
         pass
@@ -152,20 +156,21 @@ class NoFreeBlocksError(ValueError):
 class DeviceAwareBlockAllocator(ABC):
 
     @abstractmethod
-    def allocate_mutable(self,
-                         prev_block: Optional[Block],
-                         device: Optional[Device] = None) -> Block:
+    def allocate_mutable(self, prev_block: Optional[Block],
+                         device: Device) -> Block:
+        pass
+
+    @abstractmethod
+    def allocate_immutable(self, prev_block: Optional[Block],
+                           token_ids: List[int], device: Device) -> Block:
         pass
 
     @abstractmethod
-    def allocate_immutable(self,
-                           prev_block: Optional[Block],
-                           token_ids: List[int],
-                           device: Optional[Device] = None) -> Block:
+    def get_num_free_blocks(self, device: Device) -> int:
         pass
 
     @abstractmethod
-    def get_num_free_blocks(self, device: Optional[Device] = None) -> int:
+    def get_num_total_blocks(self, device: Device) -> int:
         pass
 
     @abstractmethod
diff --git a/vllm/core/block/naive_block.py b/vllm/core/block/naive_block.py
index 10af129246889..a1b901bf78efc 100644
--- a/vllm/core/block/naive_block.py
+++ b/vllm/core/block/naive_block.py
@@ -133,10 +133,12 @@ def fork(self, last_block: Block) -> List[Block]:
 
         return forked_blocks
 
-    def get_num_free_blocks(self, device: Optional[Device] = None) -> int:
-        assert device is None
+    def get_num_free_blocks(self) -> int:
         return len(self._free_block_indices)
 
+    def get_num_total_blocks(self) -> int:
+        return len(self._all_block_indices)
+
     def _allocate_new_block_id(self) -> BlockId:
         if not self._free_block_indices:
             raise BlockAllocator.NoFreeBlocksError()
diff --git a/vllm/core/block/prefix_caching_block.py b/vllm/core/block/prefix_caching_block.py
index e9000c9bfff7f..4a37e8f87c379 100644
--- a/vllm/core/block/prefix_caching_block.py
+++ b/vllm/core/block/prefix_caching_block.py
@@ -285,6 +285,9 @@ def get_num_free_blocks(self, device: Optional[Device] = None) -> int:
         return self._hashless_allocator.get_num_free_blocks(
         ) + self.evictor.num_blocks
 
+    def get_num_total_blocks(self) -> int:
+        return self._hashless_allocator.get_num_total_blocks()
+
     @property
     def all_block_ids(self) -> FrozenSet[int]:
         return self._hashless_allocator.all_block_ids
diff --git a/vllm/core/block_manager_v1.py b/vllm/core/block_manager_v1.py
index 4a9a2999e3913..268c5c135d887 100644
--- a/vllm/core/block_manager_v1.py
+++ b/vllm/core/block_manager_v1.py
@@ -47,6 +47,10 @@ def free(self, block: PhysicalTokenBlock) -> None:
     def get_num_free_blocks(self) -> int:
         pass
 
+    @abstractmethod
+    def get_num_total_blocks(self) -> int:
+        pass
+
     @abstractmethod
     def contains_block(self, block_hash: int) -> bool:
         pass
@@ -131,6 +135,9 @@ def get_num_free_blocks(self) -> int:
         return (self.num_blocks - self.current_num_blocks +
                 self.evictor.num_blocks)
 
+    def get_num_total_blocks(self) -> int:
+        return self.num_blocks
+
     def contains_block(self, block_hash: int) -> bool:
         return block_hash in self.cached_blocks or block_hash in self.evictor
 
@@ -190,6 +197,9 @@ def free(self, block: PhysicalTokenBlock) -> None:
     def get_num_free_blocks(self) -> int:
         return len(self.free_blocks)
 
+    def get_num_total_blocks(self) -> int:
+        return self.num_blocks
+
     def contains_block(self, block_hash: int) -> bool:
         raise NotImplementedError(
             "Invalid codepath for uncached block allocator.")
@@ -444,7 +454,7 @@ def _get_physical_blocks(
 
     def can_swap_in(self,
                     seq_group: SequenceGroup,
-                    num_lookahead_slots: int = 0) -> bool:
+                    num_lookahead_slots: int = 0) -> AllocStatus:
         assert (num_lookahead_slots == 0
                 ), "BlockSpaceManagerV1 does not support lookahead allocation"
         blocks = self._get_physical_blocks(seq_group)
@@ -454,7 +464,12 @@ def can_swap_in(self,
         # at least one free block right after the swap-in.
         # NOTE: This should match the logic in can_append_slot().
         num_required_blocks = len(blocks) + num_swapped_seqs
-        return num_free_blocks - num_required_blocks >= self.watermark_blocks
+        if self.gpu_allocator.get_num_total_blocks() < num_required_blocks:
+            return AllocStatus.NEVER
+        elif num_free_blocks - num_required_blocks >= self.watermark_blocks:
+            return AllocStatus.OK
+        else:
+            return AllocStatus.LATER
 
     def swap_in(self,
                 seq_group: SequenceGroup,
diff --git a/vllm/core/block_manager_v2.py b/vllm/core/block_manager_v2.py
index 3fbd8b787cf6c..ce90ce2f17278 100644
--- a/vllm/core/block_manager_v2.py
+++ b/vllm/core/block_manager_v2.py
@@ -238,8 +238,8 @@ def fork(self, parent_seq: Sequence, child_seq: Sequence) -> None:
         self.block_tables[child_seq.seq_id] = src_block_table.fork()
 
     def can_swap_in(self, seq_group: SequenceGroup,
-                    num_lookahead_slots: int) -> bool:
-        return False
+                    num_lookahead_slots: int) -> AllocStatus:
+        return AllocStatus.LATER
 
     def swap_in(self, seq_group: SequenceGroup,
                 num_lookahead_slots: int) -> Dict[int, int]:
diff --git a/vllm/core/interfaces.py b/vllm/core/interfaces.py
index 56c2c5995c38b..09ccaddb62615 100644
--- a/vllm/core/interfaces.py
+++ b/vllm/core/interfaces.py
@@ -63,7 +63,7 @@ def fork(self, parent_seq: Sequence, child_seq: Sequence) -> None:
 
     @abstractmethod
     def can_swap_in(self, seq_group: SequenceGroup,
-                    num_lookahead_slots: int) -> bool:
+                    num_lookahead_slots: int) -> AllocStatus:
         pass
 
     @abstractmethod
diff --git a/vllm/core/scheduler.py b/vllm/core/scheduler.py
index b17b6cc7fe733..7c55b08d4857d 100644
--- a/vllm/core/scheduler.py
+++ b/vllm/core/scheduler.py
@@ -210,6 +210,8 @@ class SchedulerSwappedInOutputs:
     blocks_to_copy: Dict[int, List[int]]
     # The number of slots for lookahead decoding.
     num_lookahead_slots: int
+    # Infeasible sequence groups.
+    infeasible_seq_groups: List[SequenceGroup]
 
     @classmethod
     def create_empty(cls) -> "SchedulerSwappedInOutputs":
@@ -219,6 +221,7 @@ def create_empty(cls) -> "SchedulerSwappedInOutputs":
             blocks_to_swap_in={},
             blocks_to_copy={},
             num_lookahead_slots=0,
+            infeasible_seq_groups=[],
         )
 
 
@@ -511,14 +514,26 @@ def _schedule_swapped(
         prefill_seq_groups: List[ScheduledSequenceGroup] = []
         now = time.time()
         swapped_queue = policy.sort_by_priority(now, swapped_queue)
+        infeasible_seq_groups: List[SequenceGroup] = []
 
         leftover_swapped: Deque[SequenceGroup] = deque()
         while swapped_queue:
             seq_group = swapped_queue[0]
 
             # If the sequence group cannot be swapped in, stop.
-            if not self.block_manager.can_swap_in(seq_group):
+            alloc_status = self.block_manager.can_swap_in(seq_group)
+            if alloc_status == AllocStatus.LATER:
                 break
+            elif alloc_status == AllocStatus.NEVER:
+                logger.warning(
+                    "Failing the request %s because there's not enough kv "
+                    "cache blocks to run the entire sequence.",
+                    seq_group.request_id)
+                for seq in seq_group.get_seqs():
+                    seq.status = SequenceStatus.FINISHED_IGNORED
+                infeasible_seq_groups.append(seq_group)
+                swapped_queue.popleft()
+                continue
 
             lora_int_id = 0
             if self.lora_enabled:
@@ -569,7 +584,9 @@ def _schedule_swapped(
             blocks_to_swap_in=blocks_to_swap_in,
             blocks_to_copy=blocks_to_copy,
             num_lookahead_slots=self._get_num_lookahead_slots(
-                is_prefill=False))
+                is_prefill=False),
+            infeasible_seq_groups=infeasible_seq_groups,
+        )
 
     def _schedule_prefills(
         self,
@@ -777,7 +794,8 @@ def _schedule_default(self) -> SchedulerOutputs:
             blocks_to_swap_out=running_scheduled.blocks_to_swap_out,
             blocks_to_copy=merge_dicts(running_scheduled.blocks_to_copy,
                                        swapped_in.blocks_to_copy),
-            ignored_seq_groups=prefills.ignored_seq_groups,
+            ignored_seq_groups=prefills.ignored_seq_groups +
+            swapped_in.infeasible_seq_groups,
             num_lookahead_slots=running_scheduled.num_lookahead_slots,
         )
 
@@ -893,15 +911,6 @@ def _can_append_slots(self, seq_group: SequenceGroup) -> bool:
             num_lookahead_slots=self._get_num_lookahead_slots(is_prefill),
         )
 
-    def _can_swap_in(self, seq_group: SequenceGroup) -> bool:
-        # Swapping in is considered decode.
-        is_prefill = False
-
-        return self.block_manager.can_swap_in(
-            seq_group=seq_group,
-            num_lookahead_slots=self._get_num_lookahead_slots(is_prefill),
-        )
-
     def schedule(self) -> Tuple[List[SequenceGroupMetadata], SchedulerOutputs]:
         # Schedule sequence groups.
         # This function call changes the internal states of the scheduler

From 344a5d0c332c3945caf336fd1d21f450f1455e6c Mon Sep 17 00:00:00 2001
From: youkaichao <youkaichao@gmail.com>
Date: Thu, 2 May 2024 17:32:33 -0700
Subject: [PATCH 42/65] [Core][Distributed] enable allreduce for multiple tp
 groups (#4566)

---
 tests/distributed/test_pynccl.py     | 43 +++++++++++++++++++++++++---
 vllm/distributed/communication_op.py |  1 -
 vllm/distributed/parallel_state.py   | 36 ++++++++++++++++-------
 vllm/worker/worker.py                | 13 +++++----
 4 files changed, 71 insertions(+), 22 deletions(-)

diff --git a/tests/distributed/test_pynccl.py b/tests/distributed/test_pynccl.py
index e71d839648c83..b6f461b76ed03 100644
--- a/tests/distributed/test_pynccl.py
+++ b/tests/distributed/test_pynccl.py
@@ -3,9 +3,13 @@
 import pytest
 import torch
 
+import vllm.distributed.device_communicators.pynccl_utils as pynccl_utils
+from vllm.distributed.communication_op import tensor_model_parallel_all_reduce
 from vllm.distributed.device_communicators.pynccl import (NCCLCommunicator,
                                                           ncclGetUniqueId)
-from vllm.distributed.parallel_state import init_distributed_environment
+from vllm.distributed.parallel_state import (
+    ensure_model_parallel_initialized, get_tensor_model_parallel_cpu_group,
+    init_distributed_environment, with_pynccl_for_all_reduce)
 from vllm.utils import update_environment_variables
 
 
@@ -67,7 +71,7 @@ def multiple_tp_worker_fn():
     ]
     group = groups[0] if torch.distributed.get_rank() in [0, 1] else groups[1]
     comm = NCCLCommunicator(group=group, device=device)
-    tensor = torch.ones(16, 1024, 1024, dtype=torch.float32).cuda(comm.rank)
+    tensor = torch.ones(16, 1024, 1024, dtype=torch.float32, device=device)
     # two groups can communicate independently
     if torch.distributed.get_rank() in [0, 1]:
         comm.all_reduce(tensor)
@@ -81,9 +85,40 @@ def multiple_tp_worker_fn():
 
 
 @pytest.mark.skipif(torch.cuda.device_count() < 4,
-                    reason="Need at least 2 GPUs to run the test.")
+                    reason="Need at least 4 GPUs to run the test.")
 def test_pynccl_multiple_tp():
-    distributed_run(worker_fn, 4)
+    # this tests pynccl for multiple tp groups, in a standalone way
+    # i.e. call `comm.all_reduce` directly
+    distributed_run(multiple_tp_worker_fn, 4)
+
+
+@worker_fn_wrapper
+def multiple_tp_with_vllm_worker_fn():
+    device = torch.device(f"cuda:{torch.distributed.get_rank()}")
+    torch.cuda.set_device(torch.distributed.get_rank())
+    ensure_model_parallel_initialized(2, 2)
+    pynccl_utils.init_process_group(
+        group=get_tensor_model_parallel_cpu_group())
+    tensor = torch.ones(16, 1024, 1024, dtype=torch.float32, device=device)
+    with with_pynccl_for_all_reduce():
+        # two tp groups can communicate independently
+        if torch.distributed.get_rank() in [0, 1]:
+            tensor = tensor_model_parallel_all_reduce(tensor)
+            tensor = tensor_model_parallel_all_reduce(tensor)
+            result = tensor.mean().cpu().item()
+            assert result == 4
+        else:
+            tensor = tensor_model_parallel_all_reduce(tensor)
+            result = tensor.mean().cpu().item()
+            assert result == 2
+
+
+@pytest.mark.skipif(torch.cuda.device_count() < 4,
+                    reason="Need at least 4 GPUs to run the test.")
+def test_pynccl_multiple_tp_with_vllm():
+    # this tests pynccl for multiple tp groups, together with vllm
+    # i.e. call `tensor_model_parallel_all_reduce`
+    distributed_run(multiple_tp_with_vllm_worker_fn, 4)
 
 
 @worker_fn_wrapper
diff --git a/vllm/distributed/communication_op.py b/vllm/distributed/communication_op.py
index 8b2c26c3a8afb..b539a7beedbfe 100644
--- a/vllm/distributed/communication_op.py
+++ b/vllm/distributed/communication_op.py
@@ -34,7 +34,6 @@ def tensor_model_parallel_all_reduce(input_: torch.Tensor) -> torch.Tensor:
     if out is not None:
         return out
     if is_pynccl_enabled_for_all_reduce():
-        # TODO: support multiple parallel groups.
         pynccl_utils.all_reduce(input_)
     else:
         torch.distributed.all_reduce(input_,
diff --git a/vllm/distributed/parallel_state.py b/vllm/distributed/parallel_state.py
index a82a1254693df..be5bb4e857caf 100644
--- a/vllm/distributed/parallel_state.py
+++ b/vllm/distributed/parallel_state.py
@@ -14,7 +14,8 @@
 logger = init_logger(__name__)
 
 # Tensor model parallel group that the current rank belongs to.
-_TENSOR_MODEL_PARALLEL_GROUP = None
+_TP_DEVICE_GROUP = None
+_TP_CPU_GROUP = None
 # Pipeline model parallel group that the current rank belongs to.
 _PIPELINE_MODEL_PARALLEL_GROUP = None
 
@@ -132,15 +133,17 @@ def initialize_model_parallel(
     rank = torch.distributed.get_rank()
 
     # Build the tensor model-parallel groups.
-    global _TENSOR_MODEL_PARALLEL_GROUP
-    assert _TENSOR_MODEL_PARALLEL_GROUP is None, (
+    global _TP_DEVICE_GROUP, _TP_CPU_GROUP
+    assert _TP_DEVICE_GROUP is None, (
         "tensor model parallel group is already initialized")
     for i in range(num_tensor_model_parallel_groups):
         ranks = range(i * tensor_model_parallel_size,
                       (i + 1) * tensor_model_parallel_size)
         group = torch.distributed.new_group(ranks, backend=backend)
+        cpu_group = torch.distributed.new_group(ranks, backend="gloo")
         if rank in ranks:
-            _TENSOR_MODEL_PARALLEL_GROUP = group
+            _TP_DEVICE_GROUP = group
+            _TP_CPU_GROUP = cpu_group
 
     # Build the pipeline model-parallel groups.
     global _PIPELINE_MODEL_PARALLEL_GROUP
@@ -185,7 +188,7 @@ def ensure_model_parallel_initialized(
 
 def model_parallel_is_initialized():
     """Check if tensor and pipeline parallel groups are initialized."""
-    return (_TENSOR_MODEL_PARALLEL_GROUP is not None
+    return (_TP_DEVICE_GROUP is not None
             and _PIPELINE_MODEL_PARALLEL_GROUP is not None)
 
 
@@ -197,9 +200,16 @@ def get_cpu_world_group():
 
 def get_tensor_model_parallel_group():
     """Get the tensor model parallel group the caller rank belongs to."""
-    assert _TENSOR_MODEL_PARALLEL_GROUP is not None, (
+    assert _TP_DEVICE_GROUP is not None, (
         "tensor model parallel group is not initialized")
-    return _TENSOR_MODEL_PARALLEL_GROUP
+    return _TP_DEVICE_GROUP
+
+
+def get_tensor_model_parallel_cpu_group():
+    """Get the tensor model parallel cpu group the caller rank belongs to."""
+    assert _TP_CPU_GROUP is not None, (
+        "tensor model parallel cpu group is not initialized")
+    return _TP_CPU_GROUP
 
 
 def get_pipeline_model_parallel_group():
@@ -277,10 +287,14 @@ def get_pipeline_model_parallel_prev_rank():
 
 def destroy_model_parallel():
     """Set the groups to none and destroy them."""
-    global _TENSOR_MODEL_PARALLEL_GROUP
-    if _TENSOR_MODEL_PARALLEL_GROUP:
-        torch.distributed.destroy_process_group(_TENSOR_MODEL_PARALLEL_GROUP)
-    _TENSOR_MODEL_PARALLEL_GROUP = None
+    global _TP_DEVICE_GROUP
+    if _TP_DEVICE_GROUP:
+        torch.distributed.destroy_process_group(_TP_DEVICE_GROUP)
+    _TP_DEVICE_GROUP = None
+    global _TP_CPU_GROUP
+    if _TP_CPU_GROUP:
+        torch.distributed.destroy_process_group(_TP_CPU_GROUP)
+    _TP_CPU_GROUP = None
     global _PIPELINE_MODEL_PARALLEL_GROUP
     if _PIPELINE_MODEL_PARALLEL_GROUP:
         torch.distributed.destroy_process_group(_PIPELINE_MODEL_PARALLEL_GROUP)
diff --git a/vllm/worker/worker.py b/vllm/worker/worker.py
index 39ad428f16fe3..808261e47318b 100644
--- a/vllm/worker/worker.py
+++ b/vllm/worker/worker.py
@@ -11,6 +11,7 @@
                          VisionLanguageConfig)
 from vllm.distributed import (broadcast_tensor_dict,
                               ensure_model_parallel_initialized,
+                              get_tensor_model_parallel_cpu_group,
                               init_distributed_environment)
 from vllm.distributed.device_communicators import pynccl_utils
 from vllm.distributed.device_communicators.custom_all_reduce import (
@@ -288,6 +289,9 @@ def init_worker_distributed_environment(
     init_distributed_environment(parallel_config.world_size, rank,
                                  distributed_init_method, local_rank)
 
+    ensure_model_parallel_initialized(parallel_config.tensor_parallel_size,
+                                      parallel_config.pipeline_parallel_size)
+
     if pynccl_utils.is_initialized():
         pynccl_world_size = pynccl_utils.get_world_size()
         if pynccl_world_size != parallel_config.world_size:
@@ -298,12 +302,9 @@ def init_worker_distributed_environment(
     elif parallel_config.world_size > 1:
         # NOTE(woosuk): We don't initialize pynccl process group when world size
         # is 1.
-        # NOTE(kaichao): By default, pynccl will use information inside
-        # `parallel_state` for initialization.
-        pynccl_utils.init_process_group()
-
-    ensure_model_parallel_initialized(parallel_config.tensor_parallel_size,
-                                      parallel_config.pipeline_parallel_size)
+        # NOTE(kaichao): By default, pynccl is initialized for tp group.
+        pynccl_utils.init_process_group(
+            group=get_tensor_model_parallel_cpu_group())
 
     # Initialize a custom fast all-reduce implementation.
     if not parallel_config.disable_custom_all_reduce:

From 808632d3b4effd3c0807325b529d0354894c31b1 Mon Sep 17 00:00:00 2001
From: "Yang, Bo" <pop.atry@gmail.com>
Date: Thu, 2 May 2024 18:35:18 -0700
Subject: [PATCH 43/65] [BugFix] Prevent the task of `_force_log` from being
 garbage collected (#4567)

---
 vllm/entrypoints/openai/api_server.py | 7 ++++++-
 1 file changed, 6 insertions(+), 1 deletion(-)

diff --git a/vllm/entrypoints/openai/api_server.py b/vllm/entrypoints/openai/api_server.py
index 8b3c5ea9de9c0..f9e294af47253 100644
--- a/vllm/entrypoints/openai/api_server.py
+++ b/vllm/entrypoints/openai/api_server.py
@@ -4,6 +4,7 @@
 import re
 from contextlib import asynccontextmanager
 from http import HTTPStatus
+from typing import Any, Set
 
 import fastapi
 import uvicorn
@@ -33,6 +34,8 @@
 openai_serving_completion: OpenAIServingCompletion
 logger = init_logger(__name__)
 
+_running_tasks: Set[asyncio.Task[Any]] = set()
+
 
 @asynccontextmanager
 async def lifespan(app: fastapi.FastAPI):
@@ -43,7 +46,9 @@ async def _force_log():
             await engine.do_log_stats()
 
     if not engine_args.disable_log_stats:
-        asyncio.create_task(_force_log())
+        task = asyncio.create_task(_force_log())
+        _running_tasks.add(task)
+        task.add_done_callback(_running_tasks.remove)
 
     yield
 

From ce3f1eedf8e7e015054a166f17205eb3206e4625 Mon Sep 17 00:00:00 2001
From: DefTruth <31974251+DefTruth@users.noreply.github.com>
Date: Fri, 3 May 2024 12:48:08 +0800
Subject: [PATCH 44/65] [Misc] remove chunk detected debug logs (#4571)

---
 vllm/engine/llm_engine.py | 8 ++++----
 1 file changed, 4 insertions(+), 4 deletions(-)

diff --git a/vllm/engine/llm_engine.py b/vllm/engine/llm_engine.py
index 19e7143ac2b45..94a5b397a4d43 100644
--- a/vllm/engine/llm_engine.py
+++ b/vllm/engine/llm_engine.py
@@ -662,10 +662,10 @@ def _get_stats(
         # decode seq_groups in scheduled_seq_groups.
         if scheduler_outputs is not None:
             num_generation_tokens_from_prefill_groups = 0.
-            if scheduler_outputs.num_prefill_groups > 0 and len(
-                    scheduler_outputs.scheduled_seq_groups
-            ) != scheduler_outputs.num_prefill_groups:
-                print("DETECTED CHUNKED")
+            # NOTE: if scheduler_outputs.num_prefill_groups > 0 and
+            # the len of scheduler_outputs.scheduled_seq_groups is !=
+            # scheduler_outputs.num_prefill_groups, this means that
+            # chunked prefills have been detected.
 
             for idx, scheduled_seq_group in enumerate(
                     scheduler_outputs.scheduled_seq_groups):

From 2d7bce9cd5981db146b18a8a95c5a7e0480687bd Mon Sep 17 00:00:00 2001
From: youkaichao <youkaichao@gmail.com>
Date: Thu, 2 May 2024 22:13:49 -0700
Subject: [PATCH 45/65] [Doc] add env vars to the doc (#4572)

---
 docs/source/index.rst            | 1 +
 docs/source/serving/env_vars.rst | 9 +++++++++
 vllm/envs.py                     | 7 +++++++
 3 files changed, 17 insertions(+)
 create mode 100644 docs/source/serving/env_vars.rst

diff --git a/docs/source/index.rst b/docs/source/index.rst
index e0269987ec5d8..5cc28a2d70139 100644
--- a/docs/source/index.rst
+++ b/docs/source/index.rst
@@ -75,6 +75,7 @@ Documentation
    serving/deploying_with_docker
    serving/distributed_serving
    serving/metrics
+   serving/env_vars
    serving/usage_stats
    serving/integrations
 
diff --git a/docs/source/serving/env_vars.rst b/docs/source/serving/env_vars.rst
new file mode 100644
index 0000000000000..0ce1374a3967b
--- /dev/null
+++ b/docs/source/serving/env_vars.rst
@@ -0,0 +1,9 @@
+Environment Variables
+========================
+
+vLLM uses the following environment variables to configure the system:
+
+.. literalinclude:: ../../../vllm/envs.py
+    :language: python
+    :start-after: begin-env-vars-definition
+    :end-before: end-env-vars-definition
diff --git a/vllm/envs.py b/vllm/envs.py
index 26ed731caa5ff..2dbb57e6253a7 100644
--- a/vllm/envs.py
+++ b/vllm/envs.py
@@ -28,6 +28,11 @@
     VLLM_USE_RAY_COMPILED_DAG: bool = False
     VLLM_WORKER_MULTIPROC_METHOD: str = "spawn"
 
+# The begin-* and end-* here are used by the documentation generator
+# to extract the used env vars.
+
+# begin-env-vars-definition
+
 environment_variables: Dict[str, Callable[[], Any]] = {
     # used in distributed environment to determine the master address
     'VLLM_HOST_IP':
@@ -148,6 +153,8 @@
     lambda: os.getenv("VLLM_WORKER_MULTIPROC_METHOD", "spawn"),
 }
 
+# end-env-vars-definition
+
 
 def __getattr__(name):
     # lazy evaluation of environment variables

From 3521ba4f2554bcf246a95a9fb2d1b80990a6835b Mon Sep 17 00:00:00 2001
From: SangBin Cho <rkooo567@gmail.com>
Date: Sat, 4 May 2024 02:20:12 +0900
Subject: [PATCH 46/65] [Core][Model runner refactoring 1/N] Refactor attn
 metadata term (#4518)

---
 .../kernels/benchmark_paged_attention.py      |  25 ++-
 csrc/attention/attention_kernels.cu           |  76 ++++----
 csrc/cpu/attention.cpp                        |  92 +++++-----
 csrc/ops.h                                    |   8 +-
 tests/kernels/test_attention.py               |  35 ++--
 tests/kernels/test_prefix_prefill.py          |  16 +-
 tests/samplers/test_sampler.py                |  34 ++--
 tests/spec_decode/e2e/conftest.py             |   4 +-
 tests/spec_decode/test_multi_step_worker.py   |  24 +--
 tests/spec_decode/test_ngram_worker.py        |  24 ++-
 tests/spec_decode/utils.py                    |   8 +-
 tests/test_logits_processor.py                |   8 +-
 tests/worker/test_model_runner.py             |  99 +++++------
 vllm/_custom_ops.py                           |  18 +-
 vllm/attention/backends/flash_attn.py         |  44 ++---
 vllm/attention/backends/rocm_flash_attn.py    |  60 +++----
 vllm/attention/backends/torch_sdpa.py         |  36 ++--
 vllm/attention/backends/xformers.py           |  65 ++++---
 vllm/attention/ops/paged_attn.py              |  35 ++--
 vllm/config.py                                |  23 ++-
 vllm/engine/arg_utils.py                      |  14 +-
 vllm/entrypoints/llm.py                       |   7 +-
 vllm/model_executor/layers/sampler.py         |   6 +-
 vllm/model_executor/sampling_metadata.py      |  63 ++++---
 vllm/worker/cpu_model_runner.py               |  58 +++---
 vllm/worker/model_runner.py                   | 167 +++++++++---------
 vllm/worker/neuron_model_runner.py            |  30 ++--
 27 files changed, 554 insertions(+), 525 deletions(-)

diff --git a/benchmarks/kernels/benchmark_paged_attention.py b/benchmarks/kernels/benchmark_paged_attention.py
index 5c3650fa72d17..ca7967c1ab0d2 100644
--- a/benchmarks/kernels/benchmark_paged_attention.py
+++ b/benchmarks/kernels/benchmark_paged_attention.py
@@ -16,7 +16,7 @@
 def main(
     version: str,
     num_seqs: int,
-    context_len: int,
+    seq_len: int,
     num_query_heads: int,
     num_kv_heads: int,
     head_size: int,
@@ -48,12 +48,12 @@ def main(
                                    dtype=torch.float,
                                    device=device)
 
-    context_lens = [context_len for _ in range(num_seqs)]
-    max_context_len = max(context_lens)
-    context_lens = torch.tensor(context_lens, dtype=torch.int, device=device)
+    seq_lens = [seq_len for _ in range(num_seqs)]
+    max_seq_len = max(seq_lens)
+    seq_lens = torch.tensor(seq_lens, dtype=torch.int, device=device)
 
     # Create the block tables.
-    max_num_blocks_per_seq = (max_context_len + block_size - 1) // block_size
+    max_num_blocks_per_seq = (max_seq_len + block_size - 1) // block_size
     block_tables = []
     for _ in range(num_seqs):
         block_table = [
@@ -77,8 +77,7 @@ def main(
     # Prepare for the paged attention kernel.
     output = torch.empty_like(query)
     if version == "v2":
-        num_partitions = ((max_context_len + PARTITION_SIZE - 1) //
-                          PARTITION_SIZE)
+        num_partitions = ((max_seq_len + PARTITION_SIZE - 1) // PARTITION_SIZE)
         tmp_output = torch.empty(
             size=(num_seqs, num_query_heads, num_partitions, head_size),
             dtype=output.dtype,
@@ -110,9 +109,9 @@ def run_cuda_benchmark(num_iters: int, profile: bool = False) -> float:
                     num_kv_heads,
                     scale,
                     block_tables,
-                    context_lens,
+                    seq_lens,
                     block_size,
-                    max_context_len,
+                    max_seq_len,
                     alibi_slopes,
                     kv_cache_dtype,
                     kv_scale,
@@ -129,9 +128,9 @@ def run_cuda_benchmark(num_iters: int, profile: bool = False) -> float:
                     num_kv_heads,
                     scale,
                     block_tables,
-                    context_lens,
+                    seq_lens,
                     block_size,
-                    max_context_len,
+                    max_seq_len,
                     alibi_slopes,
                     kv_cache_dtype,
                     kv_scale,
@@ -166,7 +165,7 @@ def run_cuda_benchmark(num_iters: int, profile: bool = False) -> float:
                         choices=["v1", "v2"],
                         default="v2")
     parser.add_argument("--batch-size", type=int, default=8)
-    parser.add_argument("--context-len", type=int, default=4096)
+    parser.add_argument("--seq_len", type=int, default=4096)
     parser.add_argument("--num-query-heads", type=int, default=64)
     parser.add_argument("--num-kv-heads", type=int, default=8)
     parser.add_argument("--head-size",
@@ -199,7 +198,7 @@ def run_cuda_benchmark(num_iters: int, profile: bool = False) -> float:
     main(
         version=args.version,
         num_seqs=args.batch_size,
-        context_len=args.context_len,
+        seq_len=args.seq_len,
         num_query_heads=args.num_query_heads,
         num_kv_heads=args.num_kv_heads,
         head_size=args.head_size,
diff --git a/csrc/attention/attention_kernels.cu b/csrc/attention/attention_kernels.cu
index f3a5bbfd3098d..8b1b5e098015f 100644
--- a/csrc/attention/attention_kernels.cu
+++ b/csrc/attention/attention_kernels.cu
@@ -104,7 +104,7 @@ __device__ void paged_attention_kernel(
   const int num_kv_heads,                 // [num_heads]
   const float scale,
   const int* __restrict__ block_tables,   // [num_seqs, max_num_blocks_per_seq]
-  const int* __restrict__ context_lens,   // [num_seqs]
+  const int* __restrict__ seq_lens,   // [num_seqs]
   const int max_num_blocks_per_seq,
   const float* __restrict__ alibi_slopes, // [num_heads]
   const int q_stride,
@@ -115,23 +115,23 @@ __device__ void paged_attention_kernel(
   const int partition_idx = blockIdx.z;
   const int max_num_partitions = gridDim.z;
   constexpr bool USE_PARTITIONING = PARTITION_SIZE > 0;
-  const int context_len = context_lens[seq_idx];
-  if (USE_PARTITIONING && partition_idx * PARTITION_SIZE >= context_len) {
+  const int seq_len = seq_lens[seq_idx];
+  if (USE_PARTITIONING && partition_idx * PARTITION_SIZE >= seq_len) {
     // No work to do. Terminate the thread block.
     return;
   }
 
-  const int num_context_blocks = DIVIDE_ROUND_UP(context_len, BLOCK_SIZE);
-  const int num_blocks_per_partition = USE_PARTITIONING ? PARTITION_SIZE / BLOCK_SIZE : num_context_blocks;
+  const int num_seq_blocks = DIVIDE_ROUND_UP(seq_len, BLOCK_SIZE);
+  const int num_blocks_per_partition = USE_PARTITIONING ? PARTITION_SIZE / BLOCK_SIZE : num_seq_blocks;
 
   // [start_block_idx, end_block_idx) is the range of blocks to process.
   const int start_block_idx = USE_PARTITIONING ? partition_idx * num_blocks_per_partition : 0;
-  const int end_block_idx = MIN(start_block_idx + num_blocks_per_partition, num_context_blocks);
+  const int end_block_idx = MIN(start_block_idx + num_blocks_per_partition, num_seq_blocks);
   const int num_blocks = end_block_idx - start_block_idx;
 
   // [start_token_idx, end_token_idx) is the range of tokens to process.
   const int start_token_idx = start_block_idx * BLOCK_SIZE;
-  const int end_token_idx = MIN(start_token_idx + num_blocks * BLOCK_SIZE, context_len);
+  const int end_token_idx = MIN(start_token_idx + num_blocks * BLOCK_SIZE, seq_len);
   const int num_tokens = end_token_idx - start_token_idx;
 
   constexpr int THREAD_GROUP_SIZE = MAX(WARP_SIZE / BLOCK_SIZE, 1);
@@ -245,12 +245,12 @@ __device__ void paged_attention_kernel(
       // This includes a reduction across the threads in the same thread group.
       float qk = scale * Qk_dot<scalar_t, THREAD_GROUP_SIZE>::dot(q_vecs[thread_group_offset], k_vecs);
       // Add the ALiBi bias if slopes are given.
-      qk += (alibi_slope != 0) ? alibi_slope * (token_idx - context_len + 1) : 0;
+      qk += (alibi_slope != 0) ? alibi_slope * (token_idx - seq_len + 1) : 0;
 
       if (thread_group_offset == 0) {
         // Store the partial reductions to shared memory.
         // NOTE(woosuk): It is required to zero out the masked logits.
-        const bool mask = token_idx >= context_len;
+        const bool mask = token_idx >= seq_len;
         logits[token_idx - start_token_idx] = mask ? 0.f : qk;
         // Update the max value.
         qk_max = mask ? qk_max : fmaxf(qk_max, qk);
@@ -364,14 +364,14 @@ __device__ void paged_attention_kernel(
         } else {
           v_vec = *reinterpret_cast<const V_vec*>(v_ptr + offset);
         }
-        if (block_idx == num_context_blocks - 1) {
+        if (block_idx == num_seq_blocks - 1) {
           // NOTE(woosuk): When v_vec contains the tokens that are out of the context,
           // we should explicitly zero out the values since they may contain NaNs.
           // See https://github.com/vllm-project/vllm/issues/641#issuecomment-1682544472
           scalar_t* v_vec_ptr = reinterpret_cast<scalar_t*>(&v_vec);
 #pragma unroll
           for (int j = 0; j < V_VEC_SIZE; j++) {
-            v_vec_ptr[j] = token_idx + j < context_len ? v_vec_ptr[j] : zero_value;
+            v_vec_ptr[j] = token_idx + j < seq_len ? v_vec_ptr[j] : zero_value;
           }
         }
         accs[i] += dot(logits_vec, v_vec);
@@ -457,7 +457,7 @@ __global__ void paged_attention_v1_kernel(
   const int num_kv_heads,                 // [num_heads]
   const float scale,
   const int* __restrict__ block_tables,   // [num_seqs, max_num_blocks_per_seq]
-  const int* __restrict__ context_lens,   // [num_seqs]
+  const int* __restrict__ seq_lens,   // [num_seqs]
   const int max_num_blocks_per_seq,
   const float* __restrict__ alibi_slopes, // [num_heads]
   const int q_stride,
@@ -466,7 +466,7 @@ __global__ void paged_attention_v1_kernel(
   const float kv_scale) {
   paged_attention_kernel<scalar_t, cache_t, HEAD_SIZE, BLOCK_SIZE, NUM_THREADS, IS_FP8_KV_CACHE>(
     /* exp_sums */ nullptr, /* max_logits */ nullptr,
-    out, q, k_cache, v_cache, num_kv_heads, scale, block_tables, context_lens,
+    out, q, k_cache, v_cache, num_kv_heads, scale, block_tables, seq_lens,
     max_num_blocks_per_seq, alibi_slopes, q_stride, kv_block_stride, kv_head_stride, kv_scale);
 }
 
@@ -489,7 +489,7 @@ __global__ void paged_attention_v2_kernel(
   const int num_kv_heads,                 // [num_heads]
   const float scale,
   const int* __restrict__ block_tables,   // [num_seqs, max_num_blocks_per_seq]
-  const int* __restrict__ context_lens,   // [num_seqs]
+  const int* __restrict__ seq_lens,   // [num_seqs]
   const int max_num_blocks_per_seq,
   const float* __restrict__ alibi_slopes, // [num_heads]
   const int q_stride,
@@ -498,7 +498,7 @@ __global__ void paged_attention_v2_kernel(
   const float kv_scale) {
   paged_attention_kernel<scalar_t, cache_t, HEAD_SIZE, BLOCK_SIZE, NUM_THREADS, IS_FP8_KV_CACHE, PARTITION_SIZE>(
     exp_sums, max_logits, tmp_out, q, k_cache, v_cache, num_kv_heads, scale,
-    block_tables, context_lens, max_num_blocks_per_seq, alibi_slopes,
+    block_tables, seq_lens, max_num_blocks_per_seq, alibi_slopes,
     q_stride, kv_block_stride, kv_head_stride, kv_scale);
 }
 
@@ -513,13 +513,13 @@ __global__ void paged_attention_v2_reduce_kernel(
   const float* __restrict__ exp_sums,     // [num_seqs, num_heads, max_num_partitions]
   const float* __restrict__ max_logits,   // [num_seqs, num_heads, max_num_partitions]
   const scalar_t* __restrict__ tmp_out,   // [num_seqs, num_heads, max_num_partitions, head_size]
-  const int* __restrict__ context_lens,   // [num_seqs]
+  const int* __restrict__ seq_lens,   // [num_seqs]
   const int max_num_partitions) {
   const int num_heads = gridDim.x;
   const int head_idx = blockIdx.x;
   const int seq_idx = blockIdx.y;
-  const int context_len = context_lens[seq_idx];
-  const int num_partitions = DIVIDE_ROUND_UP(context_len, PARTITION_SIZE);
+  const int seq_len = seq_lens[seq_idx];
+  const int num_partitions = DIVIDE_ROUND_UP(seq_len, PARTITION_SIZE);
   if (num_partitions == 1) {
     // No need to reduce. Only copy tmp_out to out.
     scalar_t* out_ptr = out + seq_idx * num_heads * HEAD_SIZE + head_idx * HEAD_SIZE;
@@ -616,7 +616,7 @@ __global__ void paged_attention_v2_reduce_kernel(
     num_kv_heads,                                                                             \
     scale,                                                                                    \
     block_tables_ptr,                                                                         \
-    context_lens_ptr,                                                                         \
+    seq_lens_ptr,                                                                              \
     max_num_blocks_per_seq,                                                                   \
     alibi_slopes_ptr,                                                                         \
     q_stride,                                                                                 \
@@ -639,8 +639,8 @@ void paged_attention_v1_launcher(
   int num_kv_heads,
   float scale,
   torch::Tensor& block_tables,
-  torch::Tensor& context_lens,
-  int max_context_len,
+  torch::Tensor& seq_lens,
+  int max_seq_len,
   const c10::optional<torch::Tensor>& alibi_slopes,
   float kv_scale) {
   int num_seqs = query.size(0);
@@ -664,11 +664,11 @@ void paged_attention_v1_launcher(
   CACHE_T* key_cache_ptr = reinterpret_cast<CACHE_T*>(key_cache.data_ptr());
   CACHE_T* value_cache_ptr = reinterpret_cast<CACHE_T*>(value_cache.data_ptr());
   int* block_tables_ptr = block_tables.data_ptr<int>();
-  int* context_lens_ptr = context_lens.data_ptr<int>();
+  int* seq_lens_ptr = seq_lens.data_ptr<int>();
 
   constexpr int NUM_WARPS = NUM_THREADS / WARP_SIZE;
-  int padded_max_context_len = DIVIDE_ROUND_UP(max_context_len, BLOCK_SIZE) * BLOCK_SIZE;
-  int logits_size = padded_max_context_len * sizeof(float);
+  int padded_max_seq_len = DIVIDE_ROUND_UP(max_seq_len, BLOCK_SIZE) * BLOCK_SIZE;
+  int logits_size = padded_max_seq_len * sizeof(float);
   int outputs_size = (NUM_WARPS / 2) * head_size * sizeof(float);
   // Python-side check in vllm.worker.worker._check_if_can_support_max_seq_len
   // Keep that in sync with the logic here!
@@ -715,8 +715,8 @@ void paged_attention_v1_launcher(
     num_kv_heads,                                                            \
     scale,                                                                   \
     block_tables,                                                            \
-    context_lens,                                                            \
-    max_context_len,                                                         \
+    seq_lens,                                                            \
+    max_seq_len,                                                         \
     alibi_slopes,                                                            \
     kv_scale);
 
@@ -746,9 +746,9 @@ void paged_attention_v1(
   int num_kv_heads,               // [num_heads]
   float scale,
   torch::Tensor& block_tables,    // [num_seqs, max_num_blocks_per_seq]
-  torch::Tensor& context_lens,    // [num_seqs]
+  torch::Tensor& seq_lens,    // [num_seqs]
   int block_size,
-  int max_context_len,
+  int max_seq_len,
   const c10::optional<torch::Tensor>& alibi_slopes,
   const std::string& kv_cache_dtype,
   float kv_scale) {
@@ -790,7 +790,7 @@ void paged_attention_v1(
     num_kv_heads,                                                                             \
     scale,                                                                                    \
     block_tables_ptr,                                                                         \
-    context_lens_ptr,                                                                         \
+    seq_lens_ptr,                                                                         \
     max_num_blocks_per_seq,                                                                   \
     alibi_slopes_ptr,                                                                         \
     q_stride,                                                                                 \
@@ -803,7 +803,7 @@ void paged_attention_v1(
     exp_sums_ptr,                                                                             \
     max_logits_ptr,                                                                           \
     tmp_out_ptr,                                                                              \
-    context_lens_ptr,                                                                         \
+    seq_lens_ptr,                                                                         \
     max_num_partitions);
 
 template<
@@ -824,8 +824,8 @@ void paged_attention_v2_launcher(
   int num_kv_heads,
   float scale,
   torch::Tensor& block_tables,
-  torch::Tensor& context_lens,
-  int max_context_len,
+  torch::Tensor& seq_lens,
+  int max_seq_len,
   const c10::optional<torch::Tensor>& alibi_slopes,
   float kv_scale) {
   int num_seqs = query.size(0);
@@ -852,10 +852,10 @@ void paged_attention_v2_launcher(
   CACHE_T* key_cache_ptr = reinterpret_cast<CACHE_T*>(key_cache.data_ptr());
   CACHE_T* value_cache_ptr = reinterpret_cast<CACHE_T*>(value_cache.data_ptr());
   int* block_tables_ptr = block_tables.data_ptr<int>();
-  int* context_lens_ptr = context_lens.data_ptr<int>();
+  int* seq_lens_ptr = seq_lens.data_ptr<int>();
 
   constexpr int NUM_WARPS = NUM_THREADS / WARP_SIZE;
-  int max_num_partitions = DIVIDE_ROUND_UP(max_context_len, PARTITION_SIZE);
+  int max_num_partitions = DIVIDE_ROUND_UP(max_seq_len, PARTITION_SIZE);
   int logits_size = PARTITION_SIZE * sizeof(float);
   int outputs_size = (NUM_WARPS / 2) * head_size * sizeof(float);
 
@@ -909,8 +909,8 @@ void paged_attention_v2_launcher(
     num_kv_heads,                                                                \
     scale,                                                                       \
     block_tables,                                                                \
-    context_lens,                                                                \
-    max_context_len,                                                             \
+    seq_lens,                                                                \
+    max_seq_len,                                                             \
     alibi_slopes,                                                                \
     kv_scale);
 
@@ -943,9 +943,9 @@ void paged_attention_v2(
   int num_kv_heads,               // [num_heads]
   float scale,
   torch::Tensor& block_tables,    // [num_seqs, max_num_blocks_per_seq]
-  torch::Tensor& context_lens,    // [num_seqs]
+  torch::Tensor& seq_lens,    // [num_seqs]
   int block_size,
-  int max_context_len,
+  int max_seq_len,
   const c10::optional<torch::Tensor>& alibi_slopes,
   const std::string& kv_cache_dtype,
   float kv_scale) {
diff --git a/csrc/cpu/attention.cpp b/csrc/cpu/attention.cpp
index 365bbd5e23728..c1d765be05598 100644
--- a/csrc/cpu/attention.cpp
+++ b/csrc/cpu/attention.cpp
@@ -70,11 +70,11 @@ template <typename T>
 FORCE_INLINE std::pair<T, T>
 reduceSoftmaxAlibi(T *data, const int size, const int capacity,
                    const float alibi_slope, const int start_index,
-                   const int context_len) {
-  data[0] += alibi_slope * (start_index - context_len + 1);
+                   const int seq_len) {
+  data[0] += alibi_slope * (start_index - seq_len + 1);
   T max = data[0];
   for (int i = 1; i < size; ++i) {
-    T qk = data[i] + alibi_slope * (start_index + i - context_len + 1);
+    T qk = data[i] + alibi_slope * (start_index + i - seq_len + 1);
     data[i] = qk;
     max = max >= qk ? max : qk;
   }
@@ -225,7 +225,7 @@ struct paged_attention_v1_impl {
        const int num_kv_heads, const float scale,
        const int
            *__restrict__ block_tables, // [num_seqs, max_num_blocks_per_seq]
-       const int *__restrict__ context_lens, // [num_seqs]
+       const int *__restrict__ seq_lens, // [num_seqs]
        const int max_num_blocks_per_seq,
        const float *__restrict__ alibi_slopes, // [num_heads]
        const int q_stride, const int kv_block_stride, const int kv_head_stride,
@@ -235,32 +235,32 @@ struct paged_attention_v1_impl {
 
     static_assert(BLOCK_SIZE == 16);
 
-    int max_context_len = max_num_blocks_per_seq * BLOCK_SIZE;
-    int max_context_len_padded = (max_context_len + 15) & 0xFFFFFFF0;
-    TORCH_CHECK((max_context_len_padded * sizeof(float)) % 64 == 0);
+    int max_seq_len = max_num_blocks_per_seq * BLOCK_SIZE;
+    int max_seq_len_padded = (max_seq_len + 15) & 0xFFFFFFF0;
+    TORCH_CHECK((max_seq_len_padded * sizeof(float)) % 64 == 0);
 
     const int parallel_work_item_num = omp_get_max_threads();
 
     size_t logits_bytes =
-        parallel_work_item_num * max_context_len_padded * sizeof(float);
+        parallel_work_item_num * max_seq_len_padded * sizeof(float);
     float *logits = (float *)std::aligned_alloc(
         64, logits_bytes); // Cacheline alignment for each context token.
-                           // [parallel_work_item_num, max_context_len_padded]
+                           // [parallel_work_item_num, max_seq_len_padded]
 
 #pragma omp parallel for collapse(2) schedule(dynamic, 1)
     for (int seq_idx = 0; seq_idx < num_seqs; ++seq_idx) {
       for (int head_idx = 0; head_idx < num_heads; ++head_idx) {
-        int context_len = context_lens[seq_idx];
+        int seq_len = seq_lens[seq_idx];
         const int *seq_block_table =
             block_tables + max_num_blocks_per_seq * seq_idx;
-        const int block_num = (context_len + BLOCK_SIZE - 1) / BLOCK_SIZE;
+        const int block_num = (seq_len + BLOCK_SIZE - 1) / BLOCK_SIZE;
         const int64_t kv_head_idx = head_idx / num_queries_per_kv;
         const scalar_t *__restrict__ q_vec_ptr =
             q + seq_idx * q_stride + head_idx * HEAD_SIZE;
         const int last_block_token_num =
-            context_len - (block_num - 1) * BLOCK_SIZE;
+            seq_len - (block_num - 1) * BLOCK_SIZE;
         float *__restrict__ thread_block_logits =
-            logits + omp_get_thread_num() * max_context_len_padded;
+            logits + omp_get_thread_num() * max_seq_len_padded;
 
         // Compute logits
         for (int block_idx = 0; block_idx < block_num; ++block_idx) {
@@ -278,11 +278,11 @@ struct paged_attention_v1_impl {
 
         // Compute softmax
         if (alibi_slopes) {
-          reduceSoftmaxAlibi(thread_block_logits, context_len,
+          reduceSoftmaxAlibi(thread_block_logits, seq_len,
                              block_num * BLOCK_SIZE, alibi_slopes[head_idx], 0,
-                             context_len);
+                             seq_len);
         } else {
-          reduceSoftmax(thread_block_logits, context_len,
+          reduceSoftmax(thread_block_logits, seq_len,
                         block_num * BLOCK_SIZE);
         }
 
@@ -340,7 +340,7 @@ struct paged_attention_v1_impl {
 #define LAUNCH_V1_ATTENTION_KERNEL(T, HEAD_SIZE, BLOCK_SIZE)                   \
   paged_attention_v1_impl<T, HEAD_SIZE, BLOCK_SIZE>::call(                     \
       out_ptr, query_ptr, key_cache_ptr, value_cache_ptr, num_kv_heads, scale, \
-      block_tables_ptr, context_lens_ptr, max_num_blocks_per_seq,              \
+      block_tables_ptr, seq_lens_ptr, max_num_blocks_per_seq,              \
       alibi_slopes_ptr, q_stride, kv_block_stride, kv_head_stride, num_seqs,   \
       num_heads);
 
@@ -348,8 +348,8 @@ template <typename T, int BLOCK_SIZE>
 void paged_attention_v1_impl_launcher(
     torch::Tensor &out, torch::Tensor &query, torch::Tensor &key_cache,
     torch::Tensor &value_cache, int num_kv_heads, float scale,
-    torch::Tensor &block_tables, torch::Tensor &context_lens,
-    int max_context_len, const c10::optional<torch::Tensor> &alibi_slopes) {
+    torch::Tensor &block_tables, torch::Tensor &seq_lens,
+    int max_seq_len, const c10::optional<torch::Tensor> &alibi_slopes) {
   int num_seqs = query.size(0);
   int num_heads = query.size(1);
   int head_size = query.size(2);
@@ -369,7 +369,7 @@ void paged_attention_v1_impl_launcher(
   T *key_cache_ptr = reinterpret_cast<T *>(key_cache.data_ptr());
   T *value_cache_ptr = reinterpret_cast<T *>(value_cache.data_ptr());
   int *block_tables_ptr = block_tables.data_ptr<int>();
-  int *context_lens_ptr = context_lens.data_ptr<int>();
+  int *seq_lens_ptr = seq_lens.data_ptr<int>();
 
   switch (head_size) {
   case 64:
@@ -399,7 +399,7 @@ void paged_attention_v1_impl_launcher(
 #define CALL_V1_KERNEL_LAUNCHER(T, BLOCK_SIZE)                                 \
   paged_attention_v1_impl_launcher<T, BLOCK_SIZE>(                             \
       out, query, key_cache, value_cache, num_kv_heads, scale, block_tables,   \
-      context_lens, max_context_len, alibi_slopes);
+      seq_lens, max_seq_len, alibi_slopes);
 
 #define CALL_V1_KERNEL_LAUNCHER_BLOCK_SIZE(T)                                  \
   switch (block_size) {                                                        \
@@ -416,8 +416,8 @@ void paged_attention_v1(torch::Tensor &out, torch::Tensor &query,
                         torch::Tensor &key_cache, torch::Tensor &value_cache,
                         int num_kv_heads, float scale,
                         torch::Tensor &block_tables,
-                        torch::Tensor &context_lens, int block_size,
-                        int max_context_len,
+                        torch::Tensor &seq_lens, int block_size,
+                        int max_seq_len,
                         const c10::optional<torch::Tensor> &alibi_slopes,
                         const std::string &kv_cache_dtype, float kv_scale) {
   TORCH_CHECK(kv_scale == 1.0f);
@@ -448,7 +448,7 @@ struct paged_attention_v2_impl {
       const int num_kv_heads, const float scale,
       const int
           *__restrict__ block_tables, // [num_seqs, max_num_blocks_per_seq]
-      const int *__restrict__ context_lens, // [num_seqs]
+      const int *__restrict__ seq_lens, // [num_seqs]
       const int max_num_blocks_per_seq,
       const float *__restrict__ alibi_slopes, // [num_heads]
       const int q_stride, const int kv_block_stride, const int kv_head_stride,
@@ -465,22 +465,22 @@ struct paged_attention_v2_impl {
       for (int partition_idx = 0; partition_idx < max_num_partitions;
            ++partition_idx) {
         for (int head_idx = 0; head_idx < num_heads; ++head_idx) {
-          const int context_len = context_lens[seq_idx];
+          const int seq_len = seq_lens[seq_idx];
           const int start_token_idx = partition_idx * PARTITION_SIZE;
 
-          if (start_token_idx >= context_len)
+          if (start_token_idx >= seq_len)
             continue;
 
           const int partition_num =
-              (context_len + PARTITION_SIZE - 1) / PARTITION_SIZE;
+              (seq_len + PARTITION_SIZE - 1) / PARTITION_SIZE;
           const bool no_reduce = (partition_num == 1);
-          const int context_token_num =
-              (std::min(context_len, start_token_idx + PARTITION_SIZE) -
+          const int token_num =
+              (std::min(seq_len, start_token_idx + PARTITION_SIZE) -
                start_token_idx);
           const int block_num =
-              (context_token_num + BLOCK_SIZE - 1) / BLOCK_SIZE;
+              (token_num + BLOCK_SIZE - 1) / BLOCK_SIZE;
           const int last_block_token_num =
-              context_token_num - (block_num - 1) * BLOCK_SIZE;
+              token_num - (block_num - 1) * BLOCK_SIZE;
           const int *seq_block_table = block_tables +
                                        max_num_blocks_per_seq * seq_idx +
                                        start_token_idx / BLOCK_SIZE;
@@ -507,10 +507,10 @@ struct paged_attention_v2_impl {
           std::pair<float, float> max_and_sum;
           if (alibi_slopes) {
             max_and_sum = reduceSoftmaxAlibi(
-                logits, context_token_num, block_num * BLOCK_SIZE,
-                alibi_slopes[head_idx], start_token_idx, context_len);
+                logits, token_num, block_num * BLOCK_SIZE,
+                alibi_slopes[head_idx], start_token_idx, seq_len);
           } else {
-            max_and_sum = reduceSoftmax(logits, context_token_num,
+            max_and_sum = reduceSoftmax(logits, token_num,
                                         block_num * BLOCK_SIZE);
           }
 
@@ -583,9 +583,9 @@ struct paged_attention_v2_impl {
 #pragma omp parallel for collapse(2) schedule(static, 1)
     for (int seq_idx = 0; seq_idx < num_seqs; ++seq_idx) {
       for (int head_idx = 0; head_idx < num_heads; ++head_idx) {
-        const int context_len = context_lens[seq_idx];
+        const int seq_len = seq_lens[seq_idx];
         const int partition_num =
-            (context_len + PARTITION_SIZE - 1) / PARTITION_SIZE;
+            (seq_len + PARTITION_SIZE - 1) / PARTITION_SIZE;
 
         if (partition_num == 1)
           continue;
@@ -612,9 +612,9 @@ struct paged_attention_v2_impl {
     for (int seq_idx = 0; seq_idx < num_seqs; ++seq_idx) {
       for (int head_idx = 0; head_idx < num_heads; ++head_idx) {
         for (int group_idx = 0; group_idx < head_group_num; ++group_idx) {
-          const int context_len = context_lens[seq_idx];
+          const int seq_len = seq_lens[seq_idx];
           const int partition_num =
-              (context_len + PARTITION_SIZE - 1) / PARTITION_SIZE;
+              (seq_len + PARTITION_SIZE - 1) / PARTITION_SIZE;
 
           if (partition_num == 1)
             continue;
@@ -649,7 +649,7 @@ struct paged_attention_v2_impl {
   paged_attention_v2_impl<T, HEAD_SIZE, BLOCK_SIZE, PARTITION_SIZE>::call(     \
       out_ptr, exp_sums_ptr, max_logits_ptr, tmp_out_ptr, query_ptr,           \
       key_cache_ptr, value_cache_ptr, num_kv_heads, scale, block_tables_ptr,   \
-      context_lens_ptr, max_num_blocks_per_seq, alibi_slopes_ptr, q_stride,    \
+      seq_lens_ptr, max_num_blocks_per_seq, alibi_slopes_ptr, q_stride,    \
       kv_block_stride, kv_head_stride, num_seqs, num_heads,                    \
       max_num_partitions);
 
@@ -658,8 +658,8 @@ void paged_attention_v2_impl_launcher(
     torch::Tensor &out, torch::Tensor &exp_sums, torch::Tensor &max_logits,
     torch::Tensor &tmp_out, torch::Tensor &query, torch::Tensor &key_cache,
     torch::Tensor &value_cache, int num_kv_heads, float scale,
-    torch::Tensor &block_tables, torch::Tensor &context_lens, int block_size,
-    int max_context_len, const c10::optional<torch::Tensor> &alibi_slopes) {
+    torch::Tensor &block_tables, torch::Tensor &seq_lens, int block_size,
+    int max_seq_len, const c10::optional<torch::Tensor> &alibi_slopes) {
   int num_seqs = query.size(0);
   int num_heads = query.size(1);
   int head_size = query.size(2);
@@ -683,7 +683,7 @@ void paged_attention_v2_impl_launcher(
   T *key_cache_ptr = reinterpret_cast<T *>(key_cache.data_ptr());
   T *value_cache_ptr = reinterpret_cast<T *>(value_cache.data_ptr());
   int *block_tables_ptr = block_tables.data_ptr<int>();
-  int *context_lens_ptr = context_lens.data_ptr<int>();
+  int *seq_lens_ptr = seq_lens.data_ptr<int>();
 
   switch (head_size) {
   case 64:
@@ -713,8 +713,8 @@ void paged_attention_v2_impl_launcher(
 #define CALL_V2_KERNEL_LAUNCHER(T, BLOCK_SIZE)                                 \
   paged_attention_v2_impl_launcher<T, BLOCK_SIZE>(                             \
       out, exp_sums, max_logits, tmp_out, query, key_cache, value_cache,       \
-      num_kv_heads, scale, block_tables, context_lens, block_size,             \
-      max_context_len, alibi_slopes);
+      num_kv_heads, scale, block_tables, seq_lens, block_size,             \
+      max_seq_len, alibi_slopes);
 
 #define CALL_V2_KERNEL_LAUNCHER_BLOCK_SIZE(T)                                  \
   switch (block_size) {                                                        \
@@ -732,8 +732,8 @@ void paged_attention_v2(torch::Tensor &out, torch::Tensor &exp_sums,
                         torch::Tensor &query, torch::Tensor &key_cache,
                         torch::Tensor &value_cache, int num_kv_heads,
                         float scale, torch::Tensor &block_tables,
-                        torch::Tensor &context_lens, int block_size,
-                        int max_context_len,
+                        torch::Tensor &seq_lens, int block_size,
+                        int max_seq_len,
                         const c10::optional<torch::Tensor> &alibi_slopes,
                         const std::string &kv_cache_dtype, float kv_scale) {
   TORCH_CHECK(kv_scale == 1.0f);
diff --git a/csrc/ops.h b/csrc/ops.h
index 8ae052427052f..9541adcb3de88 100644
--- a/csrc/ops.h
+++ b/csrc/ops.h
@@ -10,9 +10,9 @@ void paged_attention_v1(
   int num_kv_heads,
   float scale,
   torch::Tensor& block_tables,
-  torch::Tensor& context_lens,
+  torch::Tensor& seq_lens,
   int block_size,
-  int max_context_len,
+  int max_seq_len,
   const c10::optional<torch::Tensor>& alibi_slopes,
   const std::string& kv_cache_dtype,
   float kv_scale);
@@ -28,9 +28,9 @@ void paged_attention_v2(
   int num_kv_heads,
   float scale,
   torch::Tensor& block_tables,
-  torch::Tensor& context_lens,
+  torch::Tensor& seq_lens,
   int block_size,
-  int max_context_len,
+  int max_seq_len,
   const c10::optional<torch::Tensor>& alibi_slopes,
   const std::string& kv_cache_dtype,
   float kv_scale);
diff --git a/tests/kernels/test_attention.py b/tests/kernels/test_attention.py
index 9b1f3e30b6dca..84539205e0ae3 100644
--- a/tests/kernels/test_attention.py
+++ b/tests/kernels/test_attention.py
@@ -61,7 +61,7 @@ def ref_single_query_cached_kv_attention(
     key_cache: torch.Tensor,
     value_cache: torch.Tensor,
     block_tables: torch.Tensor,
-    context_lens: torch.Tensor,
+    seq_lens: torch.Tensor,
     scale: float,
     alibi_slopes: Optional[torch.Tensor],
 ) -> None:
@@ -72,15 +72,15 @@ def ref_single_query_cached_kv_attention(
     num_seqs = query.shape[0]
 
     block_tables = block_tables.cpu().tolist()
-    context_lens = context_lens.cpu().tolist()
+    seq_lens = seq_lens.cpu().tolist()
     for i in range(num_seqs):
         q = query[i].unsqueeze(0)
         block_table = block_tables[i]
-        context_len = int(context_lens[i])
+        seq_len = int(seq_lens[i])
 
         keys = []
         values = []
-        for j in range(context_len):
+        for j in range(seq_len):
             block_number = int(block_table[j // block_size])
             block_offset = j % block_size
 
@@ -100,8 +100,8 @@ def ref_single_query_cached_kv_attention(
         alibi_bias = None
         if alibi_slopes is not None:
             # Create the ALiBi bias used in the paged attention kernel.
-            position_ids = torch.arange(context_len).int()
-            alibi_bias = (position_ids - context_len + 1).float()
+            position_ids = torch.arange(seq_len).int()
+            alibi_bias = (position_ids - seq_len + 1).float()
             alibi_bias = alibi_slopes.view(-1, 1, 1) * alibi_bias.view(
                 1, 1, -1)
 
@@ -149,13 +149,13 @@ def test_paged_attention(
     if use_alibi:
         alibi_slopes = torch.randn(num_query_heads, dtype=torch.float)
 
-    context_lens = [random.randint(1, MAX_SEQ_LEN) for _ in range(num_seqs)]
-    context_lens[-1] = MAX_SEQ_LEN
-    max_context_len = max(context_lens)
-    context_lens = torch.tensor(context_lens, dtype=torch.int)
+    seq_lens = [random.randint(1, MAX_SEQ_LEN) for _ in range(num_seqs)]
+    seq_lens[-1] = MAX_SEQ_LEN
+    max_seq_len = max(seq_lens)
+    seq_lens = torch.tensor(seq_lens, dtype=torch.int)
 
     # Create the block tables.
-    max_num_blocks_per_seq = (max_context_len + block_size - 1) // block_size
+    max_num_blocks_per_seq = (max_seq_len + block_size - 1) // block_size
     block_tables = []
     for _ in range(num_seqs):
         block_table = [
@@ -186,16 +186,15 @@ def test_paged_attention(
             num_kv_heads,
             scale,
             block_tables,
-            context_lens,
+            seq_lens,
             block_size,
-            max_context_len,
+            max_seq_len,
             alibi_slopes,
             kv_cache_dtype,
             kv_scale,
         )
     elif version == "v2":
-        num_partitions = ((max_context_len + PARTITION_SIZE - 1) //
-                          PARTITION_SIZE)
+        num_partitions = ((max_seq_len + PARTITION_SIZE - 1) // PARTITION_SIZE)
         assert PARTITION_SIZE % block_size == 0
         num_seqs, num_heads, head_size = output.shape
         tmp_output = torch.empty(
@@ -218,9 +217,9 @@ def test_paged_attention(
             num_kv_heads,
             scale,
             block_tables,
-            context_lens,
+            seq_lens,
             block_size,
-            max_context_len,
+            max_seq_len,
             alibi_slopes,
             kv_cache_dtype,
             kv_scale,
@@ -255,7 +254,7 @@ def test_paged_attention(
         key_cache,
         value_cache,
         block_tables,
-        context_lens,
+        seq_lens,
         scale,
         alibi_slopes,
     )
diff --git a/tests/kernels/test_prefix_prefill.py b/tests/kernels/test_prefix_prefill.py
index 8ab1167384c45..5a5987e2242fa 100644
--- a/tests/kernels/test_prefix_prefill.py
+++ b/tests/kernels/test_prefix_prefill.py
@@ -51,12 +51,12 @@ def test_contexted_kv_attention(
     cache_size = 640
     block_size = 32
     max_block_per_request = 64
-    subquery_lens = [random.randint(16, MAX_SEQ_LEN) for _ in range(BS)]
+    query_lens = [random.randint(16, MAX_SEQ_LEN) for _ in range(BS)]
     ctx_lens = [random.randint(16, MAX_CTX_LEN) for _ in range(BS)]
-    seq_lens = [a + b for a, b in zip(subquery_lens, ctx_lens)]
+    seq_lens = [a + b for a, b in zip(query_lens, ctx_lens)]
     num_kv_heads = num_heads // num_queries_per_kv
 
-    num_tokens = sum(subquery_lens)
+    num_tokens = sum(query_lens)
     query = torch.empty(num_tokens, num_heads, head_size, dtype=dtype)
     query.uniform_(-1e-3, 1e-3)
     output = torch.empty(num_tokens, num_heads, head_size, dtype=dtype)
@@ -75,15 +75,15 @@ def test_contexted_kv_attention(
                           num_kv_heads,
                           head_size,
                           dtype=dtype)
-    k = torch.zeros(sum(subquery_lens), num_kv_heads, head_size, dtype=dtype)
-    v = torch.zeros(sum(subquery_lens), num_kv_heads, head_size, dtype=dtype)
+    k = torch.zeros(sum(query_lens), num_kv_heads, head_size, dtype=dtype)
+    v = torch.zeros(sum(query_lens), num_kv_heads, head_size, dtype=dtype)
     values = torch.arange(0, cache_size, dtype=torch.long)
     values = values[torch.randperm(cache_size)]
     block_table = values[:BS * max_block_per_request].view(
         BS, max_block_per_request)
     b_seq_len = torch.tensor(seq_lens, dtype=torch.long)
     b_ctx_len = torch.tensor(ctx_lens, dtype=torch.long)
-    b_start_loc = torch.cumsum(torch.tensor([0] + subquery_lens[:-1],
+    b_start_loc = torch.cumsum(torch.tensor([0] + query_lens[:-1],
                                             dtype=torch.long),
                                dim=0)
     max_input_len = MAX_SEQ_LEN
@@ -92,7 +92,7 @@ def test_contexted_kv_attention(
                                                 dtype=torch.long),
                                    dim=0)
     for i in range(BS):
-        for j in range(subquery_lens[i]):
+        for j in range(query_lens[i]):
             k[b_start_loc[i] + j].copy_(key[b_seq_start_loc[i] + b_ctx_len[i] +
                                             j])
             v[b_start_loc[i] + j].copy_(value[b_seq_start_loc[i] +
@@ -178,7 +178,7 @@ def test_contexted_kv_attention(
     value = value.unsqueeze(0)
 
     attn_bias = BlockDiagonalCausalFromBottomRightMask.from_seqlens(
-        subquery_lens, seq_lens)
+        query_lens, seq_lens)
     if sliding_window > 0:
         attn_bias = attn_bias.make_local_attention_from_bottomright(
             sliding_window)
diff --git a/tests/samplers/test_sampler.py b/tests/samplers/test_sampler.py
index 7859f0b21812f..e4fea165a4d46 100644
--- a/tests/samplers/test_sampler.py
+++ b/tests/samplers/test_sampler.py
@@ -58,7 +58,7 @@ def _do_sample(
     device: str,
 ):
     seq_group_metadata_list = []
-    prompt_lens = []
+    seq_lens = []
     for i in range(batch_size):
         seq_group_metadata_list.append(
             SequenceGroupMetadata(
@@ -68,12 +68,12 @@ def _do_sample(
                 sampling_params=sampling_params,
                 block_tables={0: [1]},
             ))
-        prompt_lens.append(seq_group_metadata_list[-1].seq_data[0].get_len())
+        seq_lens.append(seq_group_metadata_list[-1].seq_data[0].get_len())
 
     sampling_metadata = SamplingMetadata.prepare(
         seq_group_metadata_list,
-        prompt_lens,
-        subquery_lens=prompt_lens,
+        seq_lens,
+        query_lens=seq_lens,
         device=device,
         pin_memory=model_runner.pin_memory)
     return sampler(logits=input_tensor, sampling_metadata=sampling_metadata)
@@ -421,7 +421,7 @@ def run_test_case(*,
             "Invalid test case, need seq_group_metadata_list"
 
         batch_size = 0
-        prompt_lens = []
+        seq_lens = []
         sampling_params_per_row = []
         for sgm in seq_group_metadata_list:
             sampling_params = sgm.sampling_params
@@ -431,7 +431,7 @@ def run_test_case(*,
                 # a prompt seq_group has only one sequence
                 seq_data = next(iter(sgm.seq_data.values()))
                 prompt_len = seq_data.get_prompt_len()
-                prompt_lens.append(prompt_len)
+                seq_lens.append(prompt_len)
 
                 if sgm.sampling_params.prompt_logprobs:
                     # with prompt_logprobs each token in the prompt has a row in
@@ -451,8 +451,8 @@ def run_test_case(*,
         _, fake_logits, sampler, model_runner = _prepare_test(batch_size)
         sampling_metadata = SamplingMetadata.prepare(
             seq_group_metadata_list,
-            prompt_lens=prompt_lens if prompt_lens else None,
-            subquery_lens=prompt_lens if prompt_lens else None,
+            seq_lens=seq_lens if seq_lens else None,
+            query_lens=seq_lens if seq_lens else None,
             device=device,
             pin_memory=model_runner.pin_memory)
         # the logits tensor is modified in-place by the sampler
@@ -497,7 +497,7 @@ def test_sampler_mixed(seed: int, device: str):
 
     seq_group_metadata_list = []
     expected_tokens: List[Optional[List[int]]] = []
-    prompt_lens = []
+    seq_lens = []
     for i in range(batch_size):
         expected: Optional[List[int]] = None
         sampling_type = random.randint(0, 3)
@@ -532,13 +532,13 @@ def test_sampler_mixed(seed: int, device: str):
                 sampling_params=sampling_params,
                 block_tables={0: [1]},
             ))
-        prompt_lens.append(seq_group_metadata_list[-1].seq_data[0].get_len())
+        seq_lens.append(seq_group_metadata_list[-1].seq_data[0].get_len())
 
     def test_sampling(model_runner: ModelRunner):
         sampling_metadata = SamplingMetadata.prepare(
             seq_group_metadata_list,
-            prompt_lens,
-            subquery_lens=prompt_lens,
+            seq_lens,
+            query_lens=seq_lens,
             device=device,
             pin_memory=model_runner.pin_memory)
         sampler_output = sampler(logits=fake_logits,
@@ -575,7 +575,7 @@ def test_sampling(model_runner: ModelRunner):
     # Shuffle the batch and resample
     target_index = list(range(batch_size))
     for list_to_shuffle in (target_index, seq_group_metadata_list,
-                            expected_tokens, prompt_lens):
+                            expected_tokens, seq_lens):
         random.Random(seed).shuffle(list_to_shuffle)
     target_index = torch.tensor(target_index)
     input_tensor.data = input_tensor.index_select(0, target_index)
@@ -620,7 +620,7 @@ def test_sampler_top_k_top_p(seed: int, device: str):
     assert len(warpers) == 2  # top_p and top_k
 
     seq_group_metadata_list = []
-    prompt_lens = []
+    seq_lens = []
     for i in range(batch_size):
         seq_group_metadata_list.append(
             SequenceGroupMetadata(
@@ -634,12 +634,12 @@ def test_sampler_top_k_top_p(seed: int, device: str):
                 ),
                 block_tables={0: [1]},
             ))
-        prompt_lens.append(seq_group_metadata_list[-1].seq_data[0].get_len())
+        seq_lens.append(seq_group_metadata_list[-1].seq_data[0].get_len())
 
     sampling_metadata = SamplingMetadata.prepare(
         seq_group_metadata_list,
-        prompt_lens,
-        subquery_lens=prompt_lens,
+        seq_lens,
+        query_lens=seq_lens,
         device=device,
         pin_memory=model_runner.pin_memory)
 
diff --git a/tests/spec_decode/e2e/conftest.py b/tests/spec_decode/e2e/conftest.py
index 0eb784a9c5ac5..492620cf6e2cf 100644
--- a/tests/spec_decode/e2e/conftest.py
+++ b/tests/spec_decode/e2e/conftest.py
@@ -45,7 +45,7 @@ def __init__(
         gpu_memory_utilization: float = 0.9,
         swap_space: int = 4,
         enforce_eager: bool = False,
-        max_context_len_to_capture: int = 8192,
+        max_seq_len_to_capture: int = 8192,
         disable_custom_all_reduce: bool = False,
         **kwargs,
     ) -> None:
@@ -66,7 +66,7 @@ def __init__(
             gpu_memory_utilization=gpu_memory_utilization,
             swap_space=swap_space,
             enforce_eager=enforce_eager,
-            max_context_len_to_capture=max_context_len_to_capture,
+            max_seq_len_to_capture=max_seq_len_to_capture,
             engine_use_ray=True,
             disable_custom_all_reduce=disable_custom_all_reduce,
             **kwargs,
diff --git a/tests/spec_decode/test_multi_step_worker.py b/tests/spec_decode/test_multi_step_worker.py
index 98f2731de9aa3..cc0427633e688 100644
--- a/tests/spec_decode/test_multi_step_worker.py
+++ b/tests/spec_decode/test_multi_step_worker.py
@@ -34,7 +34,7 @@ def test_assert_enough_kv_space(num_steps: int):
         list(range(block_size * 2)),
     ]
 
-    final_seq_lens = [
+    final_prompt_lens = [
         len(prompt + output) + num_steps
         for prompt, output in zip(prompts, prev_output_tokens)
     ]
@@ -43,7 +43,7 @@ def test_assert_enough_kv_space(num_steps: int):
         prompts,
         num_gpu_blocks,
         block_size,
-        final_seq_lens,
+        final_prompt_lens,
         continuations=prev_output_tokens)
 
     assert_enough_kv_space = MultiStepWorker._assert_enough_kv_space  # pylint: disable=protected-access
@@ -103,17 +103,21 @@ def test_same_output_for_single_step():
         [6, 7, 8, 9, 10],
     ]
 
-    final_seq_lens = [len(prompt) + num_steps for prompt in prompts]
+    final_prompt_lens = [len(prompt) + num_steps for prompt in prompts]
 
     multi_step_execute_model_data = create_execute_model_data(
         seq_group_metadata_list=create_seq_group_metadata_from_prompts(
-            prompts, num_gpu_blocks, block_size,
-            final_seq_lens=final_seq_lens))
+            prompts,
+            num_gpu_blocks,
+            block_size,
+            final_prompt_lens=final_prompt_lens))
 
     single_step_execute_model_data = create_execute_model_data(
         seq_group_metadata_list=create_seq_group_metadata_from_prompts(
-            prompts, num_gpu_blocks, block_size,
-            final_seq_lens=final_seq_lens))
+            prompts,
+            num_gpu_blocks,
+            block_size,
+            final_prompt_lens=final_prompt_lens))
 
     zero_kv_cache(multi_step_worker.cache_engine)
     set_random_seed(seed)
@@ -181,7 +185,7 @@ def test_same_output_for_multi_step():
         random.randint(0, 1000) for _ in range(random.randint(10, 20))
     ] for _ in range(10)]
 
-    final_seq_lens = [len(prompt) + num_steps for prompt in prompts]
+    final_prompt_lens = [len(prompt) + num_steps for prompt in prompts]
 
     rand_seeds = list(random.randint(0, 100) for _ in range(num_steps))
     multi_step_worker.execute_model = patch_execute_model_with_seeds(
@@ -195,7 +199,7 @@ def test_same_output_for_multi_step():
             num_gpu_blocks,
             block_size,
             continuations=continuations,
-            final_seq_lens=final_seq_lens), )
+            final_prompt_lens=final_prompt_lens), )
 
     # Run multi-step.
     zero_kv_cache(multi_step_worker.cache_engine)
@@ -217,7 +221,7 @@ def test_same_output_for_multi_step():
                 num_gpu_blocks,
                 block_size,
                 continuations=continuations,
-                final_seq_lens=final_seq_lens))
+                final_prompt_lens=final_prompt_lens))
 
         single_step_output.extend(
             worker.execute_model(**execute_model_data.to_dict(), ))
diff --git a/tests/spec_decode/test_ngram_worker.py b/tests/spec_decode/test_ngram_worker.py
index ee4135015713d..e7e2e87f599dd 100644
--- a/tests/spec_decode/test_ngram_worker.py
+++ b/tests/spec_decode/test_ngram_worker.py
@@ -43,11 +43,13 @@ def test_ngram_algo_correctness_for_single_no_match():
     ]
 
     proposal_len = 5
-    final_seq_lens = [len(prompt) + proposal_len for prompt in prompts]
+    final_prompt_lens = [len(prompt) + proposal_len for prompt in prompts]
     ngram_sampler_output_data = create_execute_model_data(
         seq_group_metadata_list=create_seq_group_metadata_from_prompts(
-            prompts, num_gpu_blocks, block_size,
-            final_seq_lens=final_seq_lens))
+            prompts,
+            num_gpu_blocks,
+            block_size,
+            final_prompt_lens=final_prompt_lens))
 
     proposals = proposer.get_proposals(
         **ngram_sampler_output_data.to_dict(),
@@ -110,11 +112,13 @@ def test_ngram_algo_correctness_for_batches_not_match_all():
     ]
 
     proposal_len = 5
-    final_seq_lens = [len(prompt) + proposal_len for prompt in prompts]
+    final_prompt_lens = [len(prompt) + proposal_len for prompt in prompts]
     ngram_sampler_output_data = create_execute_model_data(
         seq_group_metadata_list=create_seq_group_metadata_from_prompts(
-            prompts, num_gpu_blocks, block_size,
-            final_seq_lens=final_seq_lens))
+            prompts,
+            num_gpu_blocks,
+            block_size,
+            final_prompt_lens=final_prompt_lens))
 
     proposals = proposer.get_proposals(
         **ngram_sampler_output_data.to_dict(),
@@ -180,11 +184,13 @@ def test_ngram_algo_correctness_for_batches_match_all():
     ]
 
     proposal_len = 5
-    final_seq_lens = [len(prompt) + proposal_len for prompt in prompts]
+    final_prompt_lens = [len(prompt) + proposal_len for prompt in prompts]
     ngram_sampler_output_data = create_execute_model_data(
         seq_group_metadata_list=create_seq_group_metadata_from_prompts(
-            prompts, num_gpu_blocks, block_size,
-            final_seq_lens=final_seq_lens))
+            prompts,
+            num_gpu_blocks,
+            block_size,
+            final_prompt_lens=final_prompt_lens))
 
     proposals = proposer.get_proposals(
         **ngram_sampler_output_data.to_dict(),
diff --git a/tests/spec_decode/utils.py b/tests/spec_decode/utils.py
index 4f8295d25cf41..87c7d88a80f42 100644
--- a/tests/spec_decode/utils.py
+++ b/tests/spec_decode/utils.py
@@ -144,7 +144,7 @@ def create_seq_group_metadata_from_prompts(
     prompts: List[List[int]],
     num_gpu_blocks: int,
     block_size: int,
-    final_seq_lens: List[int],
+    final_prompt_lens: List[int],
     continuations: Optional[List[List[int]]] = None,
     seq_ids: Optional[List[int]] = None,
 ) -> List[SequenceGroupMetadata]:
@@ -162,7 +162,7 @@ def create_seq_group_metadata_from_prompts(
             free_gpu_blocks.pop()
             for _ in range(round_up_to_next_block(final_len, block_size))
         ]
-        for i, final_len in enumerate(final_seq_lens)
+        for i, final_len in enumerate(final_prompt_lens)
     }
 
     return [
@@ -251,13 +251,13 @@ def create_batch(batch_size,
     prev_output_tokens = [[
         next(iterator) for _ in range(prev_output_token_len)
     ] for _ in range(batch_size)]
-    final_seq_lens = [
+    final_prompt_lens = [
         len(prompt) + len(prev_output_token) + k + 1
         for prompt, prev_output_token in zip(prompts, prev_output_tokens)
     ]
 
     execute_model_data = create_execute_model_data(
         create_seq_group_metadata_from_prompts(prompts, num_gpu_blocks,
-                                               block_size, final_seq_lens,
+                                               block_size, final_prompt_lens,
                                                prev_output_tokens, seq_ids), )
     return execute_model_data, prompts, prev_output_tokens
diff --git a/tests/test_logits_processor.py b/tests/test_logits_processor.py
index dbaeb4de18258..179e8d25a341b 100644
--- a/tests/test_logits_processor.py
+++ b/tests/test_logits_processor.py
@@ -70,7 +70,7 @@ def pick_ith(token_ids, logits):
         return logits
 
     seq_group_metadata_list = []
-    prompt_lens = []
+    seq_lens = []
     for i in range(batch_size):
         seq_group_metadata_list.append(
             SequenceGroupMetadata(
@@ -81,12 +81,12 @@ def pick_ith(token_ids, logits):
                                                logits_processors=[pick_ith]),
                 block_tables={0: [1]},
             ))
-        prompt_lens.append(seq_group_metadata_list[-1].seq_data[0].get_len())
+        seq_lens.append(seq_group_metadata_list[-1].seq_data[0].get_len())
 
     sampling_metadata = SamplingMetadata.prepare(
         seq_group_metadata_list,
-        prompt_lens,
-        subquery_lens=prompt_lens,
+        seq_lens,
+        query_lens=seq_lens,
         device=model_runner.device,
         pin_memory=model_runner.pin_memory)
     logits_processor_output = logits_processor(
diff --git a/tests/worker/test_model_runner.py b/tests/worker/test_model_runner.py
index 56fe6db589f18..e7975d0ef48b9 100644
--- a/tests/worker/test_model_runner.py
+++ b/tests/worker/test_model_runner.py
@@ -23,14 +23,14 @@ def test_prepare_prompt(batch_size):
                                lora_config=None)
     model_runner.set_block_size(16)
 
-    prompt_lens = []
+    seq_lens = []
     seq_group_metadata_list = []
     block_tables = {0: [1]}
     for i in range(batch_size):
         # make sure all tokens fit into one block
-        prompt_len = i % (model_runner.block_size - 1) + 1
-        prompt_lens.append(prompt_len)
-        seq_data = SequenceData(list(range(prompt_len)))
+        seq_len = i % (model_runner.block_size - 1) + 1
+        seq_lens.append(seq_len)
+        seq_data = SequenceData(list(range(seq_len)))
         seq_group_metadata = SequenceGroupMetadata(
             request_id=f"test_{i}",
             is_prompt=True,
@@ -43,29 +43,29 @@ def test_prepare_prompt(batch_size):
 
     expected_selected_token_indices = []
     selected_token_start_idx = 0
-    for prompt_len in prompt_lens:
+    for seq_len in seq_lens:
         expected_selected_token_indices.append(selected_token_start_idx +
-                                               prompt_len - 1)
-        selected_token_start_idx += prompt_len
-    (input_tokens, input_positions, attn_metadata, return_prompt_lens, _, _, _,
-     _, _,
-     slot_mapping) = (model_runner._prepare_prompt(seq_group_metadata_list))
-    assert return_prompt_lens == prompt_lens
+                                               seq_len - 1)
+        selected_token_start_idx += seq_len
+    (input_tokens, input_positions, attn_metadata, return_seq_lens, _, _, _, _,
+     _, slot_mapping) = (model_runner._prepare_prompt(seq_group_metadata_list))
+    assert return_seq_lens == seq_lens
     assert len(slot_mapping) == len(input_tokens)
 
     # Verify input metadata is correct for prompts.
     device = model_runner.device
     assert attn_metadata.is_prompt is True
-    assert torch.allclose(attn_metadata.prompt_lens_tensor,
-                          torch.tensor(prompt_lens, device=device))
-    assert attn_metadata.prompt_lens == prompt_lens
-    assert attn_metadata.max_prompt_len == max(prompt_lens)
+    assert torch.allclose(
+        attn_metadata.seq_lens_tensor,
+        torch.tensor(seq_lens, device=device, dtype=torch.int))
+    assert attn_metadata.seq_lens == seq_lens
+    assert attn_metadata.max_seq_len == max(seq_lens)
 
     # Test subquery start locs.
     start_idx = 0
     start_loc = [start_idx]
-    for prompt_len in prompt_lens:
-        start_idx += prompt_len
+    for seq_len in seq_lens:
+        start_idx += seq_len
         start_loc.append(start_idx)
     assert torch.allclose(
         attn_metadata.subquery_start_loc,
@@ -75,17 +75,16 @@ def test_prepare_prompt(batch_size):
     # equivalent to subquery_start_loc.
     start_idx = 0
     seq_start_loc = [start_idx]
-    for prompt_len in prompt_lens:
-        start_idx += prompt_len
+    for seq_len in seq_lens:
+        start_idx += seq_len
         seq_start_loc.append(start_idx)
 
     assert torch.allclose(
         attn_metadata.seq_start_loc,
         torch.tensor(start_loc, dtype=torch.int32, device=device))
-    assert attn_metadata.max_context_len is None
     assert torch.allclose(
-        attn_metadata.context_lens,
-        torch.zeros(attn_metadata.context_lens.shape[0],
+        attn_metadata.context_lens_tensor,
+        torch.zeros(attn_metadata.context_lens_tensor.shape[0],
                     dtype=torch.int,
                     device=device))
 
@@ -96,18 +95,18 @@ def test_prepare_prompt(batch_size):
     # Cuda graph should not be used for prerill.
     assert attn_metadata.use_cuda_graph is False
 
-    assert len(input_tokens) == sum(prompt_lens)
-    assert len(input_positions) == sum(prompt_lens)
+    assert len(input_tokens) == sum(seq_lens)
+    assert len(input_positions) == sum(seq_lens)
     torch.testing.assert_close(input_tokens, input_positions)
 
     sampling_metadata = SamplingMetadata.prepare(
         seq_group_metadata_list,
-        prompt_lens,
-        subquery_lens=prompt_lens,
+        seq_lens,
+        query_lens=seq_lens,
         device=model_runner.device,
         pin_memory=model_runner.pin_memory)
-    assert len(input_tokens) == sum(prompt_lens)
-    assert len(input_positions) == sum(prompt_lens)
+    assert len(input_tokens) == sum(seq_lens)
+    assert len(input_positions) == sum(seq_lens)
     actual = sampling_metadata.selected_token_indices
     expected = torch.tensor(expected_selected_token_indices,
                             device=actual.device,
@@ -146,13 +145,13 @@ def test_prepare_decode_cuda_graph(batch_size):
                                lora_config=None)
     model_runner.set_block_size(16)
 
-    prompt_lens = []
+    seq_lens = []
     seq_group_metadata_list = []
     for i in range(batch_size):
         # make sure all tokens fit into one block
-        prompt_len = i % (model_runner.block_size - 1) + 1
-        prompt_lens.append(prompt_len)
-        seq_data = list(range(prompt_len))
+        seq_len = i % (model_runner.block_size - 1) + 1
+        seq_lens.append(seq_len)
+        seq_data = list(range(seq_len))
         seq_data = SequenceData(seq_data)
         seq_group_metadata = SequenceGroupMetadata(
             request_id=f"test_{i}",
@@ -172,14 +171,13 @@ def test_prepare_decode_cuda_graph(batch_size):
     # Verify input metadata is correct for prompts.
     device = model_runner.device
     assert attn_metadata.is_prompt is False
-    assert attn_metadata.prompt_lens is None
-    assert attn_metadata.max_prompt_len is None
+    assert attn_metadata.seq_lens is None
     assert attn_metadata.subquery_start_loc is None
     assert attn_metadata.seq_start_loc is None
-    assert attn_metadata.max_context_len == max(prompt_lens)
+    assert attn_metadata.max_seq_len == max(seq_lens)
     assert torch.allclose(
-        attn_metadata.context_lens[:len(prompt_lens)],
-        torch.tensor(prompt_lens, dtype=torch.int, device=device))
+        attn_metadata.seq_lens_tensor[:len(seq_lens)],
+        torch.tensor(seq_lens, dtype=torch.int, device=device))
 
     # block table's first index corresponds to each batch, meaning in
     # decoding it is each token.
@@ -198,13 +196,13 @@ def test_prepare_decode_cuda_graph(batch_size):
     # Verify Sampling
     expected_selected_token_indices = []
     selected_token_start_idx = 0
-    for prompt_len in prompt_lens:
+    for seq_len in seq_lens:
         expected_selected_token_indices.append(selected_token_start_idx)
         selected_token_start_idx += 1
     sampling_metadata = SamplingMetadata.prepare(
         seq_group_metadata_list,
-        prompt_lens,
-        subquery_lens=prompt_lens,
+        seq_lens,
+        query_lens=seq_lens,
         device=model_runner.device,
         pin_memory=model_runner.pin_memory)
     actual = sampling_metadata.selected_token_indices
@@ -241,14 +239,13 @@ def test_empty_seq_group():
     assert attn_metadata is None
     assert len(slot_mapping) == 0
 
-    (input_tokens, input_positions, attn_metadata, return_prompt_lens, _, _, _,
-     _, _,
-     slot_mapping) = (model_runner._prepare_prompt(seq_group_metadata_list))
+    (input_tokens, input_positions, attn_metadata, return_seq_lens, _, _, _, _,
+     _, slot_mapping) = (model_runner._prepare_prompt(seq_group_metadata_list))
     assert len(input_tokens) == 0
     assert len(input_positions) == 0
     assert attn_metadata is None
     assert len(slot_mapping) == 0
-    assert len(return_prompt_lens) == 0
+    assert len(return_seq_lens) == 0
 
 
 @pytest.fixture
@@ -288,7 +285,7 @@ def test_hybrid_batches(batch_size, enforce_eager, distributed_init):
     model_runner.set_block_size(16)
 
     # Add prefill requests.
-    prompt_lens = []
+    seq_lens = []
     seq_group_metadata_list = []
     prefill_metadata_list = []
     decode_metadata_list = []
@@ -297,9 +294,9 @@ def test_hybrid_batches(batch_size, enforce_eager, distributed_init):
     decode_batch_size = batch_size - prefill_batch_size
     for i in range(prefill_batch_size):
         # make sure all tokens fit into one block
-        prompt_len = i % (model_runner.block_size - 1) + 1
-        prompt_lens.append(prompt_len)
-        seq_data = SequenceData(list(range(prompt_len)))
+        seq_len = i % (model_runner.block_size - 1) + 1
+        seq_lens.append(seq_len)
+        seq_data = SequenceData(list(range(seq_len)))
         seq_group_metadata = SequenceGroupMetadata(
             request_id=f"test_{i}",
             is_prompt=True,
@@ -314,8 +311,8 @@ def test_hybrid_batches(batch_size, enforce_eager, distributed_init):
     # Add decode requests
     for i in range(prefill_batch_size, batch_size):
         # make sure all tokens fit into one block
-        prompt_len = i % (model_runner.block_size - 1) + 1
-        prompt_toks = list(range(prompt_len))
+        seq_len = i % (model_runner.block_size - 1) + 1
+        prompt_toks = list(range(seq_len))
         seq_data = SequenceData(prompt_toks)
         seq_group_metadata = SequenceGroupMetadata(
             request_id=f"test_{i}",
@@ -343,7 +340,7 @@ def test_hybrid_batches(batch_size, enforce_eager, distributed_init):
     else:
         assert attn_metadata.num_decode_tokens == _get_graph_batch_size(
             decode_batch_size)
-    assert attn_metadata.num_prefill_tokens == sum(prompt_lens)
+    assert attn_metadata.num_prefill_tokens == sum(seq_lens)
 
     # Verify attn metadata is consistent. We don't need to test individual
     # values here because they are tested above.
diff --git a/vllm/_custom_ops.py b/vllm/_custom_ops.py
index 3faed5ea85307..b43f646fec88e 100644
--- a/vllm/_custom_ops.py
+++ b/vllm/_custom_ops.py
@@ -39,17 +39,17 @@ def paged_attention_v1(
     num_kv_heads: int,
     scale: float,
     block_tables: torch.Tensor,
-    context_lens: torch.Tensor,
+    seq_lens: torch.Tensor,
     block_size: int,
-    max_context_len: int,
+    max_seq_len: int,
     alibi_slopes: Optional[torch.Tensor],
     kv_cache_dtype: str,
     kv_scale: float,
 ) -> None:
     vllm_ops.paged_attention_v1(out, query, key_cache, value_cache,
-                                num_kv_heads, scale, block_tables,
-                                context_lens, block_size, max_context_len,
-                                alibi_slopes, kv_cache_dtype, kv_scale)
+                                num_kv_heads, scale, block_tables, seq_lens,
+                                block_size, max_seq_len, alibi_slopes,
+                                kv_cache_dtype, kv_scale)
 
 
 def paged_attention_v2(
@@ -63,17 +63,17 @@ def paged_attention_v2(
     num_kv_heads: int,
     scale: float,
     block_tables: torch.Tensor,
-    context_lens: torch.Tensor,
+    seq_lens: torch.Tensor,
     block_size: int,
-    max_context_len: int,
+    max_seq_len: int,
     alibi_slopes: Optional[torch.Tensor],
     kv_cache_dtype: str,
     kv_scale: float,
 ) -> None:
     vllm_ops.paged_attention_v2(out, exp_sum, max_logits, tmp_out, query,
                                 key_cache, value_cache, num_kv_heads, scale,
-                                block_tables, context_lens, block_size,
-                                max_context_len, alibi_slopes, kv_cache_dtype,
+                                block_tables, seq_lens, block_size,
+                                max_seq_len, alibi_slopes, kv_cache_dtype,
                                 kv_scale)
 
 
diff --git a/vllm/attention/backends/flash_attn.py b/vllm/attention/backends/flash_attn.py
index 10b8c19b7499e..fc7501ed5e91f 100644
--- a/vllm/attention/backends/flash_attn.py
+++ b/vllm/attention/backends/flash_attn.py
@@ -66,27 +66,24 @@ class FlashAttentionMetadata(AttentionMetadataPerStage,
     # Currently, input sequences can only contain all prompts
     # or all decoding. True if all sequences are prompts.
     is_prompt: bool
-    # (batch_size,). The prompt length per sequence. None if it is a decoding.
-    prompt_lens: Optional[List[int]]
-    # prompt_lens stored as a tensor.
-    prompt_lens_tensor: Optional[torch.Tensor]
+    # (batch_size,). The sequence length per sequence. Sequence length means
+    # the computed tokens + new tokens None if it is a decoding.
+    seq_lens: Optional[List[int]]
+    # seq_lens stored as a tensor.
+    seq_lens_tensor: Optional[torch.Tensor]
 
-    # NOTE(sang): Definition of context_len, subquery_len, and seqlen.
+    # NOTE(sang): Definition of context_len, query_len, and seq_len.
     # |---------- N-1 iteration --------|
     # |---------------- N iteration ---------------------|
     # |- tokenA -|......................|-- newTokens ---|
     # |---------- context_len ----------|
-    # |-------------------- seqlen ----------------------|
-    #                                   |- subquery_len -|
+    # |-------------------- seq_len ----------------------|
+    #                                   |-- query_len ---|
 
-    # WARNING(sang): context_len has different definition depending on if it is
-    # prefill vs decoding. When it is prefill, it doesn't include new tokens.
-    # When it is for decoding, it includes a new token.
-
-    # Maximum subquery length in the batch.
-    max_subquery_len: Optional[int]
-    # Maximum prompt length in the batch.
-    max_prompt_len: Optional[int]
+    # Maximum query length in the batch.
+    max_query_len: Optional[int]
+    # Maximum sequence length in the batch.
+    max_seq_len: Optional[int]
     # (batch_size + 1,). The cumulative subquery lengths of the sequences in
     # the batch, used to index into subquery. E.g., if the subquery length
     # is [4, 6], it is [0, 4, 10].
@@ -95,6 +92,9 @@ class FlashAttentionMetadata(AttentionMetadataPerStage,
     # the batch, used to index into sequence. E.g., if the sequence length is
     # [4, 6], it is [0, 4, 10].
     seq_start_loc: Optional[torch.Tensor]
+    # (batch_size,) A tensor of context lengths (tokens that are computed
+    # so far).
+    context_lens_tensor: Optional[torch.Tensor]
 
     # Whether or not if cuda graph is enabled.
     # Cuda-graph is currently enabled for decoding only.
@@ -223,8 +223,8 @@ def forward(
                     v=value,
                     cu_seqlens_q=prefill_meta.seq_start_loc,
                     cu_seqlens_k=prefill_meta.seq_start_loc,
-                    max_seqlen_q=prefill_meta.max_prompt_len,
-                    max_seqlen_k=prefill_meta.max_prompt_len,
+                    max_seqlen_q=prefill_meta.max_seq_len,
+                    max_seqlen_k=prefill_meta.max_seq_len,
                     softmax_scale=self.scale,
                     causal=True,
                     window_size=self.sliding_window,
@@ -245,9 +245,9 @@ def forward(
                     value_cache,
                     prefill_meta.block_tables,
                     prefill_meta.subquery_start_loc,
-                    prefill_meta.prompt_lens_tensor,
-                    prefill_meta.context_lens,
-                    prefill_meta.max_subquery_len,
+                    prefill_meta.seq_lens_tensor,
+                    prefill_meta.context_lens_tensor,
+                    prefill_meta.max_query_len,
                     self.alibi_slopes,
                     self.sliding_window[0],
                 )
@@ -258,8 +258,8 @@ def forward(
                 key_cache,
                 value_cache,
                 decode_meta.block_tables,
-                decode_meta.context_lens,
-                decode_meta.max_context_len,
+                decode_meta.seq_lens_tensor,
+                decode_meta.max_seq_len,
                 attn_metadata.kv_cache_dtype,
                 self.num_kv_heads,
                 self.scale,
diff --git a/vllm/attention/backends/rocm_flash_attn.py b/vllm/attention/backends/rocm_flash_attn.py
index 3bc436315c3de..c411b3971b8f1 100644
--- a/vllm/attention/backends/rocm_flash_attn.py
+++ b/vllm/attention/backends/rocm_flash_attn.py
@@ -64,27 +64,24 @@ class ROCmFlashAttentionMetadata(AttentionMetadataPerStage,
     # Currently, input sequences can only contain all prompts
     # or all decoding. True if all sequences are prompts.
     is_prompt: bool
-    # (batch_size,). The prompt length per sequence. None if it is a decoding.
-    prompt_lens: Optional[List[int]]
-    # prompt_lens stored as a tensor.
-    prompt_lens_tensor: Optional[torch.Tensor]
+    # (batch_size,). The sequence length per sequence. Sequence length means
+    # the computed tokens + new tokens. None if it is a decoding.
+    seq_lens: Optional[List[int]]
+    # seq_lens stored as a tensor.
+    seq_lens_tensor: Optional[torch.Tensor]
 
-    # NOTE(sang): Definition of context_len, subquery_len, and seqlen.
+    # NOTE(sang): Definition of context_len, query_len, and seq_len.
     # |---------- N-1 iteration --------|
     # |---------------- N iteration ---------------------|
     # |- tokenA -|......................|-- newTokens ---|
     # |---------- context_len ----------|
-    # |-------------------- seqlen ----------------------|
-    #                                   |- subquery_len -|
+    # |-------------------- seq_len ----------------------|
+    #                                   |-- query_len ---|
 
-    # WARNING(sang): context_len has different definition depending on if it is
-    # prefill vs decoding. When it is prefill, it doesn't include new tokens.
-    # When it is for decoding, it includes a new token.
-
-    # Maximum subquery length in the batch.
-    max_subquery_len: Optional[int]
-    # Maximum prompt length in the batch.
-    max_prompt_len: Optional[int]
+    # Maximum query length in the batch.
+    max_query_len: Optional[int]
+    # Maximum sequence length in the batch.
+    max_seq_len: Optional[int]
     # (batch_size + 1,). The cumulative subquery lengths of the sequences in
     # the batch, used to index into subquery. E.g., if the subquery length
     # is [4, 6], it is [0, 4, 10].
@@ -98,6 +95,9 @@ class ROCmFlashAttentionMetadata(AttentionMetadataPerStage,
     # Cuda-graph is currently enabled for decoding only.
     # TODO(woosuk): Move `use_cuda_graph` out since it's unrelated to attention.
     use_cuda_graph: bool
+    # (batch_size,) A tensor of context lengths (tokens that are computed
+    # so far).
+    context_lens_tensor: Optional[torch.Tensor]
 
 
 class ROCmFlashAttentionImpl(AttentionImpl):
@@ -247,7 +247,7 @@ def forward(
 
         if prefill_meta := attn_metadata.prefill_metadata:
             # Prompt run.
-            assert prefill_meta.prompt_lens is not None
+            assert prefill_meta.seq_lens is not None
             if kv_cache is None or prefill_meta.block_tables.numel() == 0:
                 # triton attention
                 # When block_tables are not filled, it means q and k are the
@@ -260,8 +260,8 @@ def forward(
                         None,
                         prefill_meta.seq_start_loc,
                         prefill_meta.seq_start_loc,
-                        prefill_meta.max_prompt_len,
-                        prefill_meta.max_prompt_len,
+                        prefill_meta.max_seq_len,
+                        prefill_meta.max_seq_len,
                         True,
                         self.scale,
                     )
@@ -274,7 +274,7 @@ def forward(
                         query,
                         key,
                         value,
-                        prefill_meta.prompt_lens,
+                        prefill_meta.seq_lens,
                         self.scale,
                     )
                 else:
@@ -284,8 +284,8 @@ def forward(
                         v=value,
                         cu_seqlens_q=prefill_meta.seq_start_loc,
                         cu_seqlens_k=prefill_meta.seq_start_loc,
-                        max_seqlen_q=prefill_meta.max_prompt_len,
-                        max_seqlen_k=prefill_meta.max_prompt_len,
+                        max_seqlen_q=prefill_meta.max_seq_len,
+                        max_seqlen_k=prefill_meta.max_seq_len,
                         softmax_scale=self.scale,
                         causal=True,
                     )
@@ -303,9 +303,9 @@ def forward(
                     value_cache,
                     prefill_meta.block_tables,
                     prefill_meta.subquery_start_loc,
-                    prefill_meta.prompt_lens_tensor,
-                    prefill_meta.context_lens,
-                    prefill_meta.max_subquery_len,
+                    prefill_meta.seq_lens_tensor,
+                    prefill_meta.context_lens_tensor,
+                    prefill_meta.max_query_len,
                     self.alibi_slopes,
                     self.sliding_window[0],
                 )
@@ -317,8 +317,8 @@ def forward(
                 key_cache,
                 value_cache,
                 decode_meta.block_tables,
-                decode_meta.context_lens,
-                decode_meta.max_context_len,
+                decode_meta.seq_lens_tensor,
+                decode_meta.max_seq_len,
                 attn_metadata.kv_cache_dtype,
                 self.num_kv_heads,
                 self.scale,
@@ -334,13 +334,13 @@ def _naive_attention(
     query: torch.Tensor,
     key: torch.Tensor,
     value: torch.Tensor,
-    prompt_lens: List[int],
+    seq_lens: List[int],
     scale: float,
 ) -> torch.Tensor:
     output = torch.empty_like(query)
     start = 0
-    for _, prompt_len in enumerate(prompt_lens):
-        end = start + prompt_len
+    for _, seq_len in enumerate(seq_lens):
+        end = start + seq_len
         out = _naive_masked_attention(
             query[start:end],
             key[start:end],
@@ -349,7 +349,7 @@ def _naive_attention(
         )
         # TODO(woosuk): Unnecessary copy. Optimize.
         output[start:end].copy_(out)
-        start += prompt_len
+        start += seq_len
 
     return output
 
diff --git a/vllm/attention/backends/torch_sdpa.py b/vllm/attention/backends/torch_sdpa.py
index 55a7ce59ac6e0..f75a279086a26 100644
--- a/vllm/attention/backends/torch_sdpa.py
+++ b/vllm/attention/backends/torch_sdpa.py
@@ -58,7 +58,7 @@ class TorchSDPAMetadata(AttentionMetadata, PagedAttentionMetadata,
     # or all decoding. True if all sequences are prompts.
     is_prompt: bool
     slot_mapping: torch.Tensor
-    prompt_lens: Optional[List[int]]
+    seq_lens: Optional[List[int]]
 
     def __post_init__(self):
         # Set during the execution of the first attention op.
@@ -136,7 +136,7 @@ def forward(
                                                 kv_scale)
 
         if attn_metadata.is_prompt:
-            assert attn_metadata.prompt_lens is not None
+            assert attn_metadata.seq_lens is not None
             if (kv_cache is None or attn_metadata.block_tables.numel() == 0):
                 if self.num_kv_heads != self.num_heads:
                     key = key.repeat_interleave(self.num_queries_per_kv, dim=1)
@@ -147,13 +147,13 @@ def forward(
                     if self.alibi_slopes is not None:
                         att_masks = _make_alibi_bias(
                             self.alibi_slopes, query.dtype,
-                            attn_metadata.prompt_lens)  # type: ignore
+                            attn_metadata.seq_lens)  # type: ignore
                     elif self.sliding_window is not None:
                         att_masks = _make_sliding_window_bias(
-                            attn_metadata.prompt_lens, self.sliding_window,
+                            attn_metadata.seq_lens, self.sliding_window,
                             query.dtype)  # type: ignore
                     else:
-                        att_masks = [None] * len(attn_metadata.prompt_lens)
+                        att_masks = [None] * len(attn_metadata.seq_lens)
                     attn_metadata.attn_bias = att_masks
 
                 query = query.movedim(0, query.dim() - 2)
@@ -164,9 +164,9 @@ def forward(
                 output = torch.empty(
                     (num_tokens, self.num_heads, self.head_size),
                     dtype=query.dtype)
-                for prompt_len, mask in zip(attn_metadata.prompt_lens,
-                                            attn_metadata.attn_bias):
-                    end = start + prompt_len
+                for seq_len, mask in zip(attn_metadata.seq_lens,
+                                         attn_metadata.attn_bias):
+                    end = start + seq_len
                     sub_out = scaled_dot_product_attention(
                         query[:, start:end, :],
                         key[:, start:end, :],
@@ -189,8 +189,8 @@ def forward(
                 key_cache,
                 value_cache,
                 attn_metadata.block_tables,
-                attn_metadata.context_lens,
-                attn_metadata.max_context_len,
+                attn_metadata.seq_lens_tensor,
+                attn_metadata.max_seq_len,
                 attn_metadata.kv_cache_dtype,
                 self.num_kv_heads,
                 self.scale,
@@ -205,13 +205,13 @@ def forward(
 def _make_alibi_bias(
     alibi_slopes: torch.Tensor,
     dtype: torch.dtype,
-    prompt_lens: List[int],
+    seq_lens: List[int],
 ) -> List[torch.Tensor]:
     attn_biases = []
-    for prompt_len in prompt_lens:
-        bias = torch.arange(prompt_len, dtype=dtype)
+    for seq_len in seq_lens:
+        bias = torch.arange(seq_len, dtype=dtype)
         # NOTE(zhuohan): HF uses
-        #     `bias = bias[None, :].repeat(prompt_len, 1)`
+        #     `bias = bias[None, :].repeat(seq_len, 1)`
         # here. We find that both biases give the same results, but
         # the bias below more accurately follows the original ALiBi
         # paper.
@@ -221,7 +221,7 @@ def _make_alibi_bias(
         bias = bias[None, :].repeat((num_heads, 1, 1))
         bias.mul_(alibi_slopes[:, None, None])
         inf_mask = torch.empty(
-            (1, prompt_len, prompt_len),
+            (1, seq_len, seq_len),
             dtype=bias.dtype).fill_(-torch.inf).triu_(diagonal=1)
         attn_biases.append((bias + inf_mask).to(dtype))
 
@@ -229,14 +229,14 @@ def _make_alibi_bias(
 
 
 def _make_sliding_window_bias(
-    prompt_lens: List[int],
+    seq_lens: List[int],
     window_size: Optional[int],
     dtype: torch.dtype,
 ) -> List[torch.Tensor]:
     attn_biases = []
-    for prompt_len in prompt_lens:
+    for seq_len in seq_lens:
         tensor = torch.full(
-            (1, prompt_len, prompt_len),
+            (1, seq_len, seq_len),
             dtype=dtype,
             fill_value=1,
         )
diff --git a/vllm/attention/backends/xformers.py b/vllm/attention/backends/xformers.py
index dc64ac0bf985d..60f6d43f2eaa4 100644
--- a/vllm/attention/backends/xformers.py
+++ b/vllm/attention/backends/xformers.py
@@ -66,28 +66,24 @@ class XFormersMetadata(AttentionMetadataPerStage, PagedAttentionMetadata):
     # Currently, input sequences can only contain all prompts
     # or all decoding. True if all sequences are prompts.
     is_prompt: bool
-    # (batch_size,). The prompt length per sequence. None if it is a decoding.
-    prompt_lens: Optional[List[int]]
-    # prompt_lens stored as a tensor.
-    prompt_lens_tensor: Optional[torch.Tensor]
+    # (batch_size,). The sequence length per sequence. Sequence length means
+    # the computed tokens + new tokens. None if it is a decoding.
+    seq_lens: Optional[List[int]]
+    # seq_lens stored as a tensor.
+    seq_lens_tensor: Optional[torch.Tensor]
 
-    # NOTE(sang): Definition of context_len, subquery_len, and seqlen.
     # |---------- N-1 iteration --------|
     # |---------------- N iteration ---------------------|
     # |- tokenA -|......................|-- newTokens ---|
     # |---------- context_len ----------|
-    # |-------------------- seqlen ----------------------|
-    #                                   |- subquery_len -|
+    # |-------------------- seq_len ----------------------|
+    #                                   |-- query_len ---|
 
-    # WARNING(sang): context_len has different definition depending on if it is
-    # prefill vs decoding. When it is prefill, it doesn't include new tokens.
-    # When it is for decoding, it includes a new token.
-
-    # Maximum subquery length in the batch.
-    max_subquery_len: Optional[int]
+    # Maximum query length in the batch.
+    max_query_len: Optional[int]
     # FIXME: It is for flash attn.
-    # Maximum prompt length in the batch.
-    max_prompt_len: Optional[int]
+    # Maximum sequence length in the batch.
+    max_seq_len: Optional[int]
     # (batch_size + 1,). The cumulative subquery lengths of the sequences in
     # the batch, used to index into subquery. E.g., if the subquery length
     # is [4, 6], it is [0, 4, 10].
@@ -97,6 +93,9 @@ class XFormersMetadata(AttentionMetadataPerStage, PagedAttentionMetadata):
     # the batch, used to index into sequence. E.g., if the sequence length is
     # [4, 6], it is [0, 4, 10].
     seq_start_loc: Optional[torch.Tensor]
+    # (batch_size,) A tensor of context lengths (tokens that are computed
+    # so far).
+    context_lens_tensor: Optional[torch.Tensor]
 
     # Whether or not if cuda graph is enabled.
     # Cuda-graph is currently enabled for decoding only.
@@ -242,9 +241,9 @@ def forward(
                     value_cache,
                     prefill_meta.block_tables,
                     prefill_meta.subquery_start_loc,
-                    prefill_meta.prompt_lens_tensor,
-                    prefill_meta.context_lens,
-                    prefill_meta.max_subquery_len,
+                    prefill_meta.seq_lens_tensor,
+                    prefill_meta.context_lens_tensor,
+                    prefill_meta.max_query_len,
                     self.alibi_slopes,
                     self.sliding_window,
                 )
@@ -257,8 +256,8 @@ def forward(
                 key_cache,
                 value_cache,
                 decode_meta.block_tables,
-                decode_meta.context_lens,
-                decode_meta.max_context_len,
+                decode_meta.seq_lens_tensor,
+                decode_meta.max_seq_len,
                 attn_metadata.kv_cache_dtype,
                 self.num_kv_heads,
                 self.scale,
@@ -289,7 +288,7 @@ def _run_memory_efficient_xformers_forward(
             value: shape = [num_prefill_tokens, num_kv_heads, head_size]
             attn_metadata: Metadata for attention.
         """
-        assert attn_metadata.prompt_lens is not None
+        assert attn_metadata.seq_lens is not None
         original_query = query
         if self.num_kv_heads != self.num_heads:
             # GQA/MQA requires the shape [B, M, G, H, K].
@@ -310,7 +309,7 @@ def _run_memory_efficient_xformers_forward(
         if attn_metadata.attn_bias is None:
             if self.alibi_slopes is None:
                 attn_bias = BlockDiagonalCausalMask.from_seqlens(
-                    attn_metadata.prompt_lens)
+                    attn_metadata.seq_lens)
                 if self.sliding_window is not None:
                     attn_bias = attn_bias.make_local_attention(
                         self.sliding_window)
@@ -318,7 +317,7 @@ def _run_memory_efficient_xformers_forward(
             else:
                 attn_metadata.attn_bias = _make_alibi_bias(
                     self.alibi_slopes, self.num_kv_heads, query.dtype,
-                    attn_metadata.prompt_lens)
+                    attn_metadata.seq_lens)
 
         # No alibi slopes.
         # TODO(woosuk): Too many view operations. Let's try to reduce
@@ -343,8 +342,8 @@ def _run_memory_efficient_xformers_forward(
         # one. This is inefficient, especially when we have many short prompts.
         output = torch.empty_like(original_query)
         start = 0
-        for i, prompt_len in enumerate(attn_metadata.prompt_lens):
-            end = start + prompt_len
+        for i, seq_len in enumerate(attn_metadata.seq_lens):
+            end = start + seq_len
             out = xops.memory_efficient_attention_forward(
                 query[None, start:end],
                 key[None, start:end],
@@ -354,7 +353,7 @@ def _run_memory_efficient_xformers_forward(
                 scale=self.scale)
             # TODO(woosuk): Unnecessary copy. Optimize.
             output[start:end].copy_(out.view_as(original_query[start:end]))
-            start += prompt_len
+            start += seq_len
         return output
 
 
@@ -362,13 +361,13 @@ def _make_alibi_bias(
     alibi_slopes: torch.Tensor,
     num_kv_heads: int,
     dtype: torch.dtype,
-    prompt_lens: List[int],
+    seq_lens: List[int],
 ) -> LowerTriangularMaskWithTensorBias:
     attn_biases = []
-    for prompt_len in prompt_lens:
-        bias = torch.arange(prompt_len, dtype=dtype)
+    for seq_len in seq_lens:
+        bias = torch.arange(seq_len, dtype=dtype)
         # NOTE(zhuohan): HF uses
-        #     `bias = bias[None, :].repeat(prompt_len, 1)`
+        #     `bias = bias[None, :].repeat(seq_len, 1)`
         # here. We find that both biases give the same results, but
         # the bias below more accurately follows the original ALiBi
         # paper.
@@ -376,16 +375,16 @@ def _make_alibi_bias(
         # element.
         bias = bias[None, :] - bias[:, None]
 
-        padded_len = (prompt_len + 7) // 8 * 8
+        padded_len = (seq_len + 7) // 8 * 8
         num_heads = alibi_slopes.shape[0]
         bias = torch.empty(
             1,  # batch size
             num_heads,
-            prompt_len,
+            seq_len,
             padded_len,
             device=alibi_slopes.device,
             dtype=dtype,
-        )[:, :, :, :prompt_len].copy_(bias)
+        )[:, :, :, :seq_len].copy_(bias)
         bias.mul_(alibi_slopes[:, None, None])
         if num_heads != num_kv_heads:
             bias = bias.unflatten(1, (num_kv_heads, num_heads // num_kv_heads))
diff --git a/vllm/attention/ops/paged_attn.py b/vllm/attention/ops/paged_attn.py
index c20b94ac8315b..00a0f10c0950b 100644
--- a/vllm/attention/ops/paged_attn.py
+++ b/vllm/attention/ops/paged_attn.py
@@ -13,12 +13,11 @@
 @dataclass
 class PagedAttentionMetadata:
     """Metadata for PagedAttention."""
-    # (batch_size,). The length of context (tokens stored in KV cache) per
-    # sequence. WARNING: When it is a prefill request, it doesn't include new
-    # tokens. When it is for decoding, it includes a new token.
-    context_lens: Optional[torch.Tensor]
-    # Maximum context length in the batch.
-    max_context_len: Optional[int]
+    # (batch_size,). The length of sequences (entire tokens seen so far) per
+    # sequence.
+    seq_lens_tensor: Optional[torch.Tensor]
+    # Maximum sequence length in the batch.
+    max_seq_len: Optional[int]
     # (batch_size, max_blocks_per_seq).
     # Block addresses per sequence. (Seq id -> list of physical block)
     # E.g., [0, 1, 2] means tokens are stored in 0th, 1st, and 2nd blocks
@@ -85,8 +84,8 @@ def forward_decode(
         key_cache: torch.Tensor,
         value_cache: torch.Tensor,
         block_tables: torch.Tensor,
-        context_lens: torch.Tensor,
-        max_context_len: int,
+        seq_lens: torch.Tensor,
+        max_seq_len: int,
         kv_cache_dtype: str,
         num_kv_heads: int,
         scale: float,
@@ -97,7 +96,7 @@ def forward_decode(
 
         block_size = value_cache.shape[3]
         num_seqs, num_heads, head_size = query.shape
-        max_num_partitions = ((max_context_len + _PARTITION_SIZE - 1) //
+        max_num_partitions = ((max_seq_len + _PARTITION_SIZE - 1) //
                               _PARTITION_SIZE)
         # NOTE(woosuk): We use a simple heuristic to decide whether to use
         # PagedAttention V1 or V2. If the number of partitions is 1, we use
@@ -106,7 +105,7 @@ def forward_decode(
         # to parallelize.
         # TODO(woosuk): Tune this heuristic.
         # For context len > 8192, use V2 kernel to avoid shared memory shortage.
-        use_v1 = (max_context_len <= 8192
+        use_v1 = (max_seq_len <= 8192
                   and (max_num_partitions == 1 or num_seqs * num_heads > 512))
         if use_v1:
             # Run PagedAttention V1.
@@ -118,9 +117,9 @@ def forward_decode(
                 num_kv_heads,
                 scale,
                 block_tables,
-                context_lens,
+                seq_lens,
                 block_size,
-                max_context_len,
+                max_seq_len,
                 alibi_slopes,
                 kv_cache_dtype,
                 kv_scale,
@@ -150,9 +149,9 @@ def forward_decode(
                 num_kv_heads,
                 scale,
                 block_tables,
-                context_lens,
+                seq_lens,
                 block_size,
-                max_context_len,
+                max_seq_len,
                 alibi_slopes,
                 kv_cache_dtype,
                 kv_scale,
@@ -168,9 +167,9 @@ def forward_prefix(
         value_cache: torch.Tensor,
         block_tables: torch.Tensor,
         subquery_start_loc: torch.Tensor,
-        prompt_lens_tensor: torch.Tensor,
+        seq_lens_tensor: torch.Tensor,
         context_lens: torch.Tensor,
-        max_subquery_len: int,
+        max_query_len: int,
         alibi_slopes: Optional[torch.Tensor],
         sliding_window: Optional[int],
     ) -> torch.Tensor:
@@ -185,9 +184,9 @@ def forward_prefix(
             block_tables,
             # subquery_start_loc is (batch_size + 1,)
             subquery_start_loc[:-1],
-            prompt_lens_tensor,
+            seq_lens_tensor,
             context_lens,
-            max_subquery_len,
+            max_query_len,
             alibi_slopes,
             sliding_window,
         )
diff --git a/vllm/config.py b/vllm/config.py
index aaa2f60739d55..3bdd3f774bc27 100644
--- a/vllm/config.py
+++ b/vllm/config.py
@@ -63,7 +63,10 @@ class ModelConfig:
             If False, we will use CUDA graph and eager execution in hybrid.
         max_context_len_to_capture: Maximum context len covered by CUDA graphs.
             When a sequence has context length larger than this, we fall back
-            to eager mode.
+            to eager mode (DEPRECATED. Use max_seq_len_to_capture instead).
+        max_seq_len_to_capture: Maximum sequence len covered by CUDA graphs.
+            When a sequence has context length larger than this, we fall back
+            to eager mode.
         skip_tokenizer_init: If true, skip initialization of tokenizer and
             detokenizer.
     """
@@ -84,6 +87,7 @@ def __init__(
         quantization_param_path: Optional[str] = None,
         enforce_eager: bool = False,
         max_context_len_to_capture: Optional[int] = None,
+        max_seq_len_to_capture: Optional[int] = None,
         max_logprobs: int = 5,
         skip_tokenizer_init: bool = False,
     ) -> None:
@@ -99,6 +103,11 @@ def __init__(
         self.quantization_param_path = quantization_param_path
         self.enforce_eager = enforce_eager
         self.max_context_len_to_capture = max_context_len_to_capture
+        if self.max_context_len_to_capture is not None:
+            raise ValueError("`max_context_len_to_capture` is deprecated. "
+                             "Use `max_seq_len_to_capture` instead.")
+        self.max_seq_len_to_capture = (max_seq_len_to_capture
+                                       or max_context_len_to_capture)
         self.max_logprobs = max_logprobs
         self.skip_tokenizer_init = skip_tokenizer_init
 
@@ -190,10 +199,10 @@ def _verify_quantization(self) -> None:
                     "non-quantized models.", self.quantization)
 
     def _verify_cuda_graph(self) -> None:
-        if self.max_context_len_to_capture is None:
-            self.max_context_len_to_capture = self.max_model_len
-        self.max_context_len_to_capture = min(self.max_context_len_to_capture,
-                                              self.max_model_len)
+        if self.max_seq_len_to_capture is None:
+            self.max_seq_len_to_capture = self.max_model_len
+        self.max_seq_len_to_capture = min(self.max_seq_len_to_capture,
+                                          self.max_model_len)
 
     def verify_with_parallel_config(
         self,
@@ -772,8 +781,8 @@ def maybe_create_spec_config(
                 max_model_len=None,
                 quantization=draft_quantization,
                 enforce_eager=target_model_config.enforce_eager,
-                max_context_len_to_capture=target_model_config.
-                max_context_len_to_capture,
+                max_seq_len_to_capture=target_model_config.
+                max_seq_len_to_capture,
                 max_logprobs=target_model_config.max_logprobs,
             )
 
diff --git a/vllm/engine/arg_utils.py b/vllm/engine/arg_utils.py
index 7637616ae6089..1c8e1079bed58 100644
--- a/vllm/engine/arg_utils.py
+++ b/vllm/engine/arg_utils.py
@@ -44,7 +44,8 @@ class EngineArgs:
     tokenizer_revision: Optional[str] = None
     quantization: Optional[str] = None
     enforce_eager: bool = False
-    max_context_len_to_capture: int = 8192
+    max_context_len_to_capture: Optional[int] = None
+    max_seq_len_to_capture: int = 8192
     disable_custom_all_reduce: bool = False
     tokenizer_pool_size: int = 0
     tokenizer_pool_type: str = "ray"
@@ -322,6 +323,14 @@ def add_cli_args(
                             default=EngineArgs.max_context_len_to_capture,
                             help='Maximum context length covered by CUDA '
                             'graphs. When a sequence has context length '
+                            'larger than this, we fall back to eager mode. '
+                            '(DEPRECATED. Use --max-seq_len-to-capture instead'
+                            ')')
+        parser.add_argument('--max-seq_len-to-capture',
+                            type=int,
+                            default=EngineArgs.max_seq_len_to_capture,
+                            help='Maximum sequence length covered by CUDA '
+                            'graphs. When a sequence has context length '
                             'larger than this, we fall back to eager mode.')
         parser.add_argument('--disable-custom-all-reduce',
                             action='store_true',
@@ -492,7 +501,8 @@ def create_engine_config(self, ) -> EngineConfig:
             self.code_revision, self.tokenizer_revision, self.max_model_len,
             self.quantization, self.quantization_param_path,
             self.enforce_eager, self.max_context_len_to_capture,
-            self.max_logprobs, self.skip_tokenizer_init)
+            self.max_seq_len_to_capture, self.max_logprobs,
+            self.skip_tokenizer_init)
         cache_config = CacheConfig(self.block_size,
                                    self.gpu_memory_utilization,
                                    self.swap_space, self.kv_cache_dtype,
diff --git a/vllm/entrypoints/llm.py b/vllm/entrypoints/llm.py
index b022707794a78..3ed660e183360 100644
--- a/vllm/entrypoints/llm.py
+++ b/vllm/entrypoints/llm.py
@@ -69,6 +69,9 @@ class LLM:
             disable CUDA graph and always execute the model in eager mode.
             If False, we will use CUDA graph and eager execution in hybrid.
         max_context_len_to_capture: Maximum context len covered by CUDA graphs.
+            When a sequence has context length larger than this, we fall back
+            to eager mode (DEPRECATED. Use `max_seq_len_to_capture` instead).
+        max_seq_len_to_capture: Maximum sequence len covered by CUDA graphs.
             When a sequence has context length larger than this, we fall back
             to eager mode.
         disable_custom_all_reduce: See ParallelConfig
@@ -90,7 +93,8 @@ def __init__(
         gpu_memory_utilization: float = 0.9,
         swap_space: int = 4,
         enforce_eager: bool = False,
-        max_context_len_to_capture: int = 8192,
+        max_context_len_to_capture: Optional[int] = None,
+        max_seq_len_to_capture: int = 8192,
         disable_custom_all_reduce: bool = False,
         **kwargs,
     ) -> None:
@@ -112,6 +116,7 @@ def __init__(
             swap_space=swap_space,
             enforce_eager=enforce_eager,
             max_context_len_to_capture=max_context_len_to_capture,
+            max_seq_len_to_capture=max_seq_len_to_capture,
             disable_custom_all_reduce=disable_custom_all_reduce,
             **kwargs,
         )
diff --git a/vllm/model_executor/layers/sampler.py b/vllm/model_executor/layers/sampler.py
index d79c99e5d0a45..2de7763605dfc 100644
--- a/vllm/model_executor/layers/sampler.py
+++ b/vllm/model_executor/layers/sampler.py
@@ -1033,8 +1033,8 @@ def _get_next_prompt_tokens(seq_group: SequenceGroupToSample) -> List[int]:
     assert seq_group.is_prompt, (
         "Caller should ensure the sequence group is in a prefill stage.")
     seq_ids = seq_group.seq_ids
-    subquery_len = seq_group.subquery_len
-    assert subquery_len is not None
+    query_len = seq_group.query_len
+    assert query_len is not None
     # prompt has only 1 seq id.
     assert len(seq_ids) == 1
     seq_data = seq_group.seq_data[seq_ids[0]]
@@ -1042,7 +1042,7 @@ def _get_next_prompt_tokens(seq_group: SequenceGroupToSample) -> List[int]:
     prompt_tokens = seq_data.prompt_token_ids
     # +1 because we are looking for a next prompt token.
     next_token_index_start = computed_len + 1
-    next_token_index_end = min(computed_len + subquery_len + 1,
+    next_token_index_end = min(computed_len + query_len + 1,
                                len(prompt_tokens))
     next_prompt_tokens = prompt_tokens[
         next_token_index_start:next_token_index_end]
diff --git a/vllm/model_executor/sampling_metadata.py b/vllm/model_executor/sampling_metadata.py
index 12156b2ba1aa2..9969c45963e9a 100644
--- a/vllm/model_executor/sampling_metadata.py
+++ b/vllm/model_executor/sampling_metadata.py
@@ -16,17 +16,26 @@
 
 @dataclass
 class SequenceGroupToSample:
+    # |---------- N-1 iteration --------|
+    # |---------------- N iteration ---------------------|
+    # |- tokenA -|......................|-- newTokens ---|
+    # |---------- context_len ----------|
+    # |-------------------- seq_len ----------------------|
+    #                                   |-- query_len ---|
+
     # Sequence ids for the sequence group in a previous step.
     seq_ids: List[int]
     sampling_params: SamplingParams
     # seq_id -> sequence data.
     seq_data: Dict[int, SequenceData]
-    # The length of the prompt of the sequence group. None if it is in a decode
+    # The length of the sequence (all tokens seen in the past + new token to
+    # compute attention) of the sequence group. None if it is in a decode
     # stage.
-    prompt_len: Optional[int]
-    # The length of the query tokens to compute in the current step. None if it
-    # is in a decode stage. The length of subquery_len <= prompt_len.
-    subquery_len: Optional[int]
+    seq_len: Optional[int]
+    # The length of new query tokens to compute in the current step. None if it
+    # is in a decode stage. query_len can be shorter than seq_len when
+    # prefill is chunked or prefix cached.
+    query_len: Optional[int]
     # A random number generator for sampling.
     generator: Optional[torch.Generator]
     # True if the sequence group is in prefill stage. False if it is in a
@@ -46,8 +55,8 @@ def __post_init__(self):
         if len(self.prompt_logprob_indices) > 0:
             assert self.sampling_params.prompt_logprobs is not None
         if self.is_prompt:
-            assert self.prompt_len is not None
-            assert self.subquery_len is not None
+            assert self.seq_len is not None
+            assert self.query_len is not None
 
 
 class SamplingMetadata:
@@ -94,8 +103,8 @@ def __init__(
     @staticmethod
     def prepare(
         seq_group_metadata_list: List[SequenceGroupMetadata],
-        prompt_lens: List[int],
-        subquery_lens: Optional[List[int]],
+        seq_lens: List[int],
+        query_lens: Optional[List[int]],
         device: str,
         pin_memory: bool,
     ) -> "SamplingMetadata":
@@ -104,8 +113,8 @@ def prepare(
             selected_token_indices,
             categorized_sample_indices,
             num_prompts,
-        ) = _prepare_seq_groups(seq_group_metadata_list, prompt_lens,
-                                subquery_lens, device)
+        ) = _prepare_seq_groups(seq_group_metadata_list, seq_lens, query_lens,
+                                device)
         selected_token_indices = async_tensor_h2d(selected_token_indices,
                                                   dtype=torch.long,
                                                   target_device=device,
@@ -137,8 +146,8 @@ def __repr__(self) -> str:
 
 def _prepare_seq_groups(
     seq_group_metadata_list: List[SequenceGroupMetadata],
-    prompt_lens: List[int],
-    subquery_lens: Optional[List[int]],
+    seq_lens: List[int],
+    query_lens: Optional[List[int]],
     device: str,
 ) -> Tuple[List[SequenceGroupToSample], List[int], Dict[
         SamplingType, List[Tuple[int, int]]], int]:
@@ -146,9 +155,9 @@ def _prepare_seq_groups(
 
     Args:
         seq_group_metadata_list: A list of sequence group to batch.
-        prompt_lens: A list of prompt lens per sequence group.
+        seq_lens: A list of sequence lens per sequence group.
             Index of prompt len should match with seq_group_metadata_list.
-        subquery_lens: A list of query lengths. Prompt lens include the length
+        query_lens: A list of query lengths. Sequence lens include the length
             of entire prompt tokens, and it could be shorter.
         device: A device to use for random number generator,
             `SequenceGroupToSample.generator`.
@@ -189,8 +198,8 @@ def _prepare_seq_groups(
         is_prompt = seq_group_metadata.is_prompt
         generator: Optional[torch.Generator] = None
         # If the current seq group is in decode stage, it is None.
-        prompt_len: Optional[int] = None
-        subquery_len: Optional[int] = None
+        seq_len: Optional[int] = None
+        query_len: Optional[int] = None
         prompt_logprob_indices: List[int] = []
         sample_indices: List[int] = []
         do_sample = seq_group_metadata.do_sample
@@ -203,12 +212,12 @@ def _prepare_seq_groups(
             num_prompts += 1
             num_prefill_sample = len(seq_ids)
             assert num_prefill_sample == 1
-            assert subquery_lens is not None and prompt_lens is not None
-            subquery_len, prompt_len = subquery_lens[i], prompt_lens[i]
+            assert query_lens is not None and seq_lens is not None
+            query_len, seq_len = query_lens[i], seq_lens[i]
             # If we need sampling, exclude num_prefill_sample tokens from
             # prompt logprob.
-            prompt_logprob_len = (subquery_len - num_prefill_sample
-                                  if do_sample else subquery_len)
+            prompt_logprob_len = (query_len - num_prefill_sample
+                                  if do_sample else query_len)
             sample_len = num_prefill_sample if do_sample else 0
         else:
             # Decode
@@ -267,8 +276,8 @@ def sample(logits):
                 seq_ids=seq_ids,
                 sampling_params=sampling_params,
                 seq_data=seq_group_metadata.seq_data,
-                prompt_len=prompt_len,
-                subquery_len=subquery_len,
+                seq_len=seq_len,
+                query_len=query_len,
                 generator=generator,
                 is_prompt=is_prompt,
                 prompt_logprob_indices=list(prompt_logprob_indices),
@@ -367,8 +376,8 @@ def from_sampling_metadata(
                     and sampling_params.prompt_logprobs is not None):
                 # For tokens in the prompt that we only need to get
                 # their logprobs
-                subquery_len = seq_group.subquery_len
-                assert subquery_len is not None
+                query_len = seq_group.query_len
+                assert query_len is not None
                 prefill_len = len(seq_group.prompt_logprob_indices)
                 temperatures += [temperature] * prefill_len
                 top_ps += [top_p] * prefill_len
@@ -397,8 +406,8 @@ def from_sampling_metadata(
 
             if is_prompt:
                 prompt_best_of.append(sampling_params.best_of)
-                subquery_len = seq_group.subquery_len
-                assert subquery_len is not None
+                query_len = seq_group.query_len
+                assert query_len is not None
 
             for seq_id in seq_ids:
                 seq_data = seq_group.seq_data[seq_id]
diff --git a/vllm/worker/cpu_model_runner.py b/vllm/worker/cpu_model_runner.py
index 34d7d3dffea18..193b021b7a11e 100644
--- a/vllm/worker/cpu_model_runner.py
+++ b/vllm/worker/cpu_model_runner.py
@@ -80,7 +80,7 @@ def _prepare_prompt(
         input_tokens: List[int] = []
         input_positions: List[int] = []
         slot_mapping: List[int] = []
-        prompt_lens: List[int] = []
+        seq_lens: List[int] = []
         multi_modal_input_list: List[torch.Tensor] = []
 
         for seq_group_metadata in seq_group_metadata_list:
@@ -92,15 +92,15 @@ def _prepare_prompt(
             seq_data = seq_group_metadata.seq_data[seq_id]
             prompt_tokens = seq_data.get_token_ids()
             computed_len = seq_data.get_num_computed_tokens()
-            prompt_len = len(prompt_tokens)
+            seq_len = len(prompt_tokens)
 
-            prompt_lens.append(prompt_len)  # Prompt token num
+            seq_lens.append(seq_len)  # Prompt token num
             input_tokens.extend(prompt_tokens)  # Token ids
 
             # Token position ids
             # NOTE(woosuk): Here we assume that the first token in the prompt
             # is always the first token in the sequence.
-            input_positions.extend(list(range(computed_len, prompt_len)))
+            input_positions.extend(list(range(computed_len, seq_len)))
 
             if seq_group_metadata.multi_modal_data:
                 multi_modal_input_list.append(
@@ -109,15 +109,15 @@ def _prepare_prompt(
             # Compute the slot mapping.
             block_table = seq_group_metadata.block_tables[seq_id]
             # Mask the [0, start_idx) tokens of the prompt with _PAD_SLOT_ID,
-            # where start_idx is max(0, prompt_len - sliding_window).
+            # where start_idx is max(0, seq_len - sliding_window).
             # For example, if the prompt len is 10, sliding window is 8, and
             # block size is 4, the first two tokens are masked and the slot
             # mapping will be [-1, -1, 2, 3, 4, 5, 6, 7, 0, 1].
             start_idx = 0
             if self.sliding_window is not None:
-                start_idx = max(0, prompt_len - self.sliding_window)
+                start_idx = max(0, seq_len - self.sliding_window)
 
-            for i in range(computed_len, prompt_len):
+            for i in range(computed_len, seq_len):
                 if i < start_idx:
                     slot_mapping.append(_PAD_SLOT_ID)
                     continue
@@ -151,19 +151,19 @@ def _prepare_prompt(
 
         attn_metadata = self.attn_backend.make_metadata(
             is_prompt=True,
-            prompt_lens=prompt_lens,
-            num_prefills=len(prompt_lens),
+            seq_lens=seq_lens,
+            seq_lens_tensor=None,
+            max_seq_len=None,
+            num_prefills=len(seq_lens),
             num_prefill_tokens=num_prompt_tokens,
             num_decode_tokens=0,
             prefill_metadata=None,
             decode_metadata=None,
-            max_context_len=None,
-            context_lens=None,
             block_tables=torch.tensor([]),
             slot_mapping=slot_mapping,
             kv_cache_dtype=self.kv_cache_dtype,
         )
-        return (input_tokens, input_positions, attn_metadata, prompt_lens,
+        return (input_tokens, input_positions, attn_metadata, seq_lens,
                 multi_modal_input)
 
     def _prepare_decode(
@@ -174,7 +174,7 @@ def _prepare_decode(
         input_tokens: List[int] = []
         input_positions: List[int] = []
         slot_mapping: List[int] = []
-        context_lens: List[int] = []
+        seq_lens: List[int] = []
         block_tables: List[List[int]] = []
 
         for seq_group_metadata in seq_group_metadata_list:
@@ -192,9 +192,9 @@ def _prepare_decode(
                 position = seq_len - 1
                 input_positions.append(position)
 
-                context_len = seq_len if self.sliding_window is None else min(
+                seq_len = seq_len if self.sliding_window is None else min(
                     seq_len, self.sliding_window)
-                context_lens.append(context_len)
+                seq_lens.append(seq_len)
 
                 block_table = seq_group_metadata.block_tables[seq_id]
                 block_number = block_table[position // self.block_size]
@@ -208,7 +208,7 @@ def _prepare_decode(
                     block_table = block_table[-sliding_window_blocks:]
                 block_tables.append(block_table)
 
-        max_context_len = max(context_lens)
+        max_seq_len = max(seq_lens)
 
         input_tokens = torch.tensor(input_tokens,
                                     dtype=torch.long,
@@ -219,9 +219,9 @@ def _prepare_decode(
         slot_mapping = torch.tensor(slot_mapping,
                                     dtype=torch.long,
                                     device=self.device)
-        context_lens = torch.tensor(context_lens,
-                                    dtype=torch.int,
-                                    device=self.device)
+        seq_lens_tensor = torch.tensor(seq_lens,
+                                       dtype=torch.int,
+                                       device=self.device)
 
         max_block_table_len = max(
             len(block_table) for block_table in block_tables)
@@ -236,14 +236,14 @@ def _prepare_decode(
         attn_metadata = self.attn_backend.make_metadata(
             is_prompt=False,
             slot_mapping=slot_mapping,
-            prompt_lens=None,
+            seq_lens=seq_lens,
+            seq_lens_tensor=seq_lens_tensor,
+            max_seq_len=max_seq_len,
             num_prefill_tokens=0,
             num_decode_tokens=len(input_tokens),
-            max_context_len=max_context_len,
             num_prefills=0,
             prefill_metadata=None,
             decode_metadata=None,
-            context_lens=context_lens,
             block_tables=block_tables,
             kv_cache_dtype=self.kv_cache_dtype,
         )
@@ -265,20 +265,20 @@ def prepare_input_tensors(
             is_prompt = seq_group_metadata_list[0].is_prompt
             # Prepare input tensors.
             if is_prompt:
-                (input_tokens, input_positions, attn_metadata, prompt_lens,
+                (input_tokens, input_positions, attn_metadata, seq_lens,
                  multi_modal_input
                  ) = self._prepare_prompt(seq_group_metadata_list)
             else:
                 (input_tokens, input_positions,
                  attn_metadata) = self._prepare_decode(seq_group_metadata_list)
-                prompt_lens = []
+                seq_lens = []
             sampling_metadata = SamplingMetadata.prepare(
                 seq_group_metadata_list,
-                prompt_lens,
-                # subquery_lens is not needed if chunked prefill is not
+                seq_lens,
+                # query_lens is not needed if chunked prefill is not
                 # supported. Since CPU worker doesn't support chunked prefill
-                # just use prompt_lens instead.
-                prompt_lens,
+                # just use seq_lens instead.
+                seq_lens,
                 self.device,
                 pin_memory=False)
             # Broadcast the metadata.
@@ -300,7 +300,7 @@ def prepare_input_tensors(
             sampling_metadata = SamplingMetadata(
                 seq_groups=None,
                 seq_data=None,
-                prompt_lens=None,
+                seq_lens=None,
                 selected_token_indices=selected_token_indices,
                 categorized_sample_indices=None,
                 generators=None,
diff --git a/vllm/worker/model_runner.py b/vllm/worker/model_runner.py
index 0704f5fec54d0..bbb1f5205af5e 100644
--- a/vllm/worker/model_runner.py
+++ b/vllm/worker/model_runner.py
@@ -42,8 +42,8 @@ class PreparePromptMetadata(NamedTuple):
     input_tokens: List[int]
     input_positions: List[int]
     attn_metadata: Optional[AttentionMetadataPerStage]
-    prompt_lens: List[int]
-    subquery_lens: List[int]
+    seq_lens: List[int]
+    query_lens: List[int]
     lora_index_mapping: List[int]
     lora_prompt_mapping: List[int]
     lora_requests: Set[LoRARequest]
@@ -56,8 +56,8 @@ def empty(cls):
             input_tokens=[],
             input_positions=[],
             attn_metadata=None,
-            prompt_lens=[],
-            subquery_lens=[],
+            seq_lens=[],
+            query_lens=[],
             lora_index_mapping=[],
             lora_prompt_mapping=[],
             lora_requests=set(),
@@ -134,9 +134,8 @@ def __init__(
         self.graph_memory_pool: Optional[Tuple[
             int, int]] = None  # Set during graph capture.
 
-        self.max_context_len_to_capture = (
-            self.model_config.max_context_len_to_capture
-            if self.model_config is not None else 0)
+        self.max_seq_len_to_capture = (self.model_config.max_seq_len_to_capture
+                                       if self.model_config is not None else 0)
 
         self.pin_memory = is_pin_memory_available()
         self.kv_cache_dtype = kv_cache_dtype
@@ -149,7 +148,7 @@ def __init__(
         self.model: torch.nn.Module  # Set after load_model
         self.block_size: int  # Set after initial profiling.
         # When using CUDA graph, the input block tables must be padded to
-        # max_context_len_to_capture. However, creating the block table in
+        # max_seq_len_to_capture. However, creating the block table in
         # Python can be expensive. To optimize this, we cache the block table
         # in numpy and only copy the actual input content at every iteration.
         # The shape of the cached block table will be
@@ -218,7 +217,7 @@ def set_block_size(self, block_size: int) -> None:
 
     def get_max_block_per_batch(self) -> int:
         block_size = self.block_size
-        return (self.max_context_len_to_capture + block_size - 1) // block_size
+        return (self.max_seq_len_to_capture + block_size - 1) // block_size
 
     def _prepare_prompt(
         self,
@@ -231,9 +230,9 @@ def _prepare_prompt(
         lora_prompt_mapping: List[int] = []
         lora_requests: Set[LoRARequest] = set()
 
-        prompt_lens: List[int] = []
+        seq_lens: List[int] = []
         context_lens: List[int] = []
-        subquery_lens: List[int] = []
+        query_lens: List[int] = []
         prefix_block_tables: List[List[int]] = []
         multi_modal_input_list: List[torch.Tensor] = []
 
@@ -257,21 +256,19 @@ def _prepare_prompt(
 
             token_chunk_size = seq_group_metadata.token_chunk_size
             seq_data = seq_group_metadata.seq_data[seq_id]
-            computed_len = seq_data.get_num_computed_tokens()
+            context_len = seq_data.get_num_computed_tokens()
             # We should use get_len here because in case of preemption
             # it contains output tokens.
-            prefill_end = min(seq_data.get_len(),
-                              computed_len + token_chunk_size)
-            prompt_tokens = seq_data.get_token_ids()[computed_len:prefill_end]
-            prompt_len = prefill_end
-            prompt_lens.append(prompt_len)
+            seq_len = min(seq_data.get_len(), context_len + token_chunk_size)
+            prompt_tokens = seq_data.get_token_ids()[context_len:seq_len]
+            seq_lens.append(seq_len)
 
             # NOTE: This only works for oooooooxxx style attention.
             if computed_block_nums is not None and len(
                     computed_block_nums) > 0 and self.sliding_window is None:
                 # Prefix is not supported with sliding_window
-                computed_len = len(computed_block_nums) * self.block_size
-                prompt_tokens = prompt_tokens[computed_len:]
+                context_len = len(computed_block_nums) * self.block_size
+                prompt_tokens = prompt_tokens[context_len:]
                 prefix_block_tables.append(computed_block_nums)
             elif self.scheduler_config.chunked_prefill_enabled:
                 if seq_group_metadata.block_tables is not None:
@@ -285,25 +282,25 @@ def _prepare_prompt(
                 prefix_block_tables.append([])
                 # Right now, prefill start is always 0. However, this
                 # assumption can be changed once chunked prefill is introduced.
-                assert computed_len == 0
+                assert context_len == 0
 
             # actual prompt lens
-            context_lens.append(computed_len)
-            subquery_lens.append(prompt_len - computed_len)
+            context_lens.append(context_len)
+            query_lens.append(seq_len - context_len)
 
             input_tokens.extend(prompt_tokens)
             # NOTE(woosuk): Here we assume that the first token in the prompt
             # is always the first token in the sequence.
-            input_positions.extend(list(range(computed_len, prefill_end)))
+            input_positions.extend(list(range(context_len, seq_len)))
             lora_id = seq_group_metadata.lora_int_id
 
             if lora_id > 0:
                 lora_requests.add(seq_group_metadata.lora_request)
 
-            lora_index_mapping += [lora_id] * (prompt_len - computed_len)
+            lora_index_mapping += [lora_id] * (seq_len - context_len)
             lora_prompt_mapping.extend(
                 [lora_id] *
-                (prompt_len - computed_len
+                (seq_len - context_len
                  if seq_group_metadata.sampling_params.prompt_logprobs else 1))
 
             if seq_group_metadata.multi_modal_data:
@@ -313,24 +310,24 @@ def _prepare_prompt(
             if seq_group_metadata.block_tables is None:
                 # During memory profiling, the block tables are not initialized
                 # yet. In this case, we just use a dummy slot mapping.
-                slot_mapping.extend([_PAD_SLOT_ID] * prompt_len)
+                slot_mapping.extend([_PAD_SLOT_ID] * seq_len)
                 continue
 
             # Compute the slot mapping.
             block_table = seq_group_metadata.block_tables[seq_id]
             # Mask the [0, start_idx) tokens of the prompt with _PAD_SLOT_ID,
-            # where start_idx is max(0, prompt_len - sliding_window).
+            # where start_idx is max(0, seq_len - sliding_window).
             # For example, if the prompt len is 10, sliding window is 8, and
             # block size is 4, the first two tokens are masked and the slot
             # mapping will be [-1, -1, 2, 3, 4, 5, 6, 7, 0, 1].
             start_idx = 0
             if self.sliding_window is not None:
-                assert computed_len == 0, (
+                assert context_len == 0, (
                     "Prefix caching is currently not supported with "
                     "sliding window attention")
-                start_idx = max(0, prompt_len - self.sliding_window)
+                start_idx = max(0, seq_len - self.sliding_window)
 
-            for i in range(computed_len, prefill_end):
+            for i in range(context_len, seq_len):
                 if i < start_idx:
                     slot_mapping.append(_PAD_SLOT_ID)
                     continue
@@ -340,9 +337,9 @@ def _prepare_prompt(
                 slot = block_number * self.block_size + block_offset
                 slot_mapping.append(slot)
 
-        max_subquery_len = max(subquery_lens)
-        max_prompt_len = max(prompt_lens)
-        assert max_subquery_len > 0
+        max_query_len = max(query_lens)
+        max_seq_len = max(seq_lens)
+        assert max_query_len > 0
 
         context_lens_tensor = torch.tensor(context_lens,
                                            dtype=torch.int,
@@ -369,40 +366,39 @@ def _prepare_prompt(
 
         # Query length can be shorter than key (i.e., prompt) when prefill
         # is chunked or prefix cached.
-        subquery_lens_tensor = torch.tensor(subquery_lens,
-                                            dtype=torch.long,
-                                            device=self.device)
-        subquery_start_loc = torch.zeros(subquery_lens_tensor.shape[0] + 1,
+        query_lens_tensor = torch.tensor(query_lens,
+                                         dtype=torch.long,
+                                         device=self.device)
+        subquery_start_loc = torch.zeros(query_lens_tensor.shape[0] + 1,
                                          dtype=torch.int32,
                                          device=self.device)
 
-        prompt_lens_tensor = torch.tensor(prompt_lens,
-                                          dtype=torch.long,
-                                          device=self.device)
-        seq_start_loc = torch.zeros(prompt_lens_tensor.shape[0] + 1,
+        seq_lens_tensor = torch.tensor(seq_lens,
+                                       dtype=torch.int,
+                                       device=self.device)
+        seq_start_loc = torch.zeros(seq_lens_tensor.shape[0] + 1,
                                     dtype=torch.int32,
                                     device=self.device)
 
-        torch.cumsum(subquery_lens_tensor,
+        torch.cumsum(query_lens_tensor,
                      dim=0,
                      dtype=subquery_start_loc.dtype,
                      out=subquery_start_loc[1:])
 
-        torch.cumsum(prompt_lens_tensor,
+        torch.cumsum(seq_lens_tensor,
                      dim=0,
                      dtype=seq_start_loc.dtype,
                      out=seq_start_loc[1:])
 
         attn_metadata = self.attn_backend.make_metadata(
             is_prompt=True,
-            prompt_lens=prompt_lens,
-            prompt_lens_tensor=prompt_lens_tensor,
-            max_subquery_len=max_subquery_len,
-            max_context_len=None,
-            max_prompt_len=max_prompt_len,
+            seq_lens=seq_lens,
+            seq_lens_tensor=seq_lens_tensor,
+            max_query_len=max_query_len,
+            max_seq_len=max_seq_len,
             subquery_start_loc=subquery_start_loc,
             seq_start_loc=seq_start_loc,
-            context_lens=context_lens_tensor,
+            context_lens_tensor=context_lens_tensor,
             block_tables=block_tables,
             use_cuda_graph=False,
         )
@@ -411,8 +407,8 @@ def _prepare_prompt(
             input_tokens=input_tokens,
             input_positions=input_positions,
             attn_metadata=attn_metadata,
-            prompt_lens=prompt_lens,
-            subquery_lens=subquery_lens,
+            seq_lens=seq_lens,
+            query_lens=query_lens,
             lora_index_mapping=lora_index_mapping,
             lora_prompt_mapping=lora_prompt_mapping,
             lora_requests=lora_requests,
@@ -427,7 +423,7 @@ def _prepare_decode(
         input_tokens: List[int] = []
         input_positions: List[int] = []
         slot_mapping: List[int] = []
-        context_lens: List[int] = []
+        seq_lens: List[int] = []
         block_tables: List[List[int]] = []
         lora_index_mapping: List[int] = []
         lora_prompt_mapping: List[int] = []
@@ -455,9 +451,9 @@ def _prepare_decode(
                 position = seq_len - 1
                 input_positions.append(position)
 
-                context_len = seq_len if self.sliding_window is None else min(
+                seq_len = seq_len if self.sliding_window is None else min(
                     seq_len, self.sliding_window)
-                context_lens.append(context_len)
+                seq_lens.append(seq_len)
 
                 block_table = seq_group_metadata.block_tables[seq_id]
                 block_number = block_table[position // self.block_size]
@@ -477,11 +473,10 @@ def _prepare_decode(
         # See `capture_model` API for more details.
         # For decoding requests, batch_size == input_tokens.
         batch_size = len(input_tokens)
-        max_context_len = max(context_lens)
-        use_captured_graph = (
-            not self.model_config.enforce_eager
-            and batch_size <= _BATCH_SIZES_TO_CAPTURE[-1]
-            and max_context_len <= self.max_context_len_to_capture)
+        max_seq_len = max(seq_lens)
+        use_captured_graph = (not self.model_config.enforce_eager
+                              and batch_size <= _BATCH_SIZES_TO_CAPTURE[-1]
+                              and max_seq_len <= self.max_seq_len_to_capture)
         if use_captured_graph:
             graph_batch_size = _get_graph_batch_size(batch_size)
             assert graph_batch_size >= batch_size
@@ -489,21 +484,21 @@ def _prepare_decode(
                 input_tokens.append(0)
                 input_positions.append(0)
                 slot_mapping.append(_PAD_SLOT_ID)
-                context_lens.append(1)
+                seq_lens.append(1)
                 block_tables.append([])
                 lora_index_mapping.append(0)
             batch_size = graph_batch_size
 
-        context_lens_tensor = torch.tensor(context_lens,
-                                           dtype=torch.int,
-                                           device=self.device)
+        seq_lens_tensor = torch.tensor(seq_lens,
+                                       dtype=torch.int,
+                                       device=self.device)
 
         if use_captured_graph:
             # When using cuda-graph all these tensors should be
             # padded.
-            assert context_lens_tensor.shape[0] == len(input_tokens)
-            assert context_lens_tensor.shape[0] == len(input_positions)
-            assert context_lens_tensor.shape[0] == len(slot_mapping)
+            assert seq_lens_tensor.shape[0] == len(input_tokens)
+            assert seq_lens_tensor.shape[0] == len(input_positions)
+            assert seq_lens_tensor.shape[0] == len(slot_mapping)
 
             # The shape of graph_block_tables is
             # [max batch size, max context len // block size].
@@ -525,14 +520,13 @@ def _prepare_decode(
 
         attn_metadata = self.attn_backend.make_metadata(
             is_prompt=False,
-            prompt_lens=None,
-            prompt_lens_tensor=None,
-            max_subquery_len=None,
-            max_context_len=max_context_len,
-            max_prompt_len=None,
+            seq_lens=None,
+            seq_lens_tensor=seq_lens_tensor,
+            max_query_len=None,
+            max_seq_len=max_seq_len,
             subquery_start_loc=None,
             seq_start_loc=None,
-            context_lens=context_lens_tensor,
+            context_lens_tensor=None,
             block_tables=block_tables,
             use_cuda_graph=use_captured_graph,
         )
@@ -565,8 +559,8 @@ def prepare_input_tensors(
                 input_tokens,
                 input_positions,
                 prefill_attn_metadata,
-                prompt_lens,
-                subquery_lens,
+                seq_lens,
+                query_lens,
                 lora_index_mapping,
                 lora_prompt_mapping,
                 lora_requests,
@@ -583,13 +577,13 @@ def prepare_input_tensors(
                 decode_slot_mapping,
             ) = self._prepare_decode(decode_reqs)
             sampling_metadata = SamplingMetadata.prepare(
-                seq_group_metadata_list, prompt_lens, subquery_lens,
-                self.device, self.pin_memory)
+                seq_group_metadata_list, seq_lens, query_lens, self.device,
+                self.pin_memory)
 
             if not self.scheduler_config.chunked_prefill_enabled:
                 assert (len(prefill_reqs) and len(decode_reqs)) == 0
 
-            num_prefills = len(prompt_lens)
+            num_prefills = len(seq_lens)
             num_prefill_tokens = len(input_tokens)
             num_decode_tokens = len(decode_input_tokens)
 
@@ -886,7 +880,7 @@ def capture_model(self, kv_caches: List[torch.Tensor]) -> None:
         input_positions = torch.zeros(max_batch_size, dtype=torch.long).cuda()
         slot_mapping = torch.empty(max_batch_size, dtype=torch.long).cuda()
         slot_mapping.fill_(_PAD_SLOT_ID)
-        context_lens = torch.ones(max_batch_size, dtype=torch.int32).cuda()
+        seq_lens = torch.ones(max_batch_size, dtype=torch.int32).cuda()
         block_tables = torch.from_numpy(self.graph_block_tables).cuda()
 
         graph_batch_size = _get_graph_batch_size(
@@ -908,14 +902,13 @@ def capture_model(self, kv_caches: List[torch.Tensor]) -> None:
                 # Create dummy attn_metadata.
                 decode_metadata = self.attn_backend.make_metadata(
                     is_prompt=False,
-                    prompt_lens=None,
-                    prompt_lens_tensor=None,
-                    max_subquery_len=None,
-                    max_context_len=self.max_context_len_to_capture,
-                    max_prompt_len=None,
+                    seq_lens=None,
+                    seq_lens_tensor=seq_lens[:batch_size],
+                    max_query_len=None,
+                    max_seq_len=self.max_seq_len_to_capture,
                     subquery_start_loc=None,
                     seq_start_loc=None,
-                    context_lens=context_lens[:batch_size],
+                    context_lens_tensor=None,
                     block_tables=block_tables[:batch_size],
                     use_cuda_graph=True,
                 )
@@ -1025,7 +1018,7 @@ def capture(
             "positions": positions,
             "kv_caches": kv_caches,
             "slot_mapping": attn_metadata.slot_mapping,
-            "context_lens": attn_metadata.decode_metadata.context_lens,
+            "seq_lens_tensor": attn_metadata.decode_metadata.seq_lens_tensor,
             "block_tables": attn_metadata.decode_metadata.block_tables,
         }
         self.output_buffers = {"hidden_states": hidden_states}
@@ -1047,8 +1040,8 @@ def forward(
         self.input_buffers["positions"].copy_(positions, non_blocking=True)
         self.input_buffers["slot_mapping"].copy_(attn_metadata.slot_mapping,
                                                  non_blocking=True)
-        self.input_buffers["context_lens"].copy_(
-            attn_metadata.decode_metadata.context_lens, non_blocking=True)
+        self.input_buffers["seq_lens_tensor"].copy_(
+            attn_metadata.decode_metadata.seq_lens_tensor, non_blocking=True)
         self.input_buffers["block_tables"].copy_(
             attn_metadata.decode_metadata.block_tables, non_blocking=True)
         # Run the graph.
diff --git a/vllm/worker/neuron_model_runner.py b/vllm/worker/neuron_model_runner.py
index a974e85c22f45..a336be04e124f 100644
--- a/vllm/worker/neuron_model_runner.py
+++ b/vllm/worker/neuron_model_runner.py
@@ -52,7 +52,7 @@ def _prepare_prompt(
         input_positions: List[List[int]] = []
         input_block_ids: List[int] = []
 
-        prompt_lens: List[int] = []
+        seq_lens: List[int] = []
         for seq_group_metadata in seq_group_metadata_list:
             assert seq_group_metadata.is_prompt
             seq_ids = list(seq_group_metadata.seq_data.keys())
@@ -61,26 +61,26 @@ def _prepare_prompt(
 
             seq_data = seq_group_metadata.seq_data[seq_id]
             prompt_tokens = seq_data.get_token_ids()
-            prompt_len = len(prompt_tokens)
-            prompt_lens.append(prompt_len)
+            seq_len = len(prompt_tokens)
+            seq_lens.append(seq_len)
 
             input_tokens.append(prompt_tokens)
-            input_positions.append(list(range(prompt_len)))
+            input_positions.append(list(range(seq_len)))
 
             assert seq_group_metadata.block_tables is not None
             block_table = seq_group_metadata.block_tables[seq_id]
             assert len(block_table) == 1
             input_block_ids.append(block_table[0])
 
-        max_prompt_len = max(prompt_lens)
-        assert max_prompt_len > 0
+        max_seq_len = max(seq_lens)
+        assert max_seq_len > 0
         input_tokens = make_tensor_with_pad(input_tokens,
-                                            max_prompt_len,
+                                            max_seq_len,
                                             pad=0,
                                             dtype=torch.long,
                                             device=self.device)
         input_positions = make_tensor_with_pad(input_positions,
-                                               max_prompt_len,
+                                               max_seq_len,
                                                pad=0,
                                                dtype=torch.long,
                                                device=self.device)
@@ -88,7 +88,7 @@ def _prepare_prompt(
                                        dtype=torch.long,
                                        device=self.device)
 
-        return input_tokens, input_positions, input_block_ids, prompt_lens
+        return input_tokens, input_positions, input_block_ids, seq_lens
 
     def _prepare_decode(
         self,
@@ -149,18 +149,18 @@ def prepare_input_tensors(
         # Prepare input tensors.
         if is_prompt:
             (input_tokens, input_positions, input_block_ids,
-             prompt_lens) = self._prepare_prompt(seq_group_metadata_list)
+             seq_lens) = self._prepare_prompt(seq_group_metadata_list)
         else:
             (input_tokens, input_positions,
              input_block_ids) = self._prepare_decode(seq_group_metadata_list)
-            prompt_lens = []
+            seq_lens = []
         sampling_metadata = SamplingMetadata.prepare(
             seq_group_metadata_list,
-            prompt_lens,
-            # subquery_lens is not needed if chunked prefill is not
+            seq_lens,
+            # query_lens is not needed if chunked prefill is not
             # supported. Since neuron worker doesn't support chunked prefill
-            # just use prompt_lens instead.
-            prompt_lens,
+            # just use seq_lens instead.
+            seq_lens,
             self.device,
             self.pin_memory)
 

From 7e65477e5e737927c2f07c913ede0763134504a3 Mon Sep 17 00:00:00 2001
From: Michael Goin <michael@neuralmagic.com>
Date: Fri, 3 May 2024 13:32:21 -0400
Subject: [PATCH 47/65] [Bugfix] Allow "None" or "" to be passed to CLI for
 string args that default to None (#4586)

---
 vllm/engine/arg_utils.py            | 32 +++++++++++++++++------------
 vllm/entrypoints/openai/cli_args.py | 27 +++++++++++++-----------
 2 files changed, 34 insertions(+), 25 deletions(-)

diff --git a/vllm/engine/arg_utils.py b/vllm/engine/arg_utils.py
index 1c8e1079bed58..78cd07575f17d 100644
--- a/vllm/engine/arg_utils.py
+++ b/vllm/engine/arg_utils.py
@@ -11,6 +11,12 @@
 from vllm.utils import str_to_int_tuple
 
 
+def nullable_str(val: str):
+    # argparse `type=` hook: "" and the literal "None" both mean "unset".
+    is_unset = not val or val == "None"
+    return None if is_unset else val
+
+
 @dataclass
 class EngineArgs:
     """Arguments for vLLM engine."""
@@ -96,7 +102,7 @@ def add_cli_args(
             help='Name or path of the huggingface model to use.')
         parser.add_argument(
             '--tokenizer',
-            type=str,
+            type=nullable_str,
             default=EngineArgs.tokenizer,
             help='Name or path of the huggingface tokenizer to use.')
         parser.add_argument(
@@ -105,21 +111,21 @@ def add_cli_args(
             help='Skip initialization of tokenizer and detokenizer')
         parser.add_argument(
             '--revision',
-            type=str,
+            type=nullable_str,
             default=None,
             help='The specific model version to use. It can be a branch '
             'name, a tag name, or a commit id. If unspecified, will use '
             'the default version.')
         parser.add_argument(
             '--code-revision',
-            type=str,
+            type=nullable_str,
             default=None,
             help='The specific revision to use for the model code on '
             'Hugging Face Hub. It can be a branch name, a tag name, or a '
             'commit id. If unspecified, will use the default version.')
         parser.add_argument(
             '--tokenizer-revision',
-            type=str,
+            type=nullable_str,
             default=None,
             help='The specific tokenizer version to use. It can be a branch '
             'name, a tag name, or a commit id. If unspecified, will use '
@@ -136,7 +142,7 @@ def add_cli_args(
                             action='store_true',
                             help='Trust remote code from huggingface.')
         parser.add_argument('--download-dir',
-                            type=str,
+                            type=nullable_str,
                             default=EngineArgs.download_dir,
                             help='Directory to download and load the weights, '
                             'default to the default cache dir of '
@@ -187,7 +193,7 @@ def add_cli_args(
             'supported for common inference criteria.')
         parser.add_argument(
             '--quantization-param-path',
-            type=str,
+            type=nullable_str,
             default=None,
             help='Path to the JSON file containing the KV cache '
             'scaling factors. This should generally be supplied, when '
@@ -304,7 +310,7 @@ def add_cli_args(
         # Quantization settings.
         parser.add_argument('--quantization',
                             '-q',
-                            type=str,
+                            type=nullable_str,
                             choices=[*QUANTIZATION_METHODS, None],
                             default=EngineArgs.quantization,
                             help='Method used to quantize the weights. If '
@@ -349,7 +355,7 @@ def add_cli_args(
                             'asynchronous tokenization. Ignored '
                             'if tokenizer_pool_size is 0.')
         parser.add_argument('--tokenizer-pool-extra-config',
-                            type=str,
+                            type=nullable_str,
                             default=EngineArgs.tokenizer_pool_extra_config,
                             help='Extra config for tokenizer pool. '
                             'This should be a JSON string that will be '
@@ -404,7 +410,7 @@ def add_cli_args(
         # Related to Vision-language models such as llava
         parser.add_argument(
             '--image-input-type',
-            type=str,
+            type=nullable_str,
             default=None,
             choices=[
                 t.name.lower() for t in VisionLanguageConfig.ImageInputType
@@ -417,7 +423,7 @@ def add_cli_args(
                             help=('Input id for image token.'))
         parser.add_argument(
             '--image-input-shape',
-            type=str,
+            type=nullable_str,
             default=None,
             help=('The biggest image input shape (worst for memory footprint) '
                   'given an input type. Only used for vLLM\'s profile_run.'))
@@ -440,7 +446,7 @@ def add_cli_args(
 
         parser.add_argument(
             '--speculative-model',
-            type=str,
+            type=nullable_str,
             default=EngineArgs.speculative_model,
             help=
             'The name of the draft model to be used in speculative decoding.')
@@ -454,7 +460,7 @@ def add_cli_args(
 
         parser.add_argument(
             '--speculative-max-model-len',
-            type=str,
+            type=int,
             default=EngineArgs.speculative_max_model_len,
             help='The maximum sequence length supported by the '
             'draft model. Sequences over this length will skip '
@@ -475,7 +481,7 @@ def add_cli_args(
             'decoding.')
 
         parser.add_argument('--model-loader-extra-config',
-                            type=str,
+                            type=nullable_str,
                             default=EngineArgs.model_loader_extra_config,
                             help='Extra config for model loader. '
                             'This will be passed to the model loader '
diff --git a/vllm/entrypoints/openai/cli_args.py b/vllm/entrypoints/openai/cli_args.py
index 16c5b6c08d37f..2b57ab26bfd31 100644
--- a/vllm/entrypoints/openai/cli_args.py
+++ b/vllm/entrypoints/openai/cli_args.py
@@ -8,7 +8,7 @@
 import json
 import ssl
 
-from vllm.engine.arg_utils import AsyncEngineArgs
+from vllm.engine.arg_utils import AsyncEngineArgs, nullable_str
 from vllm.entrypoints.openai.serving_engine import LoRAModulePath
 
 
@@ -25,7 +25,10 @@ def __call__(self, parser, namespace, values, option_string=None):
 def make_arg_parser():
     parser = argparse.ArgumentParser(
         description="vLLM OpenAI-Compatible RESTful API server.")
-    parser.add_argument("--host", type=str, default=None, help="host name")
+    parser.add_argument("--host",
+                        type=nullable_str,
+                        default=None,
+                        help="host name")
     parser.add_argument("--port", type=int, default=8000, help="port number")
     parser.add_argument(
         "--uvicorn-log-level",
@@ -49,13 +52,13 @@ def make_arg_parser():
                         default=["*"],
                         help="allowed headers")
     parser.add_argument("--api-key",
-                        type=str,
+                        type=nullable_str,
                         default=None,
                         help="If provided, the server will require this key "
                         "to be presented in the header.")
     parser.add_argument("--served-model-name",
                         nargs="+",
-                        type=str,
+                        type=nullable_str,
                         default=None,
                         help="The model name(s) used in the API. If multiple "
                         "names are provided, the server will respond to any "
@@ -65,33 +68,33 @@ def make_arg_parser():
                         "same as the `--model` argument.")
     parser.add_argument(
         "--lora-modules",
-        type=str,
+        type=nullable_str,
         default=None,
         nargs='+',
         action=LoRAParserAction,
         help="LoRA module configurations in the format name=path. "
         "Multiple modules can be specified.")
     parser.add_argument("--chat-template",
-                        type=str,
+                        type=nullable_str,
                         default=None,
                         help="The file path to the chat template, "
                         "or the template in single-line form "
                         "for the specified model")
     parser.add_argument("--response-role",
-                        type=str,
+                        type=nullable_str,
                         default="assistant",
                         help="The role name to return if "
                         "`request.add_generation_prompt=true`.")
     parser.add_argument("--ssl-keyfile",
-                        type=str,
+                        type=nullable_str,
                         default=None,
                         help="The file path to the SSL key file")
     parser.add_argument("--ssl-certfile",
-                        type=str,
+                        type=nullable_str,
                         default=None,
                         help="The file path to the SSL cert file")
     parser.add_argument("--ssl-ca-certs",
-                        type=str,
+                        type=nullable_str,
                         default=None,
                         help="The CA certificates file")
     parser.add_argument(
@@ -102,12 +105,12 @@ def make_arg_parser():
     )
     parser.add_argument(
         "--root-path",
-        type=str,
+        type=nullable_str,
         default=None,
         help="FastAPI root_path when app is behind a path based routing proxy")
     parser.add_argument(
         "--middleware",
-        type=str,
+        type=nullable_str,
         action="append",
         default=[],
         help="Additional ASGI middleware to apply to the app. "

From f8e7adda21810104382bdf3febe3ea02c72f7348 Mon Sep 17 00:00:00 2001
From: Sebastian Schoennenbeck <schoennenbeck@gmail.com>
Date: Fri, 3 May 2024 20:04:14 +0200
Subject: [PATCH 48/65] Fix/async chat serving (#2727)

---
 tests/async_engine/test_chat_template.py      | 25 +++++++------
 tests/entrypoints/openai/test_serving_chat.py | 37 +++++++++++++++++++
 tests/entrypoints/test_openai_server.py       |  2 +-
 vllm/entrypoints/openai/serving_chat.py       | 12 ++++--
 vllm/entrypoints/openai/serving_engine.py     | 18 ++++++---
 5 files changed, 73 insertions(+), 21 deletions(-)
 create mode 100644 tests/entrypoints/openai/test_serving_chat.py

diff --git a/tests/async_engine/test_chat_template.py b/tests/async_engine/test_chat_template.py
index 8d6ad6706fb0e..64bcba67c3437 100644
--- a/tests/async_engine/test_chat_template.py
+++ b/tests/async_engine/test_chat_template.py
@@ -60,12 +60,13 @@ class MockServingChat:
     tokenizer: MockTokenizer
 
 
-def test_load_chat_template():
+@pytest.mark.asyncio
+async def test_load_chat_template():
     # Testing chatml template
     tokenizer = MockTokenizer()
     mock_serving_chat = MockServingChat(tokenizer)
-    OpenAIServingChat._load_chat_template(mock_serving_chat,
-                                          chat_template=chatml_jinja_path)
+    await OpenAIServingChat._load_chat_template(
+        mock_serving_chat, chat_template=chatml_jinja_path)
 
     template_content = tokenizer.chat_template
 
@@ -76,7 +77,8 @@ def test_load_chat_template():
 {% if add_generation_prompt and messages[-1]['role'] != 'assistant' %}{{ '<|im_start|>assistant\\n' }}{% endif %}"""  # noqa: E501
 
 
-def test_no_load_chat_template_filelike():
+@pytest.mark.asyncio
+async def test_no_load_chat_template_filelike():
     # Testing chatml template
     template = "../../examples/does_not_exist"
     tokenizer = MockTokenizer()
@@ -84,18 +86,19 @@ def test_no_load_chat_template_filelike():
     mock_serving_chat = MockServingChat(tokenizer)
 
     with pytest.raises(ValueError, match="looks like a file path"):
-        OpenAIServingChat._load_chat_template(mock_serving_chat,
-                                              chat_template=template)
+        await OpenAIServingChat._load_chat_template(mock_serving_chat,
+                                                    chat_template=template)
 
 
-def test_no_load_chat_template_literallike():
+@pytest.mark.asyncio
+async def test_no_load_chat_template_literallike():
     # Testing chatml template
     template = "{{ messages }}"
     tokenizer = MockTokenizer()
 
     mock_serving_chat = MockServingChat(tokenizer)
-    OpenAIServingChat._load_chat_template(mock_serving_chat,
-                                          chat_template=template)
+    await OpenAIServingChat._load_chat_template(mock_serving_chat,
+                                                chat_template=template)
     template_content = tokenizer.chat_template
 
     assert template_content == template
@@ -110,8 +113,8 @@ async def test_get_gen_prompt(model, template, add_generation_prompt,
     # Initialize the tokenizer
     tokenizer = get_tokenizer(tokenizer_name=model)
     mock_serving_chat = MockServingChat(tokenizer)
-    OpenAIServingChat._load_chat_template(mock_serving_chat,
-                                          chat_template=template)
+    await OpenAIServingChat._load_chat_template(mock_serving_chat,
+                                                chat_template=template)
 
     # Create a mock request object using keyword arguments
     mock_request = ChatCompletionRequest(
diff --git a/tests/entrypoints/openai/test_serving_chat.py b/tests/entrypoints/openai/test_serving_chat.py
new file mode 100644
index 0000000000000..269b0823fec05
--- /dev/null
+++ b/tests/entrypoints/openai/test_serving_chat.py
@@ -0,0 +1,37 @@
+import asyncio
+from dataclasses import dataclass
+
+from vllm.entrypoints.openai.serving_chat import OpenAIServingChat
+
+MODEL_NAME = "openai-community/gpt2"
+CHAT_TEMPLATE = "Dummy chat template for testing {}"
+
+
+@dataclass
+class MockModelConfig:  # class-level constants only; no dataclass fields
+    max_model_len = 100
+    tokenizer = MODEL_NAME
+    tokenizer_mode = "auto"
+    tokenizer_revision = None
+    trust_remote_code = False
+
+
+@dataclass
+class MockEngine:  # engine stub: only get_model_config() is provided
+
+    async def get_model_config(self):
+        return MockModelConfig  # the class object, not an instance
+
+
+async def _async_serving_chat_init():
+    # Construct from inside a running event loop (via asyncio.run below).
+    return OpenAIServingChat(MockEngine(),
+                             served_model_names=[MODEL_NAME],
+                             response_role="assistant",
+                             chat_template=CHAT_TEMPLATE)
+
+
+def test_async_serving_chat_init():
+    tokenizer = asyncio.run(_async_serving_chat_init()).tokenizer
+    assert tokenizer is not None  # a tokenizer was loaded during init
+    assert tokenizer.chat_template == CHAT_TEMPLATE
diff --git a/tests/entrypoints/test_openai_server.py b/tests/entrypoints/test_openai_server.py
index 1323dba469117..e53e64a0c1ff8 100644
--- a/tests/entrypoints/test_openai_server.py
+++ b/tests/entrypoints/test_openai_server.py
@@ -150,7 +150,7 @@ def server(zephyr_lora_files):
     ray.shutdown()
 
 
-@pytest.fixture(scope="session")
+@pytest.fixture(scope="module")
 def client():
     client = openai.AsyncOpenAI(
         base_url="http://localhost:8000/v1",
diff --git a/vllm/entrypoints/openai/serving_chat.py b/vllm/entrypoints/openai/serving_chat.py
index 599f99e56a726..c8f4a6b315db0 100644
--- a/vllm/entrypoints/openai/serving_chat.py
+++ b/vllm/entrypoints/openai/serving_chat.py
@@ -1,3 +1,4 @@
+import asyncio
 import codecs
 import time
 from typing import (AsyncGenerator, AsyncIterator, Awaitable, Iterable, List,
@@ -40,9 +41,11 @@ def __init__(self,
                  chat_template: Optional[str] = None):
         super().__init__(engine=engine,
                          served_model_names=served_model_names,
-                         lora_modules=lora_modules)
+                         lora_modules=lora_modules,
+                         await_post_init=self._load_chat_template(
+                             chat_template=chat_template))
+
         self.response_role = response_role
-        self._load_chat_template(chat_template)
 
     def _parse_chat_message_content(
         self,
@@ -356,7 +359,10 @@ async def chat_completion_full_generator(
 
         return response
 
-    def _load_chat_template(self, chat_template: Optional[str]):
+    async def _load_chat_template(self, chat_template: Optional[str]):
+        while self.tokenizer is None:
+            # Give the parent class time to load the tokenizer
+            await asyncio.sleep(0.1)
         tokenizer = self.tokenizer
 
         if chat_template is not None:
diff --git a/vllm/entrypoints/openai/serving_engine.py b/vllm/entrypoints/openai/serving_engine.py
index 3d5ed328b9d19..21baea2e5e7f6 100644
--- a/vllm/entrypoints/openai/serving_engine.py
+++ b/vllm/entrypoints/openai/serving_engine.py
@@ -2,7 +2,7 @@
 import json
 from dataclasses import dataclass
 from http import HTTPStatus
-from typing import Dict, List, Optional, Tuple, Union
+from typing import Any, Awaitable, Dict, List, Optional, Tuple, Union
 
 from pydantic import Field
 from transformers import PreTrainedTokenizer, PreTrainedTokenizerFast
@@ -29,8 +29,11 @@ class LoRAModulePath:
 
 class OpenAIServing:
 
-    def __init__(self, engine: AsyncLLMEngine, served_model_names: List[str],
-                 lora_modules: Optional[List[LoRAModulePath]]):
+    def __init__(self,
+                 engine: AsyncLLMEngine,
+                 served_model_names: List[str],
+                 lora_modules: Optional[List[LoRAModulePath]],
+                 await_post_init: Optional[Awaitable[Any]] = None):
         self.engine = engine
         self.served_model_names = served_model_names
         if lora_modules is None:
@@ -56,12 +59,12 @@ def __init__(self, engine: AsyncLLMEngine, served_model_names: List[str],
         if event_loop is not None and event_loop.is_running():
             # If the current is instanced by Ray Serve,
             # there is already a running event loop
-            event_loop.create_task(self._post_init())
+            event_loop.create_task(self._post_init(await_post_init))
         else:
             # When using single vLLM without engine_use_ray
-            asyncio.run(self._post_init())
+            asyncio.run(self._post_init(await_post_init))
 
-    async def _post_init(self):
+    async def _post_init(self, await_post_init):
         engine_model_config = await self.engine.get_model_config()
         self.max_model_len = engine_model_config.max_model_len
 
@@ -73,6 +76,9 @@ async def _post_init(self):
             trust_remote_code=engine_model_config.trust_remote_code,
             truncation_side="left")
 
+        if await_post_init is not None:
+            await await_post_init
+
     async def show_available_models(self) -> ModelList:
         """Show available models. Right now we only have one model."""
         model_cards = [

From 43c413ec570e94869ee7b7d275de720219a34357 Mon Sep 17 00:00:00 2001
From: Lily Liu <lilyliupku@gmail.com>
Date: Fri, 3 May 2024 15:51:27 -0700
Subject: [PATCH 49/65] [Kernel] Use flashinfer for decoding (#4353)

Co-authored-by: LiuXiaoxuanPKU <llilyliupku@gmail.com>
---
 csrc/cache.h                                  |   8 +
 csrc/cache_kernels.cu                         |  80 +++++++
 csrc/pybind.cpp                               |   4 +
 .../test_basic_correctness.py                 |  12 +-
 .../test_basic_distributed_correctness.py     |  14 +-
 tests/kernels/conftest.py                     |   8 +-
 tests/kernels/test_cache.py                   |  77 ++++++
 vllm/_custom_ops.py                           |  12 +
 vllm/attention/backends/abstract.py           |  13 +-
 vllm/attention/backends/flashinfer.py         | 220 ++++++++++++++++++
 vllm/attention/selector.py                    |   6 +
 vllm/config.py                                |   5 +
 vllm/sequence.py                              |   4 +-
 vllm/utils.py                                 |  67 ++++--
 vllm/worker/model_runner.py                   | 123 +++++++---
 15 files changed, 600 insertions(+), 53 deletions(-)
 create mode 100644 vllm/attention/backends/flashinfer.py

diff --git a/csrc/cache.h b/csrc/cache.h
index 718a5f6cfd7f7..4c142ce17f1b9 100644
--- a/csrc/cache.h
+++ b/csrc/cache.h
@@ -24,6 +24,14 @@ void reshape_and_cache(
   const std::string& kv_cache_dtype,
   const float kv_scale);
 
+void reshape_and_cache_flash(
+  torch::Tensor& key,           // [num_tokens, num_heads, head_size]
+  torch::Tensor& value,         // [num_tokens, num_heads, head_size]
+  torch::Tensor& key_cache,     // [num_blocks, block_size, num_heads, head_size]
+  torch::Tensor& value_cache,   // [num_blocks, block_size, num_heads, head_size]
+  torch::Tensor& slot_mapping,  // [num_tokens]
+  const std::string& kv_cache_dtype);
+
 // Just for unittest
 void convert_fp8(
   torch::Tensor& src_cache,
diff --git a/csrc/cache_kernels.cu b/csrc/cache_kernels.cu
index 24aaa2ff3e263..42f884c76c620 100644
--- a/csrc/cache_kernels.cu
+++ b/csrc/cache_kernels.cu
@@ -215,6 +215,41 @@ __global__ void reshape_and_cache_kernel(
   }
 }
 
+// Copies one token's key and value rows into that token's cache slot.
+template<typename scalar_t>
+__global__ void reshape_and_cache_flash_kernel(
+  const scalar_t* __restrict__ key,           // [num_tokens, num_heads, head_size]
+  const scalar_t* __restrict__ value,         // [num_tokens, num_heads, head_size]
+  scalar_t* __restrict__ k_cache,             // [num_blocks, block_size, num_heads, head_size]
+  scalar_t* __restrict__ v_cache,             // [num_blocks, block_size, num_heads, head_size]
+  const int64_t* __restrict__ slot_mapping,   // [num_tokens]
+  const int block_stride,
+  const int key_stride,
+  const int value_stride,
+  const int num_heads,
+  const int head_size,
+  const int block_size) {
+  // Each CUDA block handles the token selected by blockIdx.x.
+  const int64_t token = blockIdx.x;
+  const int64_t slot = slot_mapping[token];
+  // NOTE: a negative slot (e.g. -1) marks a padded token; nothing to store.
+  if (slot < 0) return;
+
+  // Split the flat slot into (cache block, position inside that block).
+  const int64_t cache_block = slot / block_size;
+  const int64_t pos_in_block = slot % block_size;
+  const int row_len = num_heads * head_size;
+  // The same destination offset is used for both k_cache and v_cache.
+  const int64_t dst_row = cache_block * block_stride
+                        + pos_in_block * num_heads * head_size;
+  const int64_t src_key_row = token * key_stride;
+  const int64_t src_value_row = token * value_stride;
+  // Threads stride over the row; (i / head_size, i % head_size) flattens to i.
+  for (int i = threadIdx.x; i < row_len; i += blockDim.x) {
+    k_cache[dst_row + i] = key[src_key_row + i];
+    v_cache[dst_row + i] = value[src_value_row + i];
+  }
+}
 } // namespace vllm
 
 #define CALL_RESHAPE_AND_CACHE(KV_T, CACHE_T, IS_FP8_KV_CACHE)                                     \
@@ -275,6 +310,51 @@ void reshape_and_cache(
   }
 }
 
+// Host entry point: launches one CUDA block per token to fill both caches.
+void reshape_and_cache_flash(
+  torch::Tensor& key,           // [num_tokens, num_heads, head_size]
+  torch::Tensor& value,         // [num_tokens, num_heads, head_size]
+  torch::Tensor& k_cache,       // [num_blocks, block_size, num_heads, head_size]
+  torch::Tensor& v_cache,       // [num_blocks, block_size, num_heads, head_size]
+  torch::Tensor& slot_mapping,  // [num_tokens]
+  const std::string& kv_cache_dtype)
+{
+  // FIXME: only the "auto" kv cache dtype is supported; fp8 is not.
+  TORCH_CHECK(kv_cache_dtype == "auto",
+              "Unsupported data type of kv cache: ", kv_cache_dtype);
+  int num_tokens = key.size(0);
+  // Nothing to copy, and a grid with zero blocks is not a valid launch.
+  if (num_tokens == 0) return;
+  int num_heads = key.size(1);
+  int head_size = key.size(2);
+  int block_size = k_cache.size(1);
+
+  int key_stride = key.stride(0);
+  int value_stride = value.stride(0);
+  int block_stride = k_cache.stride(0);
+  // The kernel indexes both caches with one offset, so strides must agree.
+  TORCH_CHECK(k_cache.stride(0) == v_cache.stride(0));
+
+  // Cap the block at 512 threads; the kernel loops when a row is longer.
+  dim3 grid(num_tokens);
+  dim3 block(std::min(num_heads * head_size, 512));
+  const at::cuda::OptionalCUDAGuard device_guard(device_of(key));
+  const cudaStream_t stream = at::cuda::getCurrentCUDAStream();
+  VLLM_DISPATCH_FLOATING_TYPES(
+    key.scalar_type(),
+    "reshape_and_cache_flash",
+    [&] {
+      vllm::reshape_and_cache_flash_kernel<scalar_t><<<grid, block, 0, stream>>>(
+        key.data_ptr<scalar_t>(),
+        value.data_ptr<scalar_t>(),
+        k_cache.data_ptr<scalar_t>(),
+        v_cache.data_ptr<scalar_t>(),
+        slot_mapping.data_ptr<int64_t>(),
+        block_stride, key_stride, value_stride,
+        num_heads, head_size, block_size);
+    });
+}
+
 namespace vllm {
 
 template<typename Tout, typename Tin>
diff --git a/csrc/pybind.cpp b/csrc/pybind.cpp
index 9839bfc0331c4..173e0b1732e13 100644
--- a/csrc/pybind.cpp
+++ b/csrc/pybind.cpp
@@ -96,6 +96,10 @@ PYBIND11_MODULE(TORCH_EXTENSION_NAME, m) {
     "reshape_and_cache",
     &reshape_and_cache,
     "Reshape the key and value tensors and cache them");
+  cache_ops.def(
+    "reshape_and_cache_flash",
+    &reshape_and_cache_flash,
+    "Reshape the key and value tensors and cache them");
   cache_ops.def(
     "convert_fp8",
     &convert_fp8,
diff --git a/tests/basic_correctness/test_basic_correctness.py b/tests/basic_correctness/test_basic_correctness.py
index 97cff623c5e1d..d75279dd9cfa9 100644
--- a/tests/basic_correctness/test_basic_correctness.py
+++ b/tests/basic_correctness/test_basic_correctness.py
@@ -2,12 +2,15 @@
 
 Run `pytest tests/basic_correctness/test_basic_correctness.py`.
 """
+import os
+
 import pytest
 
 MODELS = [
     "facebook/opt-125m",
     "meta-llama/Llama-2-7b-hf",
 ]
+VLLM_ATTENTION_BACKEND = "VLLM_ATTENTION_BACKEND"
 
 
 @pytest.mark.parametrize("model", MODELS)
@@ -23,11 +26,18 @@ def test_models(
     max_tokens: int,
     enforce_eager: bool,
 ) -> None:
+    backend_by_env_var = os.getenv(VLLM_ATTENTION_BACKEND)
+    if backend_by_env_var == "FLASHINFER" and enforce_eager is False:
+        pytest.skip("Skipping non-eager test for FlashInferBackend.")
+
     hf_model = hf_runner(model, dtype=dtype)
     hf_outputs = hf_model.generate_greedy(example_prompts, max_tokens)
     del hf_model
 
-    vllm_model = vllm_runner(model, dtype=dtype, enforce_eager=enforce_eager)
+    vllm_model = vllm_runner(model,
+                             dtype=dtype,
+                             enforce_eager=enforce_eager,
+                             gpu_memory_utilization=0.7)
     vllm_outputs = vllm_model.generate_greedy(example_prompts, max_tokens)
     del vllm_model
 
diff --git a/tests/distributed/test_basic_distributed_correctness.py b/tests/distributed/test_basic_distributed_correctness.py
index 77aa90b12bf8f..527452630c9f5 100644
--- a/tests/distributed/test_basic_distributed_correctness.py
+++ b/tests/distributed/test_basic_distributed_correctness.py
@@ -18,6 +18,7 @@
 MODELS = [
     os.environ["TEST_DIST_MODEL"],
 ]
+VLLM_ATTENTION_BACKEND = "VLLM_ATTENTION_BACKEND"
 
 
 @pytest.mark.skipif(torch.cuda.device_count() < 2,
@@ -33,16 +34,19 @@ def test_models(
     dtype: str,
     max_tokens: int,
 ) -> None:
+    enforce_eager = False
+    backend_by_env_var = os.getenv(VLLM_ATTENTION_BACKEND)
+    if backend_by_env_var == "FLASHINFER":
+        enforce_eager = True
 
     hf_model = hf_runner(model, dtype=dtype)
     hf_outputs = hf_model.generate_greedy(example_prompts, max_tokens)
     del hf_model
 
-    vllm_model = vllm_runner(
-        model,
-        dtype=dtype,
-        tensor_parallel_size=2,
-    )
+    vllm_model = vllm_runner(model,
+                             dtype=dtype,
+                             tensor_parallel_size=2,
+                             enforce_eager=enforce_eager)
     vllm_outputs = vllm_model.generate_greedy(example_prompts, max_tokens)
     del vllm_model
 
diff --git a/tests/kernels/conftest.py b/tests/kernels/conftest.py
index d26da2c7fe4ee..4f2f9cc3dac7d 100644
--- a/tests/kernels/conftest.py
+++ b/tests/kernels/conftest.py
@@ -1,8 +1,14 @@
 import pytest
 
-from vllm.utils import create_kv_caches_with_random
+from vllm.utils import (create_kv_caches_with_random,
+                        create_kv_caches_with_random_flash)
 
 
 @pytest.fixture()
 def kv_cache_factory():
     return create_kv_caches_with_random
+
+
+@pytest.fixture()
+def kv_cache_factory_flashinfer():
+    return create_kv_caches_with_random_flash  # flash (combined K/V) layout
diff --git a/tests/kernels/test_cache.py b/tests/kernels/test_cache.py
index d1051fd7e2f4d..ca215bb75837a 100644
--- a/tests/kernels/test_cache.py
+++ b/tests/kernels/test_cache.py
@@ -5,6 +5,7 @@
 import torch
 
 from vllm import _custom_ops as ops
+from vllm._C import cache_ops
 from vllm.utils import is_hip
 
 COPYING_DIRECTION = [('cuda', 'cpu'), ('cuda', 'cuda'), ('cpu', 'cuda')]
@@ -191,6 +192,82 @@ def test_reshape_and_cache(
         assert torch.allclose(value_cache, cloned_value_cache)
 
 
+@pytest.mark.parametrize("num_tokens", NUM_TOKENS)
+@pytest.mark.parametrize("num_heads", NUM_HEADS)
+@pytest.mark.parametrize("head_size", HEAD_SIZES)
+@pytest.mark.parametrize("block_size", BLOCK_SIZES)
+@pytest.mark.parametrize("num_blocks", NUM_BLOCKS)
+@pytest.mark.parametrize("dtype", DTYPES)
+@pytest.mark.parametrize("seed", SEEDS)
+@pytest.mark.parametrize("device", CUDA_DEVICES)
+@pytest.mark.parametrize("kv_cache_dtype", KV_CACHE_DTYPE)
+@torch.inference_mode()
+def test_reshape_and_cache_flash(
+    kv_cache_factory_flashinfer,
+    num_tokens: int,
+    num_heads: int,
+    head_size: int,
+    block_size: int,
+    num_blocks: int,
+    dtype: torch.dtype,
+    seed: int,
+    device: str,
+    kv_cache_dtype: str,
+) -> None:
+    """Check reshape_and_cache_flash against a pure-PyTorch reference."""
+    # The flash cache factory asserts that the cache dtype is not fp8.
+    if kv_cache_dtype == "fp8":
+        pytest.skip()
+    random.seed(seed)
+    torch.random.manual_seed(seed)
+    torch.cuda.manual_seed(seed)
+
+    # Create a random slot mapping on the device under test so that
+    # multi-GPU parametrizations do not mix tensors from different devices.
+    num_slots = block_size * num_blocks
+    slot_mapping = random.sample(range(num_slots), num_tokens)
+    slot_mapping = torch.tensor(slot_mapping, dtype=torch.long, device=device)
+
+    # Only the key and value slices of the packed qkv tensor are used.
+    qkv = torch.randn(num_tokens, 3, num_heads, head_size,
+                      dtype=dtype, device=device)
+    _, key, value = qkv.unbind(dim=1)
+
+    # Create the KV caches on the same device as key/value.
+    key_caches, value_caches = kv_cache_factory_flashinfer(
+        num_blocks,
+        block_size,
+        1,
+        num_heads,
+        head_size,
+        kv_cache_dtype,
+        dtype,
+        device=device)
+    key_cache, value_cache = key_caches[0], value_caches[0]
+
+    # Clone the KV caches.
+    cloned_key_cache = key_cache.clone()
+    cloned_value_cache = value_cache.clone()
+
+    # Call the reshape_and_cache_flash kernel.
+    cache_ops.reshape_and_cache_flash(key, value, key_cache, value_cache,
+                                      slot_mapping, kv_cache_dtype)
+
+    # Run the reference implementation.
+    block_indices = torch.div(slot_mapping, block_size, rounding_mode='floor')
+    block_indices = block_indices.cpu().tolist()
+    block_offsets = slot_mapping % block_size
+    block_offsets = block_offsets.cpu().tolist()
+    for i in range(num_tokens):
+        block_idx = block_indices[i]
+        block_offset = block_offsets[i]
+        cloned_key_cache[block_idx, block_offset, :, :] = key[i]
+        cloned_value_cache[block_idx, block_offset, :, :] = value[i]
+
+    assert torch.allclose(key_cache, cloned_key_cache)
+    assert torch.allclose(value_cache, cloned_value_cache)
+
+
 @pytest.mark.parametrize("direction", COPYING_DIRECTION)
 @pytest.mark.parametrize("num_mappings", NUM_MAPPINGS)
 @pytest.mark.parametrize("num_heads", NUM_HEADS)
diff --git a/vllm/_custom_ops.py b/vllm/_custom_ops.py
index b43f646fec88e..5b56437487477 100644
--- a/vllm/_custom_ops.py
+++ b/vllm/_custom_ops.py
@@ -222,6 +222,18 @@ def reshape_and_cache(
                                      slot_mapping, kv_cache_dtype, kv_scale)
 
 
+def reshape_and_cache_flash(
+    key: torch.Tensor,
+    value: torch.Tensor,
+    key_cache: torch.Tensor,
+    value_cache: torch.Tensor,
+    slot_mapping: torch.Tensor,
+    kv_cache_dtype: str,
+) -> None:  # thin wrapper over the vllm_cache_ops op of the same name
+    vllm_cache_ops.reshape_and_cache_flash(key, value, key_cache, value_cache,
+                                           slot_mapping, kv_cache_dtype)
+
+
 def copy_blocks(key_caches: torch.Tensor, value_caches: torch.Tensor,
                 block_mapping: torch.Tensor) -> None:
     vllm_cache_ops.copy_blocks(key_caches, value_caches, block_mapping)
diff --git a/vllm/attention/backends/abstract.py b/vllm/attention/backends/abstract.py
index be747c9900368..61c9c81d8a7b8 100644
--- a/vllm/attention/backends/abstract.py
+++ b/vllm/attention/backends/abstract.py
@@ -1,6 +1,7 @@
 from abc import ABC, abstractmethod
 from dataclasses import dataclass, fields
-from typing import Any, Dict, Generic, List, Optional, Tuple, Type, TypeVar
+from typing import (Any, Dict, Generic, List, Optional, Set, Tuple, Type,
+                    TypeVar)
 
 import torch
 
@@ -15,7 +16,7 @@ def get_impl_cls() -> Type["AttentionImpl"]:
 
     @staticmethod
     @abstractmethod
-    def make_metadata(*args, **kwargs) -> "AttentionMetadata":
+    def make_metadata(*args, **kwargs) -> "AttentionMetadataPerStage":
         raise NotImplementedError
 
     @staticmethod
@@ -50,13 +51,17 @@ def copy_blocks(
 class AttentionMetadataPerStage:
     """Attention metadata for a specific stage. I.e., prefill or decode."""
 
-    def asdict_zerocopy(self) -> Dict[str, Any]:
+    def asdict_zerocopy(self,
+                        skip_fields: Optional[Set[str]] = None
+                        ) -> Dict[str, Any]:
         """Similar to dataclasses.asdict, but avoids deepcopying."""
+        if skip_fields is None:
+            skip_fields = set()
         # Note that if we add dataclasses as fields, they will need
         # similar handling.
         return {
             field.name: getattr(self, field.name)
-            for field in fields(self)
+            for field in fields(self) if field.name not in skip_fields
         }
 
 
diff --git a/vllm/attention/backends/flashinfer.py b/vllm/attention/backends/flashinfer.py
new file mode 100644
index 0000000000000..8ab4b1f12ee36
--- /dev/null
+++ b/vllm/attention/backends/flashinfer.py
@@ -0,0 +1,220 @@
+from dataclasses import dataclass
+from typing import Any, Dict, List, Optional, Set, Tuple, Type
+
+try:
+    import flashinfer
+    from flash_attn import flash_attn_varlen_func
+    from flashinfer import BatchDecodeWithPagedKVCacheWrapper
+except ImportError:
+    flashinfer = None
+    flash_attn_varlen_func = None
+    BatchDecodeWithPagedKVCacheWrapper = None
+
+import torch
+
+from vllm import _custom_ops as ops
+from vllm.attention.backends.abstract import (AttentionBackend, AttentionImpl,
+                                              AttentionMetadata,
+                                              AttentionMetadataPerStage)
+
+
+class FlashInferBackend(AttentionBackend):  # FlashInfer backend entry points
+
+    @staticmethod
+    def get_impl_cls() -> Type["FlashInferImpl"]:
+        return FlashInferImpl
+
+    @staticmethod
+    def make_metadata(*args, **kwargs) -> "FlashInferMetadata":
+        return FlashInferMetadata(*args, **kwargs)
+
+    @staticmethod
+    def get_kv_cache_shape(
+        num_blocks: int,
+        block_size: int,
+        num_kv_heads: int,
+        head_size: int,
+    ) -> Tuple[int, ...]:  # dim 1 selects key (0) or value (1)
+        return (num_blocks, 2, block_size, num_kv_heads, head_size)
+
+    @staticmethod
+    def swap_blocks(
+        src_kv_cache: torch.Tensor,
+        dst_kv_cache: torch.Tensor,
+        src_to_dst: Dict[int, int],
+    ) -> None:
+        raise NotImplementedError  # block swapping is not implemented here
+
+    @staticmethod
+    def copy_blocks(
+        kv_caches: List[torch.Tensor],
+        src_to_dists: Dict[int, List[int]],
+    ) -> None:
+        raise NotImplementedError  # block copying is not implemented here
+
+    @staticmethod
+    def get_supported_head_sizes() -> List[int]:
+        return [64, 128, 256]  # checked in FlashInferMetadata.__post_init__
+
+
+@dataclass
+class FlashInferMetadata(AttentionMetadataPerStage):
+    """Per-stage attention metadata for the FlashInfer backend."""
+
+    is_prompt: bool
+
+    use_cuda_graph: bool = False
+
+    decode_wrapper: Optional[BatchDecodeWithPagedKVCacheWrapper] = None
+
+    # Metadata for the prefill stage (prefill still uses flash attention).
+    seq_start_loc: Optional[torch.Tensor] = None
+    max_seq_len: Optional[int] = None
+    block_tables: Optional[torch.Tensor] = None
+
+    # Metadata for the decode stage
+    # Workspace buffer required by the kernel, the buffer should not
+    # be allocated/deallocated by the FlashInferMetadata object.
+    workspace_buffer: Optional[torch.Tensor] = None
+    # An example for paged_kv_indices, paged_kv_indptr:
+    # request 1, page indices [0, 5, 8]
+    # request 2, page indices [1, 6, 7]
+    # request 3, page indices [3, 4]
+    # paged_kv_indices is a concatenation of page indices of all requests:
+    # [0, 5, 8, 1, 6, 7, 3, 4]
+    # paged_kv_indptr is used to index into paged_kv_indices:
+    # [0, 3, 6, 8]
+    # The indptr of the paged kv cache, shape: [batch_size + 1]
+    paged_kv_indptr: Optional[torch.Tensor] = None
+    # The page indices of the paged kv cache
+    paged_kv_indices: Optional[torch.Tensor] = None
+    # The number of entries in the last page of each request in
+    # the paged kv cache, shape: [batch_size]
+    paged_kv_last_page_len: Optional[torch.Tensor] = None
+    # The number of query/output heads
+    num_qo_heads: Optional[int] = None
+    # The number of key/value heads
+    num_kv_heads: Optional[int] = None
+    # The dimension of the attention heads
+    head_dim: Optional[int] = None
+    # Block size of vllm
+    page_size: Optional[int] = None
+    # The data type of the paged kv cache
+    data_type: Optional[torch.dtype] = None
+
+    def __post_init__(self):
+        """Validate head_dim and, for decode, prepare the decode wrapper."""
+        # Refer to
+        # https://github.com/flashinfer-ai/flashinfer/blob/3d55c71a62052c590c130897d3a3db49b14fcc34/include/flashinfer/utils.cuh#L157
+        supported_head_sizes = FlashInferBackend.get_supported_head_sizes()
+        if self.head_dim is not None and self.head_dim \
+                not in supported_head_sizes:
+            raise ValueError(
+                f"Only {supported_head_sizes} are supported for head_dim, "
+                f"received {self.head_dim}.")
+
+        # __post_init__ runs for prefill metadata too, but the decode wrapper
+        # is only needed (and its inputs only set) for the decode stage.
+        if not self.is_prompt:
+            self.decode_wrapper = flashinfer.BatchDecodeWithPagedKVCacheWrapper(
+                self.workspace_buffer, "NHD")
+            self.decode_wrapper.begin_forward(
+                self.paged_kv_indptr,
+                self.paged_kv_indices,
+                self.paged_kv_last_page_len,
+                self.num_qo_heads,
+                self.num_kv_heads,
+                self.head_dim,
+                self.page_size,
+                # Disable flashinfer's pos encoding and use vllm's rope.
+                pos_encoding_mode="NONE",
+                data_type=self.data_type)
+
+    def asdict_zerocopy(self,
+                        skip_fields: Optional[Set[str]] = None
+                        ) -> Dict[str, Any]:
+        """Like the base method, but always omits decode_wrapper."""
+        # decode_wrapper cannot be broadcasted with nccl when TP is enabled.
+        # Copy skip_fields so the caller's set is never mutated.
+        skip_fields = set() if skip_fields is None else set(skip_fields)
+        skip_fields.add('decode_wrapper')
+        return super().asdict_zerocopy(skip_fields)
+
+
+class FlashInferImpl(AttentionImpl):
+    """Attention: flash-attn for prefill, FlashInfer for paged decode."""
+
+    def __init__(
+        self,
+        num_heads: int,
+        head_size: int,
+        scale: float,
+        num_kv_heads: Optional[int] = None,
+        alibi_slopes: Optional[List[float]] = None,
+        sliding_window: Optional[int] = None,
+    ) -> None:
+        if sliding_window is not None:
+            raise ValueError("Sliding window is not supported in FlashInfer.")
+        # (-1, -1) tells flash_attn_varlen_func not to apply a local window.
+        self.sliding_window = (-1, -1)
+        # flash_attn_varlen_func takes alibi_slopes as an fp32 tensor, not
+        # a Python list. NOTE(review): the decode path ignores ALiBi.
+        if alibi_slopes is not None:
+            alibi_slopes = torch.tensor(alibi_slopes, dtype=torch.float32)
+        self.alibi_slopes = alibi_slopes
+        self.scale = scale
+        self.num_heads = num_heads
+        self.head_size = head_size
+        self.num_kv_heads = num_heads if num_kv_heads is None else num_kv_heads
+
+    def forward(self, query: torch.Tensor, key: torch.Tensor,
+                value: torch.Tensor, kv_cache: Optional[torch.Tensor],
+                attn_metadata: AttentionMetadata[FlashInferMetadata],
+                kv_scale: float):
+        """Cache key/value, then run prefill or decode attention."""
+        num_tokens, hidden_size = query.shape
+        query = query.view(-1, self.num_heads, self.head_size)
+        key = key.view(-1, self.num_kv_heads, self.head_size)
+        value = value.view(-1, self.num_kv_heads, self.head_size)
+
+        if attn_metadata.num_prefill_tokens > 0:
+            assert attn_metadata.num_decode_tokens == 0, (
+                "Chunked prefill is not supported with flashinfer yet.")
+        if attn_metadata.num_decode_tokens > 0:
+            assert attn_metadata.num_prefill_tokens == 0, (
+                "Chunked prefill is not supported with flashinfer yet.")
+
+        if kv_cache is not None:
+            # Use the same reshape and cache kernel as flash attention.
+            ops.reshape_and_cache_flash(
+                key, value, kv_cache[:, 0], kv_cache[:, 1],
+                attn_metadata.slot_mapping.flatten(),
+                attn_metadata.kv_cache_dtype)
+
+        if prefill_meta := attn_metadata.prefill_metadata:
+            assert prefill_meta.block_tables is not None
+            if kv_cache is None or prefill_meta.block_tables.numel() == 0:
+                output = flash_attn_varlen_func(
+                    q=query,
+                    k=key,
+                    v=value,
+                    cu_seqlens_q=prefill_meta.seq_start_loc,
+                    cu_seqlens_k=prefill_meta.seq_start_loc,
+                    max_seqlen_q=prefill_meta.max_seq_len,
+                    max_seqlen_k=prefill_meta.max_seq_len,
+                    softmax_scale=self.scale,
+                    causal=True,
+                    window_size=self.sliding_window,
+                    alibi_slopes=self.alibi_slopes,
+                )
+            else:
+                raise NotImplementedError(
+                    "Prefix caching is not supported with flashinfer yet.")
+        else:
+            assert attn_metadata.decode_metadata is not None
+            assert attn_metadata.decode_metadata.decode_wrapper is not None
+            # Flashinfer requires query to be contiguous.
+            query = query.contiguous()
+            output = attn_metadata.decode_metadata.decode_wrapper.forward(
+                query, kv_cache, sm_scale=self.scale)
+        return output.view(num_tokens, hidden_size)
diff --git a/vllm/attention/selector.py b/vllm/attention/selector.py
index 7ae8c31fae1ac..34da0f6c6cdfc 100644
--- a/vllm/attention/selector.py
+++ b/vllm/attention/selector.py
@@ -17,6 +17,7 @@ class _Backend(enum.Enum):
     XFORMERS = enum.auto()
     ROCM_FLASH = enum.auto()
     TORCH_SDPA = enum.auto()
+    FLASHINFER = enum.auto()
 
 
 @lru_cache(maxsize=None)
@@ -41,6 +42,11 @@ def get_attn_backend(dtype: torch.dtype) -> Type[AttentionBackend]:
         logger.info("Using Torch SDPA backend.")
         from vllm.attention.backends.torch_sdpa import TorchSDPABackend
         return TorchSDPABackend
+    elif backend == _Backend.FLASHINFER:
+        logger.info("Using Flashinfer backend.")
+        logger.warning("Eager mode is enforced for the Flashinfer backend. ")
+        from vllm.attention.backends.flashinfer import FlashInferBackend
+        return FlashInferBackend
     else:
         raise ValueError("Invalid attention backend.")
 
diff --git a/vllm/config.py b/vllm/config.py
index 3bdd3f774bc27..fe54c54bed48e 100644
--- a/vllm/config.py
+++ b/vllm/config.py
@@ -298,6 +298,11 @@ def get_num_kv_heads(self, parallel_config: "ParallelConfig") -> int:
         return max(1,
                    total_num_kv_heads // parallel_config.tensor_parallel_size)
 
+    def get_num_attention_heads(self,
+                                parallel_config: "ParallelConfig") -> int:
+        tp_size = parallel_config.tensor_parallel_size  # floor-divided below
+        return self.hf_text_config.num_attention_heads // tp_size
+
     def get_num_layers(self, parallel_config: "ParallelConfig") -> int:
         total_num_hidden_layers = self.hf_text_config.num_hidden_layers
         return total_num_hidden_layers // parallel_config.pipeline_parallel_size
diff --git a/vllm/sequence.py b/vllm/sequence.py
index 0e931ebbb6571..8caf97d30d539 100644
--- a/vllm/sequence.py
+++ b/vllm/sequence.py
@@ -579,8 +579,10 @@ class SequenceGroupMetadata:
             query tokens for prefill, we don't need sampling.
         token_chunk_size: The number of tokens to be processed (per sequence).
             None if chunking is not required.
-        state: Internal state tied to this sequence group.
         lora_request: LoRA request.
+        computed_block_nums: The block numbers that are already computed,
+            used in prefix caching.
+        state: Internal state tied to this sequence group.
         multi_modal_data: Multi modal data.
     """
 
diff --git a/vllm/utils.py b/vllm/utils.py
index ce55253ce2199..b06c8508757c5 100644
--- a/vllm/utils.py
+++ b/vllm/utils.py
@@ -355,21 +355,9 @@ def _generate_random_fp8(
     del tensor_tmp
 
 
-def create_kv_caches_with_random(
-    num_blocks: int,
-    block_size: int,
-    num_layers: int,
-    num_heads: int,
-    head_size: int,
-    cache_dtype: Optional[Union[str, torch.dtype]],
-    model_dtype: Optional[Union[str, torch.dtype]] = None,
-    seed: int = 0,
-    device: Optional[str] = "cuda",
-) -> Tuple[List[torch.Tensor], List[torch.Tensor]]:
-    torch.random.manual_seed(seed)
-    if torch.cuda.is_available():
-        torch.cuda.manual_seed(seed)
-
+def get_kv_cache_torch_dtype(
+        cache_dtype: Optional[Union[str, torch.dtype]],
+        model_dtype: Optional[Union[str, torch.dtype]] = None) -> torch.dtype:
     if isinstance(cache_dtype, str):
         if cache_dtype == "auto":
             if isinstance(model_dtype, str):
@@ -388,6 +376,55 @@ def create_kv_caches_with_random(
         torch_dtype = cache_dtype
     else:
         raise ValueError(f"Invalid kv cache dtype: {cache_dtype}")
+    return torch_dtype
+
+
+def create_kv_caches_with_random_flash(
+    num_blocks: int,
+    block_size: int,
+    num_layers: int,
+    num_heads: int,
+    head_size: int,
+    cache_dtype: Optional[Union[str, torch.dtype]],
+    model_dtype: Optional[Union[str, torch.dtype]] = None,
+    seed: int = 0,
+    device: Optional[str] = "cuda",
+) -> Tuple[List[torch.Tensor], List[torch.Tensor]]:
+    """Create per-layer random KV caches in the (num_blocks, 2, ...) layout.
+
+    Key and value caches are views [:, 0] and [:, 1] of one tensor per layer.
+    """
+    assert cache_dtype != "fp8"
+    torch.random.manual_seed(seed)
+    if torch.cuda.is_available():
+        torch.cuda.manual_seed(seed)
+
+    dtype = get_kv_cache_torch_dtype(cache_dtype, model_dtype)
+    shape = (num_blocks, 2, block_size, num_heads, head_size)
+    bound = head_size**-0.5
+    layers = [
+        torch.empty(shape, dtype=dtype, device=device).uniform_(-bound, bound)
+        for _ in range(num_layers)
+    ]
+    return [kv[:, 0] for kv in layers], [kv[:, 1] for kv in layers]
+
+
+def create_kv_caches_with_random(
+    num_blocks: int,
+    block_size: int,
+    num_layers: int,
+    num_heads: int,
+    head_size: int,
+    cache_dtype: Optional[Union[str, torch.dtype]],
+    model_dtype: Optional[Union[str, torch.dtype]] = None,
+    seed: int = 0,
+    device: Optional[str] = "cuda",
+) -> Tuple[List[torch.Tensor], List[torch.Tensor]]:
+    torch.random.manual_seed(seed)
+    if torch.cuda.is_available():
+        torch.cuda.manual_seed(seed)
+
+    torch_dtype = get_kv_cache_torch_dtype(cache_dtype, model_dtype)
 
     scale = head_size**-0.5
     x = 16 // torch.tensor([], dtype=torch_dtype).element_size()
diff --git a/vllm/worker/model_runner.py b/vllm/worker/model_runner.py
index bbb1f5205af5e..ab248596490f6 100644
--- a/vllm/worker/model_runner.py
+++ b/vllm/worker/model_runner.py
@@ -9,6 +9,7 @@
 
 from vllm.attention import (AttentionMetadata, AttentionMetadataPerStage,
                             get_attn_backend)
+from vllm.attention.backends.flashinfer import FlashInferBackend
 from vllm.config import (DeviceConfig, LoadConfig, LoRAConfig, ModelConfig,
                          ParallelConfig, SchedulerConfig, VisionLanguageConfig)
 from vllm.distributed import broadcast_tensor_dict, with_pynccl_for_all_reduce
@@ -23,8 +24,8 @@
 from vllm.sampling_params import SamplingParams
 from vllm.sequence import (MultiModalData, SamplerOutput, SequenceData,
                            SequenceGroupMetadata)
-from vllm.utils import (CudaMemoryProfiler, is_hip, is_pin_memory_available,
-                        make_tensor_with_pad)
+from vllm.utils import (CudaMemoryProfiler, get_kv_cache_torch_dtype, is_hip,
+                        is_pin_memory_available, make_tensor_with_pad)
 
 logger = init_logger(__name__)
 
@@ -155,6 +156,9 @@ def __init__(
         # (max batch size to capture, max context len to capture / block size).
         self.graph_block_tables: torch.Tensor  # Set after initial profiling.
 
+        # Set if the backend is flashinfer.
+        self.flashinfer_workspace_buffer: torch.Tensor
+
     def load_model(self) -> None:
         with CudaMemoryProfiler() as m:
             self.model = get_model(
@@ -315,6 +319,7 @@ def _prepare_prompt(
 
             # Compute the slot mapping.
             block_table = seq_group_metadata.block_tables[seq_id]
+
             # Mask the [0, start_idx) tokens of the prompt with _PAD_SLOT_ID,
             # where start_idx is max(0, seq_len - sliding_window).
             # For example, if the prompt len is 10, sliding window is 8, and
@@ -390,18 +395,26 @@ def _prepare_prompt(
                      dtype=seq_start_loc.dtype,
                      out=seq_start_loc[1:])
 
-        attn_metadata = self.attn_backend.make_metadata(
-            is_prompt=True,
-            seq_lens=seq_lens,
-            seq_lens_tensor=seq_lens_tensor,
-            max_query_len=max_query_len,
-            max_seq_len=max_seq_len,
-            subquery_start_loc=subquery_start_loc,
-            seq_start_loc=seq_start_loc,
-            context_lens_tensor=context_lens_tensor,
-            block_tables=block_tables,
-            use_cuda_graph=False,
-        )
+        if self.attn_backend is FlashInferBackend:
+            attn_metadata = self.attn_backend.make_metadata(
+                is_prompt=True,
+                use_cuda_graph=False,
+                seq_start_loc=seq_start_loc,
+                max_seq_len=max_seq_len,
+                block_tables=block_tables)
+        else:
+            attn_metadata = self.attn_backend.make_metadata(
+                is_prompt=True,
+                seq_lens=seq_lens,
+                seq_lens_tensor=seq_lens_tensor,
+                max_query_len=max_query_len,
+                max_seq_len=max_seq_len,
+                subquery_start_loc=subquery_start_loc,
+                seq_start_loc=seq_start_loc,
+                context_lens_tensor=context_lens_tensor,
+                block_tables=block_tables,
+                use_cuda_graph=False,
+            )
 
         return PreparePromptMetadata(
             input_tokens=input_tokens,
@@ -429,6 +442,24 @@ def _prepare_decode(
         lora_prompt_mapping: List[int] = []
         lora_requests: Set[LoRARequest] = set()
 
+        # The following fields are only for flashinfer
+        # Please follow https://docs.flashinfer.ai/tutorials/kv_layout.html#page-layout
+        # for the precise definition of the following fields.
+        # An example:
+        # request 1, page indices [0, 5, 8]
+        # request 2, page indices [1, 6, 7]
+        # request 3, page indices [3, 4]
+        # paged_kv_indices is a concatenation of page indices of all requests:
+        # [0, 5, 8, 1, 6, 7, 3, 4]
+        # paged_kv_indptr is used to index into paged_kv_indices:
+        # [0, 3, 6, 8]
+        paged_kv_indices: List[int] = []
+        # 0 at the beginning of paged_kv_indptr indicates the start of the
+        # first request’s page indices in the paged_kv_indices list.
+        paged_kv_indptr: List[int] = [0]
+        # paged_kv_last_page_len is the length of the last page of each request
+        paged_kv_last_page_len: List[int] = []
+
         if len(seq_group_metadata_list) == 0:
             return PrepareDecodeMetadata.empty()
 
@@ -469,6 +500,13 @@ def _prepare_decode(
                     block_table = block_table[-sliding_window_blocks:]
                 block_tables.append(block_table)
 
+                paged_kv_indices.extend(block_table)
+                paged_kv_indptr.append(paged_kv_indptr[-1] + len(block_table))
+                last_page_len = seq_data.get_len() % self.block_size
+                if last_page_len == 0:
+                    last_page_len = self.block_size
+                paged_kv_last_page_len.append(last_page_len)
+
         # vLLM uses cuda graph only for decoding requests.
         # See `capture_model` API for more details.
         # For decoding requests, batch_size == input_tokens.
@@ -518,18 +556,51 @@ def _prepare_decode(
                 device=self.device,
             )
 
-        attn_metadata = self.attn_backend.make_metadata(
-            is_prompt=False,
-            seq_lens=None,
-            seq_lens_tensor=seq_lens_tensor,
-            max_query_len=None,
-            max_seq_len=max_seq_len,
-            subquery_start_loc=None,
-            seq_start_loc=None,
-            context_lens_tensor=None,
-            block_tables=block_tables,
-            use_cuda_graph=use_captured_graph,
-        )
+        if self.attn_backend is FlashInferBackend:
+            if not hasattr(self, "flashinfer_workspace_buffer"):
+                # Allocate 16MB workspace buffer
+                # Follow the example of flashinfer: https://docs.flashinfer.ai/api/python/decode.html
+                self.flashinfer_workspace_buffer = torch.empty(
+                    16 * 1024 * 1024, dtype=torch.uint8, device=self.device)
+            paged_kv_indptr = torch.tensor(paged_kv_indptr,
+                                           dtype=torch.int,
+                                           device=self.device)
+            paged_kv_indices = torch.tensor(paged_kv_indices,
+                                            dtype=torch.int,
+                                            device=self.device)
+            paged_kv_last_page_len = torch.tensor(paged_kv_last_page_len,
+                                                  dtype=torch.int,
+                                                  device=self.device)
+            kv_cache_dtype = get_kv_cache_torch_dtype(self.kv_cache_dtype,
+                                                      self.model_config.dtype)
+
+            attn_metadata = self.attn_backend.make_metadata(
+                is_prompt=False,
+                use_cuda_graph=False,
+                workspace_buffer=self.flashinfer_workspace_buffer,
+                paged_kv_indptr=paged_kv_indptr,
+                paged_kv_indices=paged_kv_indices,
+                paged_kv_last_page_len=paged_kv_last_page_len,
+                num_qo_heads=self.model_config.get_num_attention_heads(
+                    self.parallel_config),
+                num_kv_heads=self.model_config.get_num_kv_heads(
+                    self.parallel_config),
+                head_dim=self.model_config.get_head_size(),
+                page_size=self.block_size,
+                data_type=kv_cache_dtype)
+        else:
+            attn_metadata = self.attn_backend.make_metadata(
+                is_prompt=False,
+                seq_lens=None,
+                seq_lens_tensor=seq_lens_tensor,
+                max_query_len=None,
+                max_seq_len=max_seq_len,
+                subquery_start_loc=None,
+                seq_start_loc=None,
+                context_lens_tensor=None,
+                block_tables=block_tables,
+                use_cuda_graph=use_captured_graph,
+            )
         return PrepareDecodeMetadata(
             input_tokens=input_tokens,
             input_positions=input_positions,

From ab502751117d3785384b9c33ee88e0aff93bbf05 Mon Sep 17 00:00:00 2001
From: Cade Daniel <edacih@gmail.com>
Date: Fri, 3 May 2024 15:52:01 -0700
Subject: [PATCH 50/65] [Speculative decoding] Support target-model logprobs
 (#4378)

---
 tests/spec_decode/e2e/conftest.py             |  66 +++-
 tests/spec_decode/e2e/test_logprobs.py        | 335 ++++++++++++++++++
 .../e2e/test_multistep_correctness.py         |  63 +++-
 tests/spec_decode/test_multi_step_worker.py   |   8 +
 tests/spec_decode/test_spec_decode_worker.py  |  29 +-
 tests/spec_decode/utils.py                    |   2 +
 vllm/engine/output_processor/multi_step.py    |  18 +-
 vllm/model_executor/layers/sampler.py         |  16 +-
 vllm/sequence.py                              |   3 +
 vllm/spec_decode/batch_expansion.py           |  59 ++-
 vllm/spec_decode/interfaces.py                |   5 +
 vllm/spec_decode/ngram_worker.py              |   6 +
 vllm/spec_decode/spec_decode_worker.py        | 100 ++++--
 vllm/spec_decode/top1_proposer.py             |   2 +-
 vllm/spec_decode/util.py                      | 103 +++++-
 15 files changed, 728 insertions(+), 87 deletions(-)
 create mode 100644 tests/spec_decode/e2e/test_logprobs.py

diff --git a/tests/spec_decode/e2e/conftest.py b/tests/spec_decode/e2e/conftest.py
index 492620cf6e2cf..b1ab8a07ca636 100644
--- a/tests/spec_decode/e2e/conftest.py
+++ b/tests/spec_decode/e2e/conftest.py
@@ -1,9 +1,13 @@
 import asyncio
+import time
 from itertools import cycle
-from typing import List, Optional, Tuple, Union
+from typing import Dict, List, Optional, Tuple, Union
 
 import pytest
 import ray
+import torch
+from pynvml import (nvmlDeviceGetHandleByIndex, nvmlDeviceGetMemoryInfo,
+                    nvmlInit)
 
 from tests.conftest import cleanup
 from vllm import LLM
@@ -13,7 +17,7 @@
 from vllm.model_executor.utils import set_random_seed
 from vllm.outputs import RequestOutput
 from vllm.sampling_params import SamplingParams
-from vllm.sequence import MultiModalData
+from vllm.sequence import Logprob, MultiModalData
 from vllm.usage.usage_lib import UsageContext
 from vllm.utils import Counter, random_uuid
 
@@ -153,12 +157,19 @@ def create_llm_generator(baseline_or_test, request, common_llm_kwargs,
     test_name = request.node.name
 
     def generator_inner():
-        print(f'Creating {baseline_or_test=} LLM for {test_name=}. {kwargs=}')
+
+        wait_for_gpu_memory_to_clear(
+            devices=list(range(torch.cuda.device_count())),
+            threshold_bytes=2 * 2**30,
+            timeout_s=60,
+        )
 
         use_async = False
         if "use_async" in kwargs:
             use_async = kwargs.pop("use_async")
+        print(f'{use_async=}')
 
+        print(f'Creating {baseline_or_test=} LLM for {test_name=}. {kwargs=}')
         llm = AsyncLLM(**kwargs) if use_async else LLM(**kwargs)
         set_random_seed(seed)
 
@@ -188,6 +199,20 @@ def get_output_from_llm_generator(
     return tokens, token_ids
 
 
+def get_logprobs_from_llm_generator(
+        llm_generator, prompts,
+        sampling_params) -> List[List[Dict[int, Logprob]]]:
+    """Returns a dict of (token_id: Logprob) for each generated position, for
+    each sequence in the batch.
+    """
+    for llm in llm_generator():
+        outputs = llm.generate(prompts, sampling_params, use_tqdm=True)
+        logprobs = [output.outputs[0].logprobs[:] for output in outputs]
+        del llm
+
+    return logprobs
+
+
 def run_greedy_equality_correctness_test(baseline_llm_generator,
                                          test_llm_generator,
                                          batch_size,
@@ -243,3 +268,38 @@ def run_greedy_equality_correctness_test(baseline_llm_generator,
         print(f'{i=} {baseline_token_ids=}')
         print(f'{i=}     {spec_token_ids=}')
         assert baseline_token_ids == spec_token_ids
+
+
+def wait_for_gpu_memory_to_clear(devices: List[int],
+                                 threshold_bytes: int,
+                                 timeout_s: float = 120) -> None:
+    # Use nvml instead of pytorch to reduce measurement error from torch cuda
+    # context.
+    nvmlInit()
+    start_time = time.time()
+    while True:
+        output = {}
+        output_raw = {}
+        for device in devices:
+            dev_handle = nvmlDeviceGetHandleByIndex(device)
+            mem_info = nvmlDeviceGetMemoryInfo(dev_handle)
+            gb_used = mem_info.used / 2**30
+            output_raw[device] = gb_used
+            output[device] = f'{gb_used:.02f}'
+
+        print('gpu memory used (GB): ', end='')
+        for k, v in output.items():
+            print(f'{k}={v}; ', end='')
+        print('')
+
+        dur_s = time.time() - start_time
+        if all(v <= (threshold_bytes / 2**30) for v in output_raw.values()):
+            print(f'Done waiting for free GPU memory on devices {devices=} '
+                  f'({threshold_bytes/2**30=}) {dur_s=:.02f}')
+            break
+
+        if dur_s >= timeout_s:
+            raise ValueError(f'Memory of devices {devices=} not free after '
+                             f'{dur_s=:.02f} ({threshold_bytes/2**30=})')
+
+        time.sleep(5)
diff --git a/tests/spec_decode/e2e/test_logprobs.py b/tests/spec_decode/e2e/test_logprobs.py
new file mode 100644
index 0000000000000..9572aac7df6e0
--- /dev/null
+++ b/tests/spec_decode/e2e/test_logprobs.py
@@ -0,0 +1,335 @@
+import math
+from itertools import cycle
+
+import pytest
+
+from vllm import SamplingParams
+
+from .conftest import get_logprobs_from_llm_generator
+
+
+@pytest.mark.parametrize(
+    "common_llm_kwargs",
+    [{
+        "model": "JackFram/llama-68m",
+
+        # Skip cuda graph recording for fast test.
+        "enforce_eager": True,
+
+        # Required for spec decode.
+        "use_v2_block_manager": True,
+        "max_logprobs": 6,
+    }])
+@pytest.mark.parametrize("per_test_common_llm_kwargs", [{}])
+@pytest.mark.parametrize("baseline_llm_kwargs", [{}])
+@pytest.mark.parametrize("test_llm_kwargs", [{
+    "speculative_model": "JackFram/llama-160m",
+    "num_speculative_tokens": 3,
+}])
+@pytest.mark.parametrize("batch_size", [8])
+@pytest.mark.parametrize(
+    "output_len",
+    [
+        # Use smaller output len for fast test.
+        7,
+    ])
+@pytest.mark.parametrize("seed", [1])
+def test_logprobs_equality(baseline_llm_generator, test_llm_generator,
+                           batch_size: int, output_len: int):
+    """Verify output logprobs are equal with and without speculative decoding.
+    """
+    run_greedy_logprobs_correctness_test(baseline_llm_generator,
+                                         test_llm_generator,
+                                         batch_size,
+                                         max_output_len=output_len,
+                                         force_output_len=True)
+
+
+@pytest.mark.parametrize(
+    "common_llm_kwargs",
+    [{
+        "model": "JackFram/llama-68m",
+
+        # Skip cuda graph recording for fast test.
+        "enforce_eager": True,
+
+        # Required for spec decode.
+        "use_v2_block_manager": True,
+        "max_logprobs": 6,
+    }])
+@pytest.mark.parametrize("per_test_common_llm_kwargs", [{}])
+@pytest.mark.parametrize("baseline_llm_kwargs", [{}])
+@pytest.mark.parametrize("test_llm_kwargs", [{
+    "speculative_model": "JackFram/llama-160m",
+    "num_speculative_tokens": 3,
+}])
+@pytest.mark.parametrize("batch_size", [1])
+@pytest.mark.parametrize("num_logprobs", [6])
+@pytest.mark.parametrize(
+    "output_len",
+    [
+        # Use smaller output len for fast test.
+        7,
+    ])
+@pytest.mark.parametrize("seed", [1])
+def test_diff_num_logprobs(baseline_llm_generator, test_llm_generator,
+                           batch_size: int, output_len: int,
+                           num_logprobs: int):
+    """Verify output logprobs are equal with and without spec decode.
+    This specifies a number of logprobs >1.
+    """
+    run_greedy_logprobs_correctness_test(baseline_llm_generator,
+                                         test_llm_generator,
+                                         batch_size,
+                                         max_output_len=output_len,
+                                         force_output_len=True,
+                                         logprob_rank=num_logprobs)
+
+
+@pytest.mark.parametrize(
+    "common_llm_kwargs",
+    [{
+        "model": "JackFram/llama-68m",
+
+        # Skip cuda graph recording for fast test.
+        "enforce_eager": True,
+
+        # Required for spec decode.
+        "use_v2_block_manager": True
+    }])
+@pytest.mark.parametrize("per_test_common_llm_kwargs", [{}])
+@pytest.mark.parametrize("baseline_llm_kwargs", [{}])
+@pytest.mark.parametrize("test_llm_kwargs", [{
+    "speculative_model": "JackFram/llama-160m",
+    "num_speculative_tokens": 3,
+}, {
+    "speculative_model": "JackFram/llama-160m",
+    "num_speculative_tokens": 6,
+}])
+@pytest.mark.parametrize("batch_size", [8])
+@pytest.mark.parametrize(
+    "output_len",
+    [
+        # Use smaller output len for fast test.
+        32,
+    ])
+@pytest.mark.parametrize("seed", [1])
+def test_logprobs_different_k(baseline_llm_generator, test_llm_generator,
+                              batch_size: int, output_len: int):
+    """Verify logprob greedy equality with different speculation lens.
+    """
+    run_greedy_logprobs_correctness_test(baseline_llm_generator,
+                                         test_llm_generator,
+                                         batch_size,
+                                         max_output_len=output_len,
+                                         force_output_len=True)
+
+
+@pytest.mark.parametrize(
+    "common_llm_kwargs",
+    [{
+        "model": "JackFram/llama-68m",
+
+        # Skip cuda graph recording for fast test.
+        "enforce_eager": True,
+
+        # Required for spec decode.
+        "use_v2_block_manager": True
+    }])
+@pytest.mark.parametrize("per_test_common_llm_kwargs", [{}])
+@pytest.mark.parametrize("baseline_llm_kwargs", [{}])
+@pytest.mark.parametrize(
+    "test_llm_kwargs",
+    [{
+        "speculative_model": "JackFram/llama-160m",
+        "num_speculative_tokens": 3,
+
+        # Artificially limit the draft model max model len; this forces vLLM
+        # to skip speculation once the sequences grow beyond 32-k tokens.
+        "speculative_max_model_len": 32,
+    }])
+@pytest.mark.parametrize("batch_size", [8])
+@pytest.mark.parametrize(
+    "output_len",
+    [
+        # Use smaller output len for fast test.
+        32,
+    ])
+@pytest.mark.parametrize("seed", [1])
+def test_logprobs_when_skip_speculation(baseline_llm_generator,
+                                        test_llm_generator, batch_size: int,
+                                        output_len: int):
+    """Verify logprobs greedy equality when some sequences skip speculation.
+    """
+    run_greedy_logprobs_correctness_test(baseline_llm_generator,
+                                         test_llm_generator,
+                                         batch_size,
+                                         max_output_len=output_len,
+                                         force_output_len=True)
+
+
+@pytest.mark.parametrize(
+    "common_llm_kwargs",
+    [{
+        "model": "JackFram/llama-68m",
+
+        # Skip cuda graph recording for fast test.
+        "enforce_eager": True,
+
+        # Required for spec decode.
+        "use_v2_block_manager": True
+    }])
+@pytest.mark.parametrize("per_test_common_llm_kwargs", [{}])
+@pytest.mark.parametrize("baseline_llm_kwargs", [{}])
+@pytest.mark.parametrize("test_llm_kwargs", [{
+    "speculative_model": "JackFram/llama-160m",
+    "num_speculative_tokens": 3,
+}])
+@pytest.mark.parametrize("batch_size", [1])
+@pytest.mark.parametrize(
+    "output_len",
+    [
+        # Use smaller output len for fast test.
+        32,
+    ])
+@pytest.mark.parametrize("seed", [1])
+def test_logprobs_temp_1(baseline_llm_generator, test_llm_generator,
+                         batch_size: int, output_len: int):
+    """Verify at least one logprob result has num_logprobs+1, which tests the
+    case where the sampled token is not in top-k logprobs.
+
+    Ideally, this test should validate equality with non-spec by getting
+    logprobs. This is left as a future improvement.
+    """
+    batch_size = 8
+    max_output_len = output_len
+    force_output_len = True
+    logprob_rank = 5
+
+    temperature = 1.0
+
+    prompts = [
+        "Hello, my name is",
+        "The president of the United States is",
+        "The capital of France is",
+        "The future of AI is",
+        "San Francisco is know for its",
+        "Facebook was created in 2004 by",
+        "Curious George is a",
+        "Python 3.11 brings improvements to its",
+    ]
+
+    prompts = [prompt for prompt, _ in zip(cycle(prompts), range(batch_size))]
+
+    # If the test requires that we generate max_output_len tokens, then set the
+    # sampling params to ignore eos token.
+    ignore_eos = force_output_len
+
+    sampling_params = SamplingParams(
+        max_tokens=max_output_len,
+        ignore_eos=ignore_eos,
+        temperature=temperature,
+        logprobs=logprob_rank,
+    )
+
+    spec_batch_logprobs = get_logprobs_from_llm_generator(
+        test_llm_generator, prompts, sampling_params)
+
+    num_returned_logprobs = [
+        len(logprob_dict) for seq_logprobs in spec_batch_logprobs
+        for logprob_dict in seq_logprobs
+    ]
+
+    # Assert one of the returned logprobs has > num_logprobs (indicating the
+    # sampled token is not in top-k).
+    assert any([
+        num_returned > logprob_rank for num_returned in num_returned_logprobs
+    ])
+
+
+def run_greedy_logprobs_correctness_test(baseline_llm_generator,
+                                         test_llm_generator,
+                                         batch_size,
+                                         max_output_len,
+                                         force_output_len: bool,
+                                         logprob_rank: int = 1):
+    """Helper method that compares the logprobs outputs of both the baseline LLM
+    and the test LLM. It asserts greedy equality of the logprobs when the
+    temperature is zero.
+    """
+    temperature = 0.0
+
+    prompts = [
+        "Hello, my name is",
+        "The president of the United States is",
+        "The capital of France is",
+        "The future of AI is",
+        "San Francisco is know for its",
+        "Facebook was created in 2004 by",
+        "Curious George is a",
+        "Python 3.11 brings improvements to its",
+    ]
+
+    prompts = [prompt for prompt, _ in zip(cycle(prompts), range(batch_size))]
+
+    # If the test requires that we generate max_output_len tokens, then set the
+    # sampling params to ignore eos token.
+    ignore_eos = force_output_len
+
+    sampling_params = SamplingParams(
+        max_tokens=max_output_len,
+        ignore_eos=ignore_eos,
+        temperature=temperature,
+        logprobs=logprob_rank,
+    )
+
+    spec_batch_logprobs = get_logprobs_from_llm_generator(
+        test_llm_generator, prompts, sampling_params)
+    baseline_batch_logprobs = get_logprobs_from_llm_generator(
+        baseline_llm_generator, prompts, sampling_params)
+
+    assert len(baseline_batch_logprobs) == len(prompts)
+    assert len(spec_batch_logprobs) == len(prompts)
+
+    # For each sequence in the batch.
+    for i, (baseline_logprobs, spec_logprobs) in enumerate(
+            zip(baseline_batch_logprobs, spec_batch_logprobs)):
+        assert len(spec_logprobs) == len(baseline_logprobs)
+
+        # For each generated position of the sequence.
+        for pos, (spec_pos_logprobs, baseline_pos_logprobs) in enumerate(
+                zip(spec_logprobs, baseline_logprobs)):
+
+            # Map rank to token/logprob in spec output.
+            spec_rank_to_token_id = {
+                value.rank: key
+                for key, value in spec_pos_logprobs.items()
+            }
+            spec_rank_to_logprob = {
+                value.rank: value.logprob
+                for key, value in spec_pos_logprobs.items()
+            }
+
+            # Map rank to token/logprob in baseline output.
+            baseline_rank_to_token_id = {
+                value.rank: key
+                for key, value in baseline_pos_logprobs.items()
+            }
+            baseline_rank_to_logprob = {
+                value.rank: value.logprob
+                for key, value in baseline_pos_logprobs.items()
+            }
+
+            # Assert set of ranks returned is equal.
+            assert set(spec_rank_to_token_id.keys()) == set(
+                baseline_rank_to_token_id.keys())
+
+            # Assert each logprob/token id is correct, keyed by rank.
+            for rank in sorted(set(spec_rank_to_token_id.keys())):
+                assert spec_rank_to_token_id[
+                    rank] == baseline_rank_to_token_id[rank], f"{rank}"
+                assert math.isclose(
+                    a=spec_rank_to_logprob[rank],
+                    b=baseline_rank_to_logprob[rank],
+                    abs_tol=1e-1,
+                )
diff --git a/tests/spec_decode/e2e/test_multistep_correctness.py b/tests/spec_decode/e2e/test_multistep_correctness.py
index f99e0f6778e59..f15fcc4746d20 100644
--- a/tests/spec_decode/e2e/test_multistep_correctness.py
+++ b/tests/spec_decode/e2e/test_multistep_correctness.py
@@ -41,24 +41,17 @@
 
 @pytest.mark.parametrize(
     "common_llm_kwargs",
-    [
-        {
-            # Use a small model for a fast test.
-            # Note this is repeated in the test body; to initialize a tokenizer.
-            "model": "JackFram/llama-68m",
-
-            # Skip cuda graph recording for fast test.
-            "enforce_eager": True,
+    [{
+        # Use a small model for a fast test.
+        # Note this is repeated in the test body; to initialize a tokenizer.
+        "model": "JackFram/llama-68m",
 
-            # Required for spec decode.
-            "use_v2_block_manager": True,
+        # Skip cuda graph recording for fast test.
+        "enforce_eager": True,
 
-            # whether use AsyncLLM engine
-            "use_async": async_mode,
-        }
-        # Try both async and sync engine execution
-        for async_mode in [True, False]
-    ])
+        # Required for spec decode.
+        "use_v2_block_manager": True,
+    }])
 @pytest.mark.parametrize(
     "per_test_common_llm_kwargs",
     [
@@ -117,6 +110,44 @@ def test_spec_decode_e2e_with_detokenization(test_llm_generator,
         assert actual_tokens.strip() == expected_tokens.strip()
 
 
+@pytest.mark.parametrize(
+    "common_llm_kwargs",
+    [{
+        # Use a small model for a fast test.
+        # Note this is repeated in the test body; to initialize a tokenizer.
+        "model": "JackFram/llama-68m",
+
+        # Skip cuda graph recording for fast test.
+        "enforce_eager": True,
+
+        # Required for spec decode.
+        "use_v2_block_manager": True,
+
+        # Use AsyncLLM engine
+        "use_async": True,
+    }])
+@pytest.mark.parametrize("baseline_llm_kwargs", [{}])
+@pytest.mark.parametrize("per_test_common_llm_kwargs", [
+    {
+        "speculative_model": "JackFram/llama-68m",
+        "num_speculative_tokens": 5,
+    },
+])
+@pytest.mark.parametrize("test_llm_kwargs", [{}])
+@pytest.mark.parametrize("batch_size", [2])
+@pytest.mark.parametrize("seed", [1])
+def test_spec_decode_e2e_with_async_engine(test_llm_generator,
+                                           baseline_llm_generator,
+                                           batch_size: int):
+    """Verify spec decode works well with async LLM engine.
+    """
+    run_greedy_equality_correctness_test(baseline_llm_generator,
+                                         test_llm_generator,
+                                         batch_size,
+                                         max_output_len=32,
+                                         force_output_len=True)
+
+
 @pytest.mark.parametrize(
     "common_llm_kwargs",
     [{
diff --git a/tests/spec_decode/test_multi_step_worker.py b/tests/spec_decode/test_multi_step_worker.py
index cc0427633e688..a33fd71459455 100644
--- a/tests/spec_decode/test_multi_step_worker.py
+++ b/tests/spec_decode/test_multi_step_worker.py
@@ -292,6 +292,10 @@ def test_draft_proposals_full_speculation_len():
                                            vocab_size,
                                            device=device,
                                            dtype=torch.float32),
+            logprobs=torch.rand(batch_size,
+                                vocab_size,
+                                device=device,
+                                dtype=torch.float32),
             sampled_token_ids=torch.randint(low=0,
                                             high=vocab_size,
                                             size=(batch_size, ),
@@ -392,6 +396,10 @@ def test_draft_proposals_mixed_k():
                                            vocab_size,
                                            device=device,
                                            dtype=torch.float32),
+            logprobs=torch.rand(expected_num_proposal_seqs,
+                                vocab_size,
+                                device=device,
+                                dtype=torch.float32),
             sampled_token_ids=torch.randint(
                 low=0,
                 high=vocab_size,
diff --git a/tests/spec_decode/test_spec_decode_worker.py b/tests/spec_decode/test_spec_decode_worker.py
index 91315df9b5e60..6763583aa85cc 100644
--- a/tests/spec_decode/test_spec_decode_worker.py
+++ b/tests/spec_decode/test_spec_decode_worker.py
@@ -192,8 +192,14 @@ def test_correctly_calls_rejection_sampler(k: int, batch_size: int):
                                     vocab_size,
                                     dtype=torch.float32,
                                     device='cuda')
+    target_token_logprobs = torch.rand(1,
+                                       batch_size * (k + 1),
+                                       vocab_size,
+                                       dtype=torch.float32,
+                                       device='cuda')
     target_output = create_sampler_output_list(target_token_ids,
-                                               target_token_probs)
+                                               target_token_probs,
+                                               target_token_logprobs)
 
     target_worker.execute_model.return_value = [target_output[0]]
 
@@ -273,8 +279,14 @@ def test_correctly_formats_output(k: int, batch_size: int):
                                     vocab_size,
                                     dtype=torch.float32,
                                     device='cuda')
+    target_token_logprobs = torch.rand(1,
+                                       batch_size * (k + 1),
+                                       vocab_size,
+                                       dtype=torch.float32,
+                                       device='cuda')
     target_output = create_sampler_output_list(target_token_ids,
-                                               target_token_probs)
+                                               target_token_probs,
+                                               target_token_logprobs)
 
     target_worker.execute_model.return_value = [target_output[0]]
 
@@ -294,7 +306,9 @@ def test_correctly_formats_output(k: int, batch_size: int):
                                   num_lookahead_slots=k)
 
     expected_output = create_sampler_output_list(
-        rejection_sampler_output.transpose(0, 1), [None for _ in range(k + 1)])
+        token_ids=rejection_sampler_output.transpose(0, 1),
+        probs=[None for _ in range(k + 1)],
+        logprobs=[None for _ in range(k + 1)])
 
     seq_ids = [
         next(iter(seq_group_metadata.seq_data.keys()))
@@ -328,7 +342,6 @@ def test_correctly_formats_output(k: int, batch_size: int):
                 continue
             assert actual_by_step[i].output_token == expected_by_step[
                 i].output_token
-            assert actual_by_step[i].logprobs == expected_by_step[i].logprobs
 
 
 @pytest.mark.parametrize('k', [1, 2])
@@ -387,8 +400,14 @@ def test_collects_metrics(k: int, batch_size: int, returns_metrics: bool):
                                     vocab_size,
                                     dtype=torch.float32,
                                     device='cuda')
+    target_token_logprobs = torch.rand(1,
+                                       batch_size * (k + 1),
+                                       vocab_size,
+                                       dtype=torch.float32,
+                                       device='cuda')
     target_output = create_sampler_output_list(target_token_ids,
-                                               target_token_probs)
+                                               target_token_probs,
+                                               target_token_logprobs)
 
     target_worker.execute_model.return_value = [target_output[0]]
 
diff --git a/tests/spec_decode/utils.py b/tests/spec_decode/utils.py
index 87c7d88a80f42..f0f0d09106a00 100644
--- a/tests/spec_decode/utils.py
+++ b/tests/spec_decode/utils.py
@@ -201,6 +201,7 @@ def assert_logprobs_dict_allclose(
 def create_sampler_output_list(
         token_ids: torch.Tensor,
         probs: Iterable[Optional[torch.Tensor]],
+        logprobs: Iterable[Optional[torch.Tensor]],
         seq_ids: Optional[List[int]] = None) -> List[SamplerOutput]:
     num_steps, batch_size = token_ids.shape
     token_ids_by_step = token_ids.tolist()
@@ -222,6 +223,7 @@ def create_sampler_output_list(
             ) for seq_index, token_id in enumerate(token_ids_by_step[step])
         ],
                       sampled_token_probs=probs[step],
+                      logprobs=logprobs[step],
                       sampled_token_ids=token_ids[step])
         for step in range(num_steps)
     ]
diff --git a/vllm/engine/output_processor/multi_step.py b/vllm/engine/output_processor/multi_step.py
index 9abd87a4d5a9a..5f2f433aa811f 100644
--- a/vllm/engine/output_processor/multi_step.py
+++ b/vllm/engine/output_processor/multi_step.py
@@ -1,3 +1,4 @@
+import functools
 from typing import Callable, List
 
 from transformers import PreTrainedTokenizer
@@ -8,8 +9,8 @@
 from vllm.engine.output_processor.stop_checker import StopChecker
 from vllm.logger import init_logger
 from vllm.sampling_params import SamplingParams
-from vllm.sequence import (Logprob, Sequence, SequenceGroup,
-                           SequenceGroupOutput, SequenceOutput, SequenceStatus)
+from vllm.sequence import (Sequence, SequenceGroup, SequenceGroupOutput,
+                           SequenceOutput, SequenceStatus)
 from vllm.transformers_utils.detokenizer import Detokenizer
 from vllm.utils import Counter
 
@@ -48,10 +49,14 @@ def process_prompt_logprob(self, seq_group: SequenceGroup,
                                outputs: List[SequenceGroupOutput]) -> None:
         # TODO(sang): Prompt logprob currently not implemented in multi step
         # workers.
+        self._log_prompt_logprob_unsupported_warning_once()
+
+    @staticmethod
+    @functools.lru_cache()
+    def _log_prompt_logprob_unsupported_warning_once():
         logger.warning(
             "Prompt logprob is not supported by multi step workers. "
             "(e.g., speculative decode uses multi step workers).")
-        pass
 
     def process_outputs(self, sequence_group: SequenceGroup,
                         outputs: List[SequenceGroupOutput]) -> None:
@@ -89,6 +94,7 @@ def _process_seq_outputs(self, seq: Sequence,
                              valid_samples: List[SequenceOutput],
                              sampling_params: SamplingParams) -> None:
         output_token_ids = [sample.output_token for sample in valid_samples]
+        output_logprobs = [sample.logprobs for sample in valid_samples]
 
         # Truncate to max_tokens if necessary.
         remaining_tokens = sampling_params.max_tokens - (seq.get_output_len() +
@@ -113,11 +119,11 @@ def _process_seq_outputs(self, seq: Sequence,
 
         # Incrementally append tokens to the sequence, as if we had only one new
         # token.
-        for output_token_id in output_token_ids:
+        for output_token_id, output_logprob in zip(output_token_ids,
+                                                   output_logprobs):
             seq.append_token_id(
                 token_id=output_token_id,
-                # TODO emit logprobs in multi-step decoding.
-                logprobs={output_token_id: Logprob(0.0)},
+                logprobs=output_logprob,
             )
 
             new_char_count = 0
diff --git a/vllm/model_executor/layers/sampler.py b/vllm/model_executor/layers/sampler.py
index 2de7763605dfc..1f19d2053d996 100644
--- a/vllm/model_executor/layers/sampler.py
+++ b/vllm/model_executor/layers/sampler.py
@@ -103,8 +103,7 @@ def forward(
 
         if self.include_gpu_probs_tensor:
             assert maybe_sampled_tokens_tensor is not None
-            sampled_tokens_tensor = maybe_sampled_tokens_tensor
-            on_device_tensors = (probs, sampled_tokens_tensor)
+            on_device_tensors = (probs, logprobs, maybe_sampled_tokens_tensor)
         else:
             on_device_tensors = None
 
@@ -965,8 +964,7 @@ def _modify_greedy_probs_inplace(logprobs: torch.Tensor, probs: torch.Tensor,
     has implications on the overall design of the sampler, e.g. how to record
     accurate logprobs for the user, so this improvement is deferred to later.
     """
-    logprobs[sample_indices, :] = -float('inf')
-    logprobs[sample_indices, greedy_samples] = 0.0
+    # NOTE: logprobs are not modified so they can be returned to the user.
     probs[sample_indices, :] = 0
     probs[sample_indices, greedy_samples] = 1.0
 
@@ -976,7 +974,8 @@ def _build_sampler_output(
     sampling_metadata: SamplingMetadata,
     prompt_logprobs: List[Optional[PromptLogprobs]],
     sample_logprobs: List[SampleLogprobs],
-    on_device_tensors: Optional[Tuple[torch.Tensor, torch.Tensor]],
+    on_device_tensors: Optional[Tuple[torch.Tensor, torch.Tensor,
+                                      torch.Tensor]],
 ) -> SamplerOutput:
     """Construct Python objects with the output of sampling.
 
@@ -1005,14 +1004,17 @@ def _build_sampler_output(
 
     # If not specified, store None values in SamplerOutput.
     if on_device_tensors is not None:
-        sampled_token_probs, sampled_token_ids = on_device_tensors
+        (sampled_token_probs, logprobs_tensor,
+         sampled_token_ids) = on_device_tensors
     else:
-        sampled_token_probs, sampled_token_ids = (None, None)
+        sampled_token_probs, logprobs_tensor, sampled_token_ids = (None, None,
+                                                                   None)
 
     return SamplerOutput(
         outputs=sampler_output,
         sampled_token_probs=sampled_token_probs,
         sampled_token_ids=sampled_token_ids,
+        logprobs=logprobs_tensor,
     )
 
 
diff --git a/vllm/sequence.py b/vllm/sequence.py
index 8caf97d30d539..35ac59d69f117 100644
--- a/vllm/sequence.py
+++ b/vllm/sequence.py
@@ -700,6 +700,9 @@ class SamplerOutput:
     # On-device tensor containing probabilities of each token.
     sampled_token_probs: Optional["torch.Tensor"] = None
 
+    # On-device tensor containing the logprobs of each token.
+    logprobs: Optional["torch.Tensor"] = None
+
     # On-device tensor containing the sampled token ids.
     sampled_token_ids: Optional["torch.Tensor"] = None
 
diff --git a/vllm/spec_decode/batch_expansion.py b/vllm/spec_decode/batch_expansion.py
index 8b113e93474ff..8b302ba1aabeb 100644
--- a/vllm/spec_decode/batch_expansion.py
+++ b/vllm/spec_decode/batch_expansion.py
@@ -94,7 +94,7 @@ def score_proposals(
         assert len(target_sampler_output) == 1, "expected single-step output"
         target_sampler_output = target_sampler_output[0]
 
-        all_tokens, all_probs = self._contract_batch(
+        all_tokens, all_probs, spec_logprobs = self._contract_batch(
             contracted_bs=len(seq_group_metadata_list),
             target_sampler_output=target_sampler_output,
             proposals=proposals,
@@ -107,6 +107,7 @@ def score_proposals(
         return SpeculativeScores(
             probs=all_probs,
             token_ids=all_tokens,
+            logprobs=spec_logprobs,
         )
 
     def _expand_batch(
@@ -148,12 +149,12 @@ def _expand_batch(
         return (spec_indices, non_spec_indices, target_seq_group_metadata_list,
                 num_scoring_tokens)
 
-    def _contract_batch(self, contracted_bs: int,
-                        target_sampler_output: List[SamplerOutput],
-                        proposals: SpeculativeProposals,
-                        num_scoring_tokens: int, non_spec_indices: List[int],
-                        spec_indices: List[int],
-                        k: int) -> Tuple[torch.Tensor, torch.Tensor]:
+    def _contract_batch(
+            self, contracted_bs: int,
+            target_sampler_output: List[SamplerOutput],
+            proposals: SpeculativeProposals, num_scoring_tokens: int,
+            non_spec_indices: List[int], spec_indices: List[int],
+            k: int) -> Tuple[torch.Tensor, torch.Tensor, torch.Tensor]:
         """Contract the expanded batch back into its original size.
         This maps the scores of speculative tokens back to their original
         sequences.
@@ -161,8 +162,9 @@ def _contract_batch(self, contracted_bs: int,
         contracted_bs is the original batch size, and the batch size that the
         target_sampler_output will be contracted to.
         """
-        (target_token_ids, target_probs, non_spec_target_token_ids,
-         non_spec_target_probs) = self._split_scoring_output(
+        (target_token_ids, target_probs, target_logprobs,
+         non_spec_target_token_ids, non_spec_target_probs,
+         non_spec_target_logprobs) = self._split_scoring_output(
              target_sampler_output, num_scoring_tokens)
 
         # Map distinct sequences used to score each token
@@ -179,6 +181,8 @@ def _contract_batch(self, contracted_bs: int,
             spec_expanded_bs, k + 1)
         target_probs = target_probs.squeeze().reshape(spec_expanded_bs, k + 1,
                                                       self._vocab_size)
+        target_logprobs = target_logprobs.squeeze().reshape(
+            spec_expanded_bs, k + 1, self._vocab_size)
 
         all_tokens = torch.full(size=(contracted_bs, k + 1),
                                 fill_value=-1,
@@ -189,16 +193,26 @@ def _contract_batch(self, contracted_bs: int,
                                 self._vocab_size,
                                 device=self._device,
                                 dtype=torch.float32)
+        all_logprobs = torch.full(size=(
+            contracted_bs,
+            k + 1,
+            self._vocab_size,
+        ),
+                                  fill_value=-float("inf"),
+                                  device=self._device,
+                                  dtype=torch.float32)
 
         if non_spec_indices:
             all_tokens[non_spec_indices, :1] = non_spec_target_token_ids
             all_probs[non_spec_indices, :1, :] = non_spec_target_probs
+            all_logprobs[non_spec_indices, :1, :] = non_spec_target_logprobs
 
         if spec_indices:
             all_tokens[spec_indices] = target_token_ids
             all_probs[spec_indices] = target_probs
+            all_logprobs[spec_indices] = target_logprobs
 
-        return all_tokens, all_probs
+        return all_tokens, all_probs, all_logprobs
 
     def _create_scoring_model_input(
         self,
@@ -308,7 +322,8 @@ def _create_single_target_seq_group_metadata(
 
     def _split_scoring_output(
         self, sampler_output: SamplerOutput, num_scoring_tokens: int
-    ) -> Tuple[torch.Tensor, torch.Tensor, torch.Tensor, torch.Tensor]:
+    ) -> Tuple[torch.Tensor, torch.Tensor, torch.Tensor, torch.Tensor,
+               torch.Tensor, torch.Tensor]:
         """Split the target model output into speculative and non-speculative
         output.
         """
@@ -328,21 +343,29 @@ def _split_scoring_output(
          ) = sampler_output.sampled_token_probs.split(split_sizes)
         (spec_sampled_tokens, non_spec_sampled_tokens
          ) = sampler_output.sampled_token_ids.flatten().split(split_sizes)
+        (
+            spec_logprobs,
+            non_spec_logprobs,
+        ) = sampler_output.logprobs.split(split_sizes)
 
         # Convert scores to tensors.
         sampler_output.sampled_token_probs = spec_probs
         sampler_output.sampled_token_ids = spec_sampled_tokens
-        target_token_ids, target_probs = sampler_output_to_torch(
-            [sampler_output], True)
+        sampler_output.logprobs = spec_logprobs
+        (target_token_ids, target_probs,
+         target_logprobs) = sampler_output_to_torch([sampler_output], True)
 
         # Convert non-speculative output tokens to tensors.
         sampler_output.sampled_token_probs = non_spec_probs
         sampler_output.sampled_token_ids = non_spec_sampled_tokens
-        non_spec_target_token_ids, non_spec_target_probs = (
-            sampler_output_to_torch([sampler_output], True))
-
-        return (target_token_ids, target_probs, non_spec_target_token_ids,
-                non_spec_target_probs)
+        sampler_output.logprobs = non_spec_logprobs
+        (non_spec_target_token_ids, non_spec_target_probs,
+         non_spec_target_logprobs) = sampler_output_to_torch([sampler_output],
+                                                             True)
+
+        return (target_token_ids, target_probs, target_logprobs,
+                non_spec_target_token_ids, non_spec_target_probs,
+                non_spec_target_logprobs)
 
     def _create_target_seq_id_iterator(
             self, seq_ids: List[SeqId]) -> Iterator[TargetSeqId]:
diff --git a/vllm/spec_decode/interfaces.py b/vllm/spec_decode/interfaces.py
index dd040779922e9..489d940a88856 100644
--- a/vllm/spec_decode/interfaces.py
+++ b/vllm/spec_decode/interfaces.py
@@ -38,6 +38,11 @@ class SpeculativeScores:
     # Probabilities of the speculative tokens according to the scoring model.
     probs: torch.Tensor
 
+    # Log-probabilities of the speculative tokens according to the scoring
+    # model. These values can be used to generate Logprob objects that are
+    # returned to the user.
+    logprobs: torch.Tensor
+
     # Token ids sampled from the scoring model. Used for speculative bonus
     # tokens and also non-speculative normal decoding.
     token_ids: torch.Tensor
diff --git a/vllm/spec_decode/ngram_worker.py b/vllm/spec_decode/ngram_worker.py
index 696ca964328cf..cacaca697526c 100644
--- a/vllm/spec_decode/ngram_worker.py
+++ b/vllm/spec_decode/ngram_worker.py
@@ -140,11 +140,17 @@ def sampler_output(
             device=self.device,
         )
         token_probs.scatter_(2, indices, 1)
+        token_logprobs = torch.zeros(
+            (len(seq_group_metadata_list), sample_len, self.vocab_size),
+            dtype=torch.float32,
+            device=self.device,
+        )
         for i in range(len(seq_group_metadata_list)):
             outputs.append(
                 SamplerOutput(
                     outputs=None,
                     sampled_token_probs=token_probs[i],
+                    logprobs=token_logprobs,
                     sampled_token_ids=token_ids[i],
                 ))
         return outputs, False
diff --git a/vllm/spec_decode/spec_decode_worker.py b/vllm/spec_decode/spec_decode_worker.py
index e33bb4f3f6337..503519a0dfc4b 100644
--- a/vllm/spec_decode/spec_decode_worker.py
+++ b/vllm/spec_decode/spec_decode_worker.py
@@ -5,15 +5,16 @@
 
 from vllm.logger import init_logger
 from vllm.model_executor.layers.rejection_sampler import RejectionSampler
-from vllm.sequence import (Logprob, SamplerOutput, SequenceGroupMetadata,
-                           SequenceGroupOutput, SequenceOutput)
+from vllm.sequence import SamplerOutput, SequenceGroupMetadata
 from vllm.spec_decode.batch_expansion import BatchExpansionTop1Scorer
 from vllm.spec_decode.interfaces import (SpeculativeProposals,
                                          SpeculativeScorer, SpeculativeScores)
 from vllm.spec_decode.metrics import AsyncMetricsCollector
 from vllm.spec_decode.multi_step_worker import MultiStepWorker
 from vllm.spec_decode.ngram_worker import NGramWorker
-from vllm.spec_decode.util import (get_all_seq_ids, nvtx_range,
+from vllm.spec_decode.util import (create_sequence_group_output,
+                                   get_all_num_logprobs, get_all_seq_ids,
+                                   get_sampled_token_logprobs, nvtx_range,
                                    split_batch_by_proposal_len)
 from vllm.worker.worker_base import LoraNotSupportedWorkerBase, WorkerBase
 
@@ -258,6 +259,7 @@ def _run_no_spec(
         # overhead when the engine runs in a different process than the workers.
         sampler_output.probs = None
         sampler_output.sampled_tokens = None
+        sampler_output.logprobs = None
         return [sampler_output]
 
     @nvtx_range("spec_decode_worker._run_speculative_decoding_step")
@@ -298,12 +300,15 @@ def _run_speculative_decoding_step(
         )
 
         #logger.info("verify proposals")
-        accepted_token_ids = self._verify_tokens(seq_group_metadata_list,
-                                                 proposal_scores, proposals, k)
+        accepted_token_ids, target_logprobs = self._verify_tokens(
+            seq_group_metadata_list, proposal_scores, proposals, k)
 
         #logger.info("create output list")
-        return self._create_output_sampler_list(seq_group_metadata_list,
-                                                accepted_token_ids, k)
+        return self._create_output_sampler_list(
+            seq_group_metadata_list,
+            accepted_token_ids,
+            target_logprobs=target_logprobs,
+            k=k)
 
     @nvtx_range("spec_decode_worker._verify_tokens")
     def _verify_tokens(
@@ -312,9 +317,12 @@ def _verify_tokens(
         proposal_scores: SpeculativeScores,
         proposals: SpeculativeProposals,
         max_proposal_len: int,
-    ) -> torch.Tensor:
+    ) -> Tuple[torch.Tensor, torch.Tensor]:
         """Determine which speculative tokens are accepted using the
         probabilities of each token according to the proposer and scorer models.
+
+        Returns a tuple of Tensors, one for the accepted token ids and one for
+        the logprobs according to the scoring model.
         """
         proposal_lens_list = proposals.proposal_lens.tolist()
 
@@ -361,17 +369,19 @@ def _verify_tokens(
         non_spec_token_ids[:, 1:] = -1
         accepted_token_ids = torch.cat(
             [accepted_token_ids, non_spec_token_ids])
+        logprobs = proposal_scores.logprobs
 
         # Rearrange so that results are in the order of the original seq group
         # metadata.
         accepted_token_ids[original_indices] = accepted_token_ids.clone()
 
-        return accepted_token_ids
+        return accepted_token_ids, logprobs
 
     def _create_output_sampler_list(
         self,
         seq_group_metadata_list: List[SequenceGroupMetadata],
         accepted_token_ids: torch.Tensor,  # shape: [batch_size, k+1]
+        target_logprobs: torch.Tensor,  # shape: [batch_size, k+1, vocab_size]
         k: int,
     ) -> List[SamplerOutput]:
         """Given the accepted token ids, create a list of SamplerOutput.
@@ -379,30 +389,68 @@ def _create_output_sampler_list(
         The output is padded with -1 tokens such that each sequence has
         the same number of outputs.
         """
+        batch_size, num_steps = accepted_token_ids.shape
+
+        # Organize input tensors by step instead of by sequence.
+        target_logprobs_by_step = target_logprobs.transpose(0, 1)
+        accepted_token_ids_by_step = accepted_token_ids.transpose(0, 1)
+
+        # Get the logprobs/rank of the accepted tokens.
+        (accepted_token_id_ranks_by_step,
+         accepted_token_id_logprobs_by_step) = get_sampled_token_logprobs(
+             logprob_tensor=target_logprobs_by_step,
+             sampled_token_ids=accepted_token_ids_by_step,
+         )
+
+        # Get the top-k logprobs (which may or may not include the logprob of
+        # the accepted token).
+        (topk_logprobs_by_step,
+         topk_indices_by_step) = target_logprobs_by_step.topk(
+             k=self.scorer_worker.model_config.max_logprobs,
+             dim=-1,
+         )
+
+        # Get the sequence ids and num_logprobs (sampling parameter) in the
+        # batch.
         seq_ids = get_all_seq_ids(seq_group_metadata_list)
-
-        # shape: [k+1, batch_size]
-        accepted_token_ids_by_step = accepted_token_ids.transpose(0,
-                                                                  1).tolist()
+        num_logprobs_per_seq = get_all_num_logprobs(seq_group_metadata_list)
+
+        # Serialize all tensors to CPU Python lists.
+        accepted_token_ids_by_step = accepted_token_ids_by_step.tolist()
+        accepted_token_id_ranks_by_step = (
+            accepted_token_id_ranks_by_step.tolist())
+        accepted_token_id_logprobs_by_step = (
+            accepted_token_id_logprobs_by_step.tolist())
+        topk_logprobs_by_step = topk_logprobs_by_step.tolist()
+        topk_indices_by_step = topk_indices_by_step.tolist()
+
+        # Construct the output on a per-step, per-sequence basis.
         sampler_output_list = []
-        for token_ids_by_step in accepted_token_ids_by_step:
-            if all(token_id == -1 for token_id in token_ids_by_step):
+        for step_index in range(num_steps):
+            if all(token_id == -1
+                   for token_id in accepted_token_ids_by_step[step_index]):
                 break
 
             step_output_token_ids = []
-            for token_id, seq_id in zip(token_ids_by_step, seq_ids):
+            for sequence_index in range(batch_size):
+                # Each sequence may have a different num_logprobs; retrieve it.
+                num_logprobs = num_logprobs_per_seq[sequence_index]
+
                 step_output_token_ids.append(
-                    SequenceGroupOutput(
-                        samples=[
-                            SequenceOutput(
-                                parent_seq_id=seq_id,
-                                output_token=token_id,
-                                # TODO Add verifier logprobs.
-                                logprobs={token_id: Logprob(0.0)},
-                            )
-                        ],
-                        prompt_logprobs=None,
+                    create_sequence_group_output(
+                        token_id=accepted_token_ids_by_step[step_index]
+                        [sequence_index],
+                        token_id_logprob_rank=accepted_token_id_ranks_by_step[
+                            step_index][sequence_index],
+                        token_id_logprob=accepted_token_id_logprobs_by_step[
+                            step_index][sequence_index],
+                        seq_id=seq_ids[sequence_index],
+                        topk_token_ids=topk_indices_by_step[step_index]
+                        [sequence_index][:num_logprobs],
+                        topk_logprobs=topk_logprobs_by_step[step_index]
+                        [sequence_index][:num_logprobs],
                     ))
+
             sampler_output_list.append(
                 SamplerOutput(outputs=step_output_token_ids))
 
diff --git a/vllm/spec_decode/top1_proposer.py b/vllm/spec_decode/top1_proposer.py
index 6766a2deb8eb8..56c63887b0315 100644
--- a/vllm/spec_decode/top1_proposer.py
+++ b/vllm/spec_decode/top1_proposer.py
@@ -166,7 +166,7 @@ def _merge_outputs(
             return proposal_tokens, proposal_probs, proposal_lens_tensor
 
         sampler_output = maybe_sampler_output
-        proposal_tokens, proposal_probs = sampler_output_to_torch(
+        proposal_tokens, proposal_probs, _ = sampler_output_to_torch(
             sampler_output, sampler_transposed)
 
         # Now, reformat the output GPU tensors such that each sequence has
diff --git a/vllm/spec_decode/util.py b/vllm/spec_decode/util.py
index 894d2fd915948..d6f80c82b80bf 100644
--- a/vllm/spec_decode/util.py
+++ b/vllm/spec_decode/util.py
@@ -1,10 +1,11 @@
 from contextlib import contextmanager
 from itertools import chain
-from typing import List, Tuple
+from typing import Dict, List, Tuple
 
 import torch
 
-from vllm.sequence import SamplerOutput, SequenceGroupMetadata
+from vllm.sequence import (Logprob, SamplerOutput, SequenceGroupMetadata,
+                           SequenceGroupOutput, SequenceOutput)
 
 SeqId = int
 
@@ -21,6 +22,89 @@ def get_all_seq_ids(
         ]))
 
 
+def get_all_num_logprobs(
+        seq_group_metadata_list: List[SequenceGroupMetadata]) -> List[int]:
+    """Given a list of SequenceGroupMetadata, create a list of all num_logprobs.
+
+    If the sampling params do not call for any logprobs, return 0 for that
+    sequence.
+    """
+
+    all_num_logprobs = []
+    for seq_group_metadata in seq_group_metadata_list:
+        num_logprobs = seq_group_metadata.sampling_params.logprobs
+        if seq_group_metadata.sampling_params.logprobs is None:
+            num_logprobs = 0
+        all_num_logprobs.append(num_logprobs)
+
+    return all_num_logprobs
+
+
+def get_sampled_token_logprobs(
+        # shape [num_steps, batch_size, vocab_size]
+        logprob_tensor: torch.Tensor,
+        sampled_token_ids: torch.Tensor,  # shape [num_steps, batch_size]
+) -> Tuple[torch.Tensor, torch.Tensor]:
+    """Get the logprobs for the sampled tokens. Returns the ranks and logprobs.
+    """
+    num_steps, batch_size, vocab_size = logprob_tensor.shape
+
+    selected_logprobs = logprob_tensor[torch.arange(num_steps).unsqueeze(1),
+                                       torch.arange(batch_size),
+                                       sampled_token_ids, ]
+    expanded_selected_logprobs = selected_logprobs.unsqueeze(-1).expand(
+        -1, -1, vocab_size)
+    sampled_token_ids_ranks = (logprob_tensor >=
+                               expanded_selected_logprobs).sum(-1)
+
+    return sampled_token_ids_ranks, selected_logprobs
+
+
+def create_sequence_group_output(
+    token_id: int,
+    token_id_logprob_rank: int,
+    token_id_logprob: float,
+    seq_id: SeqId,
+    topk_token_ids: List[int],
+    topk_logprobs: List[float],
+) -> SequenceGroupOutput:
+    """Create a SequenceGroupOutput given the sampling results.
+
+    Args:
+        token_id (int): The sampled token for the sequence.
+        token_id_logprob_rank (int): The logprob rank of the sampled token.
+        token_id_logprob (float): The logprob value of the sampled token.
+        seq_id (int): The sequence id.
+        topk_token_ids (List[int]): The list of top-k token ids.
+        topk_logprobs (List[float]): The list of top-k logprobs.
+    """
+    # vLLM logprobs always include the sampled token. In addition, the user may
+    # request topk-logprobs (where top-k varies per user up to max_logprobs).
+    logprobs: Dict[int, Logprob] = {
+        token_id: Logprob(
+            logprob=token_id_logprob,
+            rank=token_id_logprob_rank,
+        ),
+    }
+    logprobs.update({
+        topk_token_ids[topk_logprob_index]: Logprob(
+            logprob=topk_logprobs[topk_logprob_index],
+            rank=topk_logprob_index + 1,
+        )
+        for topk_logprob_index, _ in enumerate(topk_token_ids)
+    })
+
+    return SequenceGroupOutput(
+        samples=[
+            SequenceOutput(parent_seq_id=seq_id,
+                           output_token=token_id,
+                           logprobs=logprobs)
+        ],
+        # TODO add prompt logprobs support.
+        prompt_logprobs=None,
+    )
+
+
 def split_batch_by_proposal_len(
     seq_group_metadata_list: List[SequenceGroupMetadata],
     proposal_lens: List[int], select_proposal_len_zero: bool
@@ -49,8 +133,8 @@ def split_batch_by_proposal_len(
 
 
 def sampler_output_to_torch(
-        sampler_output_list: List[SamplerOutput],
-        sampler_transposed: bool) -> Tuple[torch.Tensor, torch.Tensor]:
+    sampler_output_list: List[SamplerOutput], sampler_transposed: bool
+) -> Tuple[torch.Tensor, torch.Tensor, torch.Tensor]:
     """Utility function which converts a list of SamplerOutput to tensors.
 
         sampler_transposed here is used as the indicator for whether
@@ -76,6 +160,15 @@ def sampler_output_to_torch(
     if sampler_transposed:
         sampled_token_probs = sampled_token_probs.transpose(0, 1)
 
+    # shape: [batch_size, num_sampler_output, vocab_size]
+    sampled_token_logprobs = torch.stack(
+        [sampler_output.logprobs for sampler_output in sampler_output_list],
+        dim=0,
+    )
+
+    if sampler_transposed:
+        sampled_token_logprobs = sampled_token_logprobs.transpose(0, 1)
+
     # shape: [batch_size, num_sampler_output]
     sampled_token_ids = torch.stack(
         [
@@ -87,7 +180,7 @@ def sampler_output_to_torch(
     if sampler_transposed:
         sampled_token_ids = sampled_token_ids.transpose(0, 1)
 
-    return sampled_token_ids, sampled_token_probs
+    return sampled_token_ids, sampled_token_probs, sampled_token_logprobs
 
 
 def maybe_mock_device_tensors(sampler_output: SamplerOutput, batch_size: int,

From 344bf7cd2d66a8b13f216f61c7a6d5d70576a498 Mon Sep 17 00:00:00 2001
From: youkaichao <youkaichao@gmail.com>
Date: Fri, 3 May 2024 15:55:56 -0700
Subject: [PATCH 51/65] [Misc] add installation time env vars (#4574)

---
 setup.py     | 33 ++++++++++++++++++--------
 vllm/envs.py | 66 +++++++++++++++++++++++++++++++++++++++++++++-------
 2 files changed, 81 insertions(+), 18 deletions(-)

diff --git a/setup.py b/setup.py
index 801d8d50db1dc..3768daf9d6fab 100644
--- a/setup.py
+++ b/setup.py
@@ -1,3 +1,4 @@
+import importlib.util
 import io
 import logging
 import os
@@ -13,10 +14,23 @@
 from setuptools.command.build_ext import build_ext
 from torch.utils.cpp_extension import CUDA_HOME
 
+
+def load_module_from_path(module_name, path):
+    spec = importlib.util.spec_from_file_location(module_name, path)
+    module = importlib.util.module_from_spec(spec)
+    sys.modules[module_name] = module
+    spec.loader.exec_module(module)
+    return module
+
+
 ROOT_DIR = os.path.dirname(__file__)
 logger = logging.getLogger(__name__)
-# Target device of vLLM, supporting [cuda (by default), rocm, neuron, cpu]
-VLLM_TARGET_DEVICE = os.getenv("VLLM_TARGET_DEVICE", "cuda")
+
+# cannot import envs directly because it depends on vllm,
+#  which is not installed yet
+envs = load_module_from_path('envs', os.path.join(ROOT_DIR, 'vllm', 'envs.py'))
+
+VLLM_TARGET_DEVICE = envs.VLLM_TARGET_DEVICE
 
 # vLLM only supports Linux platform
 assert sys.platform.startswith(
@@ -60,7 +74,7 @@ class cmake_build_ext(build_ext):
     def compute_num_jobs(self):
         # `num_jobs` is either the value of the MAX_JOBS environment variable
         # (if defined) or the number of CPUs available.
-        num_jobs = os.environ.get("MAX_JOBS", None)
+        num_jobs = envs.MAX_JOBS
         if num_jobs is not None:
             num_jobs = int(num_jobs)
             logger.info("Using MAX_JOBS=%d as the number of jobs.", num_jobs)
@@ -78,7 +92,7 @@ def compute_num_jobs(self):
             # environment variable (if defined) or 1.
             # when it is set, we reduce `num_jobs` to avoid
             # overloading the system.
-            nvcc_threads = os.getenv("NVCC_THREADS", None)
+            nvcc_threads = envs.NVCC_THREADS
             if nvcc_threads is not None:
                 nvcc_threads = int(nvcc_threads)
                 logger.info(
@@ -104,7 +118,7 @@ def configure(self, ext: CMakeExtension) -> None:
         # Select the build type.
         # Note: optimization level + debug info are set by the build type
         default_cfg = "Debug" if self.debug else "RelWithDebInfo"
-        cfg = os.getenv("CMAKE_BUILD_TYPE", default_cfg)
+        cfg = envs.CMAKE_BUILD_TYPE or default_cfg
 
         # where .so files will be written, should be the same for all extensions
         # that use the same CMakeLists.txt.
@@ -118,7 +132,7 @@ def configure(self, ext: CMakeExtension) -> None:
             '-DVLLM_TARGET_DEVICE={}'.format(VLLM_TARGET_DEVICE),
         ]
 
-        verbose = bool(int(os.getenv('VERBOSE', '0')))
+        verbose = envs.VERBOSE
         if verbose:
             cmake_args += ['-DCMAKE_VERBOSE_MAKEFILE=ON']
 
@@ -205,8 +219,7 @@ def _is_neuron() -> bool:
         subprocess.run(["neuron-ls"], capture_output=True, check=True)
     except (FileNotFoundError, PermissionError, subprocess.CalledProcessError):
         torch_neuronx_installed = False
-    return torch_neuronx_installed or os.environ.get("VLLM_BUILD_WITH_NEURON",
-                                                     False)
+    return torch_neuronx_installed or envs.VLLM_BUILD_WITH_NEURON
 
 
 def _is_cpu() -> bool:
@@ -214,7 +227,7 @@ def _is_cpu() -> bool:
 
 
 def _install_punica() -> bool:
-    return bool(int(os.getenv("VLLM_INSTALL_PUNICA_KERNELS", "0")))
+    return envs.VLLM_INSTALL_PUNICA_KERNELS
 
 
 def get_hipcc_rocm_version():
@@ -377,7 +390,7 @@ def _read_requirements(filename: str) -> List[str]:
 package_data = {
     "vllm": ["py.typed", "model_executor/layers/fused_moe/configs/*.json"]
 }
-if os.environ.get("VLLM_USE_PRECOMPILED"):
+if envs.VLLM_USE_PRECOMPILED:
     ext_modules = []
     package_data["vllm"].append("*.so")
 
diff --git a/vllm/envs.py b/vllm/envs.py
index 2dbb57e6253a7..91cc8f3be775c 100644
--- a/vllm/envs.py
+++ b/vllm/envs.py
@@ -27,6 +27,14 @@
     VLLM_CPU_KVCACHE_SPACE: int = 0
     VLLM_USE_RAY_COMPILED_DAG: bool = False
     VLLM_WORKER_MULTIPROC_METHOD: str = "spawn"
+    VLLM_TARGET_DEVICE: str = "cuda"
+    MAX_JOBS: Optional[str] = None
+    NVCC_THREADS: Optional[str] = None
+    VLLM_BUILD_WITH_NEURON: bool = False
+    VLLM_USE_PRECOMPILED: bool = False
+    VLLM_INSTALL_PUNICA_KERNELS: bool = False
+    CMAKE_BUILD_TYPE: Optional[str] = None
+    VERBOSE: bool = False
 
 # The begin-* and end* here are used by the documentation generator
 # to extract the used env vars.
@@ -34,6 +42,56 @@
 # begin-env-vars-definition
 
 environment_variables: Dict[str, Callable[[], Any]] = {
+
+    # ================== Installation Time Env Vars ==================
+
+    # Target device of vLLM, supporting [cuda (by default), rocm, neuron, cpu]
+    "VLLM_TARGET_DEVICE":
+    lambda: os.getenv("VLLM_TARGET_DEVICE", "cuda"),
+
+    # Maximum number of compilation jobs to run in parallel.
+    # By default this is the number of CPUs
+    "MAX_JOBS":
+    lambda: os.getenv("MAX_JOBS", None),
+
+    # Number of threads to use for nvcc
+    # By default this is 1.
+    # If set, `MAX_JOBS` will be reduced to avoid oversubscribing the CPU.
+    "NVCC_THREADS":
+    lambda: os.getenv("NVCC_THREADS", None),
+
+    # If set to any non-empty value (even "0"), vllm builds with Neuron support
+    "VLLM_BUILD_WITH_NEURON":
+    lambda: bool(os.environ.get("VLLM_BUILD_WITH_NEURON", False)),
+
+    # If set to any non-empty value, vllm will use precompiled binaries (*.so)
+    "VLLM_USE_PRECOMPILED":
+    lambda: bool(os.environ.get("VLLM_USE_PRECOMPILED")),
+
+    # If set to a non-zero integer, vllm will install Punica kernels
+    "VLLM_INSTALL_PUNICA_KERNELS":
+    lambda: bool(int(os.getenv("VLLM_INSTALL_PUNICA_KERNELS", "0"))),
+
+    # CMake build type
+    # If not set, defaults to "Debug" or "RelWithDebInfo"
+    # Available options: "Debug", "Release", "RelWithDebInfo"
+    "CMAKE_BUILD_TYPE":
+    lambda: os.getenv("CMAKE_BUILD_TYPE"),
+
+    # If set to a non-zero integer, vllm prints verbose logs during installation
+    "VERBOSE":
+    lambda: bool(int(os.getenv('VERBOSE', '0'))),
+
+    # Root directory for VLLM configuration files
+    # Note that this not only affects how vllm finds its configuration files
+    # during runtime, but also affects how vllm installs its configuration
+    # files during **installation**.
+    "VLLM_CONFIG_ROOT":
+    lambda: os.environ.get("VLLM_CONFIG_ROOT", None) or os.getenv(
+        "XDG_CONFIG_HOME", None) or os.path.expanduser("~/.config"),
+
+    # ================== Runtime Env Vars ==================
+
     # used in distributed environment to determine the master address
     'VLLM_HOST_IP':
     lambda: os.getenv('VLLM_HOST_IP', "") or os.getenv("HOST_IP", ""),
@@ -93,14 +151,6 @@
     "S3_ENDPOINT_URL":
     lambda: os.environ.get("S3_ENDPOINT_URL", None),
 
-    # Root directory for VLLM configuration files
-    # Note that this not only affects how vllm finds its configuration files
-    # during runtime, but also affects how vllm installs its configuration
-    # files during **installation**.
-    "VLLM_CONFIG_ROOT":
-    lambda: os.environ.get("VLLM_CONFIG_ROOT", None) or os.getenv(
-        "XDG_CONFIG_HOME", None) or os.path.expanduser("~/.config"),
-
     # Usage stats collection
     "VLLM_USAGE_STATS_SERVER":
     lambda: os.environ.get("VLLM_USAGE_STATS_SERVER", "https://stats.vllm.ai"),

From bc8ad68455ce41ba672764f4a53df5a87d1dbe99 Mon Sep 17 00:00:00 2001
From: Cody Yu <hao.yu.cody@gmail.com>
Date: Fri, 3 May 2024 17:47:07 -0700
Subject: [PATCH 52/65] [Misc][Refactor] Introduce ExecuteModelData (#4540)

---
 tests/spec_decode/test_multi_step_worker.py  | 98 ++++++++++----------
 tests/spec_decode/test_ngram_worker.py       | 64 ++++++-------
 tests/spec_decode/test_spec_decode_worker.py | 95 +++++++++----------
 tests/spec_decode/utils.py                   | 50 +---------
 tests/worker/test_swap.py                    | 30 ++++--
 vllm/core/scheduler.py                       |  4 +
 vllm/engine/async_llm_engine.py              | 16 ++--
 vllm/engine/llm_engine.py                    | 12 ++-
 vllm/executor/cpu_executor.py                | 37 ++------
 vllm/executor/executor_base.py               | 22 ++---
 vllm/executor/gpu_executor.py                | 33 ++-----
 vllm/executor/neuron_executor.py             | 33 +++----
 vllm/executor/ray_gpu_executor.py            | 19 +---
 vllm/sequence.py                             | 32 ++++++-
 vllm/spec_decode/batch_expansion.py          | 30 ++----
 vllm/spec_decode/interfaces.py               | 15 +--
 vllm/spec_decode/multi_step_worker.py        | 54 +++++------
 vllm/spec_decode/ngram_worker.py             | 62 +++++--------
 vllm/spec_decode/spec_decode_worker.py       | 90 +++++-------------
 vllm/spec_decode/top1_proposer.py            | 22 ++---
 vllm/worker/cpu_worker.py                    | 25 ++---
 vllm/worker/worker.py                        | 23 +++--
 vllm/worker/worker_base.py                   |  8 +-
 23 files changed, 359 insertions(+), 515 deletions(-)

diff --git a/tests/spec_decode/test_multi_step_worker.py b/tests/spec_decode/test_multi_step_worker.py
index a33fd71459455..cb2de97a4af94 100644
--- a/tests/spec_decode/test_multi_step_worker.py
+++ b/tests/spec_decode/test_multi_step_worker.py
@@ -5,13 +5,12 @@
 import torch
 
 from vllm.model_executor.utils import set_random_seed
-from vllm.sequence import SamplerOutput
+from vllm.sequence import ExecuteModelRequest, SamplerOutput
 from vllm.spec_decode.multi_step_worker import MultiStepWorker
 from vllm.spec_decode.top1_proposer import Top1Proposer
 from vllm.worker.worker import Worker
 
 from .utils import (assert_logprobs_dict_allclose, create_batch,
-                    create_execute_model_data,
                     create_seq_group_metadata_from_prompts, create_worker,
                     patch_execute_model_with_seeds, zero_kv_cache)
 
@@ -105,31 +104,32 @@ def test_same_output_for_single_step():
 
     final_prompt_lens = [len(prompt) + num_steps for prompt in prompts]
 
-    multi_step_execute_model_data = create_execute_model_data(
-        seq_group_metadata_list=create_seq_group_metadata_from_prompts(
-            prompts,
-            num_gpu_blocks,
-            block_size,
-            final_prompt_lens=final_prompt_lens))
-
-    single_step_execute_model_data = create_execute_model_data(
-        seq_group_metadata_list=create_seq_group_metadata_from_prompts(
-            prompts,
-            num_gpu_blocks,
-            block_size,
-            final_prompt_lens=final_prompt_lens))
+    multi_step_seq_group = create_seq_group_metadata_from_prompts(
+        prompts,
+        num_gpu_blocks,
+        block_size,
+        final_prompt_lens=final_prompt_lens)
 
     zero_kv_cache(multi_step_worker.cache_engine)
     set_random_seed(seed)
     actual_output, _ = multi_step_worker.sampler_output(
-        **multi_step_execute_model_data.to_dict(), sample_len=num_steps)
+        execute_model_req=ExecuteModelRequest(
+            seq_group_metadata_list=multi_step_seq_group),
+        sample_len=num_steps)
     assert len(actual_output) == num_steps
     actual_output = actual_output[0]
 
+    single_step_seq_group = create_seq_group_metadata_from_prompts(
+        prompts,
+        num_gpu_blocks,
+        block_size,
+        final_prompt_lens=final_prompt_lens)
+
     zero_kv_cache(worker.cache_engine)
     set_random_seed(seed)
     expected_output = worker.execute_model(
-        **single_step_execute_model_data.to_dict(), )[0]
+        execute_model_req=ExecuteModelRequest(
+            seq_group_metadata_list=single_step_seq_group))[0]
 
     actual_token_ids = [
         output.samples[0].output_token for output in actual_output
@@ -193,19 +193,20 @@ def test_same_output_for_multi_step():
     worker.execute_model = patch_execute_model_with_seeds(worker, rand_seeds)
 
     continuations = [[1] for _ in prompts]
-    execute_model_data = create_execute_model_data(
-        create_seq_group_metadata_from_prompts(
-            prompts,
-            num_gpu_blocks,
-            block_size,
-            continuations=continuations,
-            final_prompt_lens=final_prompt_lens), )
+    seq_group_metadata_list = create_seq_group_metadata_from_prompts(
+        prompts,
+        num_gpu_blocks,
+        block_size,
+        continuations=continuations,
+        final_prompt_lens=final_prompt_lens)
 
     # Run multi-step.
     zero_kv_cache(multi_step_worker.cache_engine)
     set_random_seed(seed)
     multi_step_output, _ = multi_step_worker.sampler_output(
-        **execute_model_data.to_dict(), sample_len=num_steps)
+        execute_model_req=ExecuteModelRequest(
+            seq_group_metadata_list=seq_group_metadata_list),
+        sample_len=num_steps)
 
     # Run single-step repeatedly.
     zero_kv_cache(worker.cache_engine)
@@ -215,16 +216,16 @@ def test_same_output_for_multi_step():
 
     for _ in multi_step_output:
 
-        execute_model_data = create_execute_model_data(
-            create_seq_group_metadata_from_prompts(
-                prompts,
-                num_gpu_blocks,
-                block_size,
-                continuations=continuations,
-                final_prompt_lens=final_prompt_lens))
+        seq_group_metadata_list = create_seq_group_metadata_from_prompts(
+            prompts,
+            num_gpu_blocks,
+            block_size,
+            continuations=continuations,
+            final_prompt_lens=final_prompt_lens)
 
         single_step_output.extend(
-            worker.execute_model(**execute_model_data.to_dict(), ))
+            worker.execute_model(execute_model_req=ExecuteModelRequest(
+                seq_group_metadata_list=seq_group_metadata_list)))
 
         # Append output tokens to new sequence data.
         for i, seq_group_output in enumerate(single_step_output[-1]):
@@ -304,12 +305,11 @@ def test_draft_proposals_full_speculation_len():
         ) for _ in range(k)
     ], True
 
-    execute_model_data, _, _ = create_batch(batch_size, k)
+    seq_group_metadata_list, _, _ = create_batch(batch_size, k)
 
-    proposals = proposer.get_proposals(
-        **execute_model_data.to_dict(),
-        proposal_len=k,
-    )
+    proposals = proposer.get_proposals(execute_model_req=ExecuteModelRequest(
+        seq_group_metadata_list=seq_group_metadata_list,
+        num_lookahead_slots=k), )
 
     assert torch.is_tensor(proposals.proposal_token_ids)
     assert torch.is_tensor(proposals.proposal_probs)
@@ -340,14 +340,13 @@ def test_draft_proposals_no_speculations():
         max_proposal_len=prompt_len + k - 1,
     )
 
-    execute_model_data, _, _ = create_batch(batch_size,
-                                            k,
-                                            prompt_len=prompt_len)
+    seq_group_metadata_list, _, _ = create_batch(batch_size,
+                                                 k,
+                                                 prompt_len=prompt_len)
 
-    proposals = proposer.get_proposals(
-        **execute_model_data.to_dict(),
-        proposal_len=k,
-    )
+    proposals = proposer.get_proposals(execute_model_req=ExecuteModelRequest(
+        seq_group_metadata_list=seq_group_metadata_list,
+        num_lookahead_slots=k), )
 
     assert torch.is_tensor(proposals.proposal_token_ids)
     assert torch.is_tensor(proposals.proposal_probs)
@@ -409,17 +408,16 @@ def test_draft_proposals_mixed_k():
         ) for _ in range(k)
     ], True
 
-    execute_model_data, _, _ = create_batch(
+    seq_group_metadata_list, _, _ = create_batch(
         batch_size,
         k,
         prompt_len=prompt_len,
         prev_output_token_len=prev_output_token_len,
     )
 
-    proposals = proposer.get_proposals(
-        **execute_model_data.to_dict(),
-        proposal_len=k,
-    )
+    proposals = proposer.get_proposals(execute_model_req=ExecuteModelRequest(
+        seq_group_metadata_list=seq_group_metadata_list,
+        num_lookahead_slots=k), )
 
     assert torch.is_tensor(proposals.proposal_token_ids)
     assert torch.is_tensor(proposals.proposal_probs)
diff --git a/tests/spec_decode/test_ngram_worker.py b/tests/spec_decode/test_ngram_worker.py
index e7e2e87f599dd..de305c4030aa9 100644
--- a/tests/spec_decode/test_ngram_worker.py
+++ b/tests/spec_decode/test_ngram_worker.py
@@ -1,10 +1,10 @@
 import torch
 
+from vllm.sequence import ExecuteModelRequest
 from vllm.spec_decode.ngram_worker import NGramWorker
 from vllm.spec_decode.top1_proposer import Top1Proposer
 
-from .utils import (create_execute_model_data,
-                    create_seq_group_metadata_from_prompts, create_worker)
+from .utils import create_seq_group_metadata_from_prompts, create_worker
 
 
 def test_ngram_algo_correctness_for_single_no_match():
@@ -44,17 +44,15 @@ def test_ngram_algo_correctness_for_single_no_match():
 
     proposal_len = 5
     final_prompt_lens = [len(prompt) + proposal_len for prompt in prompts]
-    ngram_sampler_output_data = create_execute_model_data(
-        seq_group_metadata_list=create_seq_group_metadata_from_prompts(
-            prompts,
-            num_gpu_blocks,
-            block_size,
-            final_prompt_lens=final_prompt_lens))
-
-    proposals = proposer.get_proposals(
-        **ngram_sampler_output_data.to_dict(),
-        proposal_len=proposal_len,
-    )
+    seq_group_metadata_list = create_seq_group_metadata_from_prompts(
+        prompts,
+        num_gpu_blocks,
+        block_size,
+        final_prompt_lens=final_prompt_lens)
+
+    proposals = proposer.get_proposals(execute_model_req=ExecuteModelRequest(
+        seq_group_metadata_list=seq_group_metadata_list,
+        num_lookahead_slots=proposal_len), )
 
     assert torch.is_tensor(proposals.proposal_token_ids)
     assert torch.is_tensor(proposals.proposal_probs)
@@ -113,17 +111,15 @@ def test_ngram_algo_correctness_for_batches_not_match_all():
 
     proposal_len = 5
     final_prompt_lens = [len(prompt) + proposal_len for prompt in prompts]
-    ngram_sampler_output_data = create_execute_model_data(
-        seq_group_metadata_list=create_seq_group_metadata_from_prompts(
-            prompts,
-            num_gpu_blocks,
-            block_size,
-            final_prompt_lens=final_prompt_lens))
-
-    proposals = proposer.get_proposals(
-        **ngram_sampler_output_data.to_dict(),
-        proposal_len=proposal_len,
-    )
+    seq_group_metadata_list = create_seq_group_metadata_from_prompts(
+        prompts,
+        num_gpu_blocks,
+        block_size,
+        final_prompt_lens=final_prompt_lens)
+
+    proposals = proposer.get_proposals(execute_model_req=ExecuteModelRequest(
+        seq_group_metadata_list=seq_group_metadata_list,
+        num_lookahead_slots=proposal_len), )
 
     assert torch.is_tensor(proposals.proposal_token_ids)
     assert torch.is_tensor(proposals.proposal_probs)
@@ -185,17 +181,15 @@ def test_ngram_algo_correctness_for_batches_match_all():
 
     proposal_len = 5
     final_prompt_lens = [len(prompt) + proposal_len for prompt in prompts]
-    ngram_sampler_output_data = create_execute_model_data(
-        seq_group_metadata_list=create_seq_group_metadata_from_prompts(
-            prompts,
-            num_gpu_blocks,
-            block_size,
-            final_prompt_lens=final_prompt_lens))
-
-    proposals = proposer.get_proposals(
-        **ngram_sampler_output_data.to_dict(),
-        proposal_len=proposal_len,
-    )
+    seq_group_metadata_list = create_seq_group_metadata_from_prompts(
+        prompts,
+        num_gpu_blocks,
+        block_size,
+        final_prompt_lens=final_prompt_lens)
+
+    proposals = proposer.get_proposals(execute_model_req=ExecuteModelRequest(
+        seq_group_metadata_list=seq_group_metadata_list,
+        num_lookahead_slots=proposal_len), )
 
     assert torch.is_tensor(proposals.proposal_token_ids)
     assert torch.is_tensor(proposals.proposal_probs)
diff --git a/tests/spec_decode/test_spec_decode_worker.py b/tests/spec_decode/test_spec_decode_worker.py
index 6763583aa85cc..ef9d32f73d668 100644
--- a/tests/spec_decode/test_spec_decode_worker.py
+++ b/tests/spec_decode/test_spec_decode_worker.py
@@ -7,7 +7,7 @@
 
 from vllm.model_executor.layers.rejection_sampler import RejectionSampler
 from vllm.model_executor.utils import set_random_seed
-from vllm.sequence import SamplerOutput
+from vllm.sequence import ExecuteModelRequest, SamplerOutput
 from vllm.spec_decode.interfaces import SpeculativeProposals
 from vllm.spec_decode.metrics import (AsyncMetricsCollector,
                                       SpecDecodeWorkerMetrics)
@@ -15,8 +15,7 @@
 from vllm.spec_decode.spec_decode_worker import (SpecDecodeWorker,
                                                  split_num_cache_blocks_evenly)
 
-from .utils import (ExecuteModelData, create_batch, create_sampler_output_list,
-                    mock_worker)
+from .utils import create_batch, create_sampler_output_list, mock_worker
 
 
 @pytest.mark.parametrize('k', [1, 2, 6])
@@ -36,24 +35,19 @@ def test_correctly_calls_draft_model(k: int, batch_size: int):
     exception_secret = 'artificial stop'
     draft_worker.get_spec_proposals.side_effect = ValueError(exception_secret)
 
-    execute_model_data, _, _ = create_batch(batch_size, k)
+    seq_group_metadata_list, _, _ = create_batch(batch_size, k)
+    execute_model_req = ExecuteModelRequest(
+        seq_group_metadata_list=seq_group_metadata_list, num_lookahead_slots=k)
 
     with pytest.raises(ValueError, match=exception_secret):
-        worker.execute_model(**execute_model_data.to_dict(),
-                             num_lookahead_slots=k)
+        worker.execute_model(execute_model_req=execute_model_req)
 
     call_args_list = draft_worker.get_spec_proposals.call_args_list
     assert len(call_args_list) == 1
 
     for args, _ in call_args_list:
-        (seq_group_metadata_list, blocks_to_swap_in, blocks_to_swap_out,
-         blocks_to_copy, actual_k) = args
-        actual_execute_model_data = ExecuteModelData(seq_group_metadata_list,
-                                                     blocks_to_swap_in,
-                                                     blocks_to_swap_out,
-                                                     blocks_to_copy)
-        assert actual_execute_model_data == execute_model_data
-        assert actual_k == k
+        actual_execute_model_data = args[0]
+        assert actual_execute_model_data == execute_model_req
 
 
 @pytest.mark.parametrize('k', [1, 2, 6])
@@ -93,7 +87,7 @@ def test_correctly_calls_target_model(k: int, batch_size: int):
     proposal_lens = torch.ones(batch_size, dtype=torch.int64,
                                device='cuda') * k
 
-    execute_model_data, prompts, prev_output_tokens = create_batch(
+    seq_group_metadata_list, prompts, prev_output_tokens = create_batch(
         batch_size, k)
 
     draft_worker.get_spec_proposals.return_value = SpeculativeProposals(
@@ -105,20 +99,20 @@ def test_correctly_calls_target_model(k: int, batch_size: int):
     target_worker.execute_model.side_effect = ValueError(exception_secret)
 
     with pytest.raises(ValueError, match=exception_secret):
-        worker.execute_model(**execute_model_data.to_dict(),
-                             num_lookahead_slots=k)
+        worker.execute_model(execute_model_req=ExecuteModelRequest(
+            seq_group_metadata_list=seq_group_metadata_list,
+            num_lookahead_slots=k))
 
     seen_contexts = []
 
     call_args_list = target_worker.execute_model.call_args_list
     assert len(call_args_list) == 1
-    for args, kwargs in call_args_list:
-        target_execute_model_data = ExecuteModelData.from_dict(kwargs)
+    for _, kwargs in call_args_list:
+        seq_group_metadata_list = kwargs[
+            "execute_model_req"].seq_group_metadata_list
 
-        assert len(target_execute_model_data.seq_group_metadata_list) == (
-            k + 1) * batch_size
-        for seq_group_metadata in (
-                target_execute_model_data.seq_group_metadata_list):
+        assert len(seq_group_metadata_list) == (k + 1) * batch_size
+        for seq_group_metadata in seq_group_metadata_list:
             for seq_data in seq_group_metadata.seq_data.values():
                 seen_contexts.append(seq_data.get_token_ids())
 
@@ -175,7 +169,7 @@ def test_correctly_calls_rejection_sampler(k: int, batch_size: int):
     proposal_lens = torch.ones(batch_size, dtype=torch.int64,
                                device='cuda') * k
 
-    execute_model_data, _, _ = create_batch(batch_size, k)
+    seq_group_metadata_list, _, _ = create_batch(batch_size, k)
 
     draft_worker.get_spec_proposals.return_value = SpeculativeProposals(
         proposal_token_ids=proposal_token_ids,
@@ -207,8 +201,9 @@ def test_correctly_calls_rejection_sampler(k: int, batch_size: int):
     rejection_sampler.side_effect = ValueError(exception_secret)
 
     with pytest.raises(ValueError, match=exception_secret):
-        worker.execute_model(**execute_model_data.to_dict(),
-                             num_lookahead_slots=k)
+        worker.execute_model(execute_model_req=ExecuteModelRequest(
+            seq_group_metadata_list=seq_group_metadata_list,
+            num_lookahead_slots=k))
 
     assert len(rejection_sampler.call_args_list) == 1
     _, kwargs = rejection_sampler.call_args_list[0]
@@ -262,7 +257,7 @@ def test_correctly_formats_output(k: int, batch_size: int):
     proposal_lens = torch.ones(batch_size, dtype=torch.int64,
                                device='cuda') * k
 
-    execute_model_data, _, _ = create_batch(batch_size, k)
+    seq_group_metadata_list, _, _ = create_batch(batch_size, k)
 
     draft_worker.get_spec_proposals.return_value = SpeculativeProposals(
         proposal_token_ids=proposal_token_ids,
@@ -302,8 +297,9 @@ def test_correctly_formats_output(k: int, batch_size: int):
 
     rejection_sampler.return_value = rejection_sampler_output
 
-    output = worker.execute_model(**execute_model_data.to_dict(),
-                                  num_lookahead_slots=k)
+    output = worker.execute_model(execute_model_req=ExecuteModelRequest(
+        seq_group_metadata_list=seq_group_metadata_list,
+        num_lookahead_slots=k))
 
     expected_output = create_sampler_output_list(
         token_ids=rejection_sampler_output.transpose(0, 1),
@@ -312,7 +308,7 @@ def test_correctly_formats_output(k: int, batch_size: int):
 
     seq_ids = [
         next(iter(seq_group_metadata.seq_data.keys()))
-        for seq_group_metadata in execute_model_data.seq_group_metadata_list
+        for seq_group_metadata in seq_group_metadata_list
     ]
     actual_output_by_seq = {seq_id: [] for seq_id in seq_ids}
     expected_output_by_seq = {seq_id: [] for seq_id in seq_ids}
@@ -383,7 +379,7 @@ def test_collects_metrics(k: int, batch_size: int, returns_metrics: bool):
     proposal_lens = torch.ones(batch_size, dtype=torch.int64,
                                device='cuda') * k
 
-    execute_model_data, _, _ = create_batch(batch_size, k)
+    seq_group_metadata_list, _, _ = create_batch(batch_size, k)
 
     draft_worker.get_spec_proposals.return_value = SpeculativeProposals(
         proposal_token_ids=proposal_token_ids,
@@ -428,8 +424,9 @@ def test_collects_metrics(k: int, batch_size: int, returns_metrics: bool):
     metrics_collector.maybe_collect_rejsample_metrics.return_value = (
         mock_rejsample_metrics)
 
-    output = worker.execute_model(**execute_model_data.to_dict(),
-                                  num_lookahead_slots=k)
+    output = worker.execute_model(execute_model_req=ExecuteModelRequest(
+        seq_group_metadata_list=seq_group_metadata_list,
+        num_lookahead_slots=k))
     assert output[0].spec_decode_worker_metrics == mock_rejsample_metrics
 
     call_args_list = (
@@ -462,21 +459,21 @@ def test_k_equals_zero(k: int, batch_size: int):
     worker = SpecDecodeWorker(draft_worker, target_worker, rejection_sampler,
                               metrics_collector)
 
-    execute_model_data, prompts, prev_output_tokens = create_batch(
-        batch_size, k, prev_output_token_len=0)
+    seq_group_metadata_list, _, _ = create_batch(batch_size,
+                                                 k,
+                                                 prev_output_token_len=0)
+    execute_model_req = ExecuteModelRequest(
+        seq_group_metadata_list=seq_group_metadata_list, num_lookahead_slots=k)
 
-    out = worker.execute_model(**execute_model_data.to_dict(),
-                               num_lookahead_slots=k)
+    out = worker.execute_model(execute_model_req=execute_model_req)
 
     assert len(out) == 1, f"expected only one token output when {k=}"
     assert out[0].probs is None, "expect gpu tensor references to be None"
     assert out[
         0].sampled_tokens is None, "expect gpu tensor references to be None"
 
-    draft_worker.execute_model.assert_called_once_with(
-        **execute_model_data.to_dict())
-    target_worker.execute_model.assert_called_once_with(
-        **execute_model_data.to_dict())
+    draft_worker.execute_model.assert_called_once_with(execute_model_req)
+    target_worker.execute_model.assert_called_once_with(execute_model_req)
 
 
 @pytest.mark.parametrize('k', [0, 5])
@@ -503,21 +500,21 @@ def test_empty_input_batch(k: int, batch_size: int):
     worker = SpecDecodeWorker(draft_worker, target_worker, rejection_sampler,
                               metrics_collector)
 
-    execute_model_data, prompts, prev_output_tokens = create_batch(
-        batch_size, k, prev_output_token_len=0)
+    seq_group_metadata_list, _, _ = create_batch(batch_size,
+                                                 k,
+                                                 prev_output_token_len=0)
+    execute_model_req = ExecuteModelRequest(
+        seq_group_metadata_list=seq_group_metadata_list, num_lookahead_slots=k)
 
-    out = worker.execute_model(**execute_model_data.to_dict(),
-                               num_lookahead_slots=k)
+    out = worker.execute_model(execute_model_req=execute_model_req)
 
     assert len(out) == 1, f"expected only one token output when {k=}"
     assert out[0].probs is None, "expect gpu tensor references to be None"
     assert out[
         0].sampled_tokens is None, "expect gpu tensor references to be None"
 
-    draft_worker.execute_model.assert_called_once_with(
-        **execute_model_data.to_dict())
-    target_worker.execute_model.assert_called_once_with(
-        **execute_model_data.to_dict())
+    draft_worker.execute_model.assert_called_once_with(execute_model_req)
+    target_worker.execute_model.assert_called_once_with(execute_model_req)
 
 
 @pytest.mark.skip_global_cleanup
diff --git a/tests/spec_decode/utils.py b/tests/spec_decode/utils.py
index f0f0d09106a00..f288652d51556 100644
--- a/tests/spec_decode/utils.py
+++ b/tests/spec_decode/utils.py
@@ -1,4 +1,3 @@
-from dataclasses import dataclass, fields
 from itertools import count
 from typing import Dict, Iterable, List, Optional, Union
 from unittest.mock import MagicMock
@@ -16,50 +15,10 @@
 from vllm.worker.worker import Worker
 
 
-@dataclass
-class ExecuteModelData:
-    """Helper data structure which facilitates cleaner tests.
-    """
-    seq_group_metadata_list: List[SequenceGroupMetadata]
-    blocks_to_swap_in: Dict[int, int]
-    blocks_to_swap_out: Dict[int, int]
-    blocks_to_copy: Dict[int, List[int]]
-
-    def to_dict(self):
-        return dict(
-            (field.name, getattr(self, field.name)) for field in fields(self))
-
-    @classmethod
-    def from_dict(cls, d):
-        cleaned = dict((field.name, d[field.name]) for field in fields(cls))
-        return cls(**cleaned)
-
-
 def round_up_to_next_block(seq_len: int, block_size: int) -> int:
     return (seq_len + block_size - 1) // block_size
 
 
-def create_execute_model_data(
-    seq_group_metadata_list: List[SequenceGroupMetadata],
-    blocks_to_swap_in: Optional[Dict[int, int]] = None,
-    blocks_to_swap_out: Optional[Dict[int, int]] = None,
-    blocks_to_copy: Optional[Dict[int, int]] = None,
-) -> ExecuteModelData:
-    if blocks_to_swap_in is None:
-        blocks_to_swap_in = {}
-    if blocks_to_swap_out is None:
-        blocks_to_swap_out = {}
-    if blocks_to_copy is None:
-        blocks_to_copy = {}
-
-    return ExecuteModelData(
-        seq_group_metadata_list=seq_group_metadata_list,
-        blocks_to_swap_in=blocks_to_swap_in,
-        blocks_to_swap_out=blocks_to_swap_out,
-        blocks_to_copy=blocks_to_copy,
-    )
-
-
 def mock_worker(cls=None,
                 vocab_size: int = 30_000,
                 max_model_len: int = 2048,
@@ -258,8 +217,7 @@ def create_batch(batch_size,
         for prompt, prev_output_token in zip(prompts, prev_output_tokens)
     ]
 
-    execute_model_data = create_execute_model_data(
-        create_seq_group_metadata_from_prompts(prompts, num_gpu_blocks,
-                                               block_size, final_prompt_lens,
-                                               prev_output_tokens, seq_ids), )
-    return execute_model_data, prompts, prev_output_tokens
+    seq_group_metadata_list = create_seq_group_metadata_from_prompts(
+        prompts, num_gpu_blocks, block_size, final_prompt_lens,
+        prev_output_tokens, seq_ids)
+    return seq_group_metadata_list, prompts, prev_output_tokens
diff --git a/tests/worker/test_swap.py b/tests/worker/test_swap.py
index 1804cf78d8003..07bcd343a96a6 100644
--- a/tests/worker/test_swap.py
+++ b/tests/worker/test_swap.py
@@ -1,6 +1,7 @@
 import torch
 
 from vllm.engine.arg_utils import EngineArgs
+from vllm.sequence import ExecuteModelRequest
 from vllm.utils import get_distributed_init_method, get_ip, get_open_port
 from vllm.worker.worker import Worker
 
@@ -54,10 +55,14 @@ def test_swap() -> None:
 
     # Test swap out.
     blocks_to_swap_out = {3: 72, 56: 35, 84: 34}
-    worker.execute_model(seq_group_metadata_list=[],
-                         blocks_to_swap_in={},
-                         blocks_to_swap_out=blocks_to_swap_out,
-                         blocks_to_copy={})
+    execute_model_req = ExecuteModelRequest(
+        seq_group_metadata_list=[],
+        blocks_to_swap_in={},
+        blocks_to_swap_out=blocks_to_swap_out,
+        blocks_to_copy={},
+    )
+    worker.execute_model(execute_model_req=execute_model_req)
+
     for i in range(num_layers):
         gpu_key_cache, gpu_value_cache = gpu_cache[i]
         cpu_key_cache, cpu_value_cache = cpu_cache[i]
@@ -66,14 +71,19 @@ def test_swap() -> None:
             assert allclose(gpu_value_cache[src], cpu_value_cache[dst])
 
     # Test swap in.
-    blocks_to_swap_in = {19: 45, 67: 23, 12: 78, 40: 99, 1: 71}
-    worker.execute_model(seq_group_metadata_list=[],
-                         blocks_to_swap_in=blocks_to_swap_in,
-                         blocks_to_swap_out={},
-                         blocks_to_copy={})
+    execute_model_req.blocks_to_swap_out = {}
+    execute_model_req.blocks_to_swap_in = {
+        19: 45,
+        67: 23,
+        12: 78,
+        40: 99,
+        1: 71
+    }
+    worker.execute_model(execute_model_req=execute_model_req)
+
     for i in range(num_layers):
         gpu_key_cache, gpu_value_cache = gpu_cache[i]
         cpu_key_cache, cpu_value_cache = cpu_cache[i]
-        for src, dst in blocks_to_swap_in.items():
+        for src, dst in execute_model_req.blocks_to_swap_in.items():
             assert allclose(gpu_key_cache[dst], cpu_key_cache[src])
             assert allclose(gpu_value_cache[dst], cpu_value_cache[src])
diff --git a/vllm/core/scheduler.py b/vllm/core/scheduler.py
index 7c55b08d4857d..a9e0b05b8db67 100644
--- a/vllm/core/scheduler.py
+++ b/vllm/core/scheduler.py
@@ -128,6 +128,8 @@ class SchedulerOutputs:
     ignored_seq_groups: List[SequenceGroup]
     # The number of slots for lookahead decoding.
     num_lookahead_slots: int
+    # The number of requests in the running queue
+    running_queue_size: int
 
     def __post_init__(self):
         # Swap in and swap out should never happen at the same time.
@@ -797,6 +799,7 @@ def _schedule_default(self) -> SchedulerOutputs:
             ignored_seq_groups=prefills.ignored_seq_groups +
             swapped_in.infeasible_seq_groups,
             num_lookahead_slots=running_scheduled.num_lookahead_slots,
+            running_queue_size=len(self.running),
         )
 
     def _schedule_chunked_prefill(self):
@@ -883,6 +886,7 @@ def _schedule_chunked_prefill(self):
                                        swapped_in.blocks_to_copy),
             ignored_seq_groups=prefills.ignored_seq_groups,
             num_lookahead_slots=running_scheduled.num_lookahead_slots,
+            running_queue_size=len(self.running),
         )
 
     def _schedule(self) -> SchedulerOutputs:
diff --git a/vllm/engine/async_llm_engine.py b/vllm/engine/async_llm_engine.py
index cf5053bba1d48..9f72a0d11974f 100644
--- a/vllm/engine/async_llm_engine.py
+++ b/vllm/engine/async_llm_engine.py
@@ -16,7 +16,7 @@
 from vllm.lora.request import LoRARequest
 from vllm.outputs import RequestOutput
 from vllm.sampling_params import SamplingParams
-from vllm.sequence import MultiModalData, SamplerOutput
+from vllm.sequence import ExecuteModelRequest, MultiModalData, SamplerOutput
 from vllm.usage.usage_lib import UsageContext
 
 logger = init_logger(__name__)
@@ -210,12 +210,16 @@ async def step_async(self) -> List[RequestOutput]:
 
         if not scheduler_outputs.is_empty():
             # Execute the model.
+            execute_model_req = ExecuteModelRequest(
+                seq_group_metadata_list=seq_group_metadata_list,
+                blocks_to_swap_in=scheduler_outputs.blocks_to_swap_in,
+                blocks_to_swap_out=scheduler_outputs.blocks_to_swap_out,
+                blocks_to_copy=scheduler_outputs.blocks_to_copy,
+                num_lookahead_slots=scheduler_outputs.num_lookahead_slots,
+                running_queue_size=scheduler_outputs.running_queue_size,
+            )
             output = await self.model_executor.execute_model_async(
-                seq_group_metadata_list,
-                scheduler_outputs.blocks_to_swap_in,
-                scheduler_outputs.blocks_to_swap_out,
-                scheduler_outputs.blocks_to_copy,
-                num_lookahead_slots=scheduler_outputs.num_lookahead_slots)
+                execute_model_req)
         else:
             output = []
 
diff --git a/vllm/engine/llm_engine.py b/vllm/engine/llm_engine.py
index 94a5b397a4d43..342f2c796d6fb 100644
--- a/vllm/engine/llm_engine.py
+++ b/vllm/engine/llm_engine.py
@@ -22,8 +22,8 @@
 from vllm.lora.request import LoRARequest
 from vllm.outputs import RequestOutput
 from vllm.sampling_params import SamplingParams
-from vllm.sequence import (MultiModalData, SamplerOutput, Sequence,
-                           SequenceGroup, SequenceGroupMetadata,
+from vllm.sequence import (ExecuteModelRequest, MultiModalData, SamplerOutput,
+                           Sequence, SequenceGroup, SequenceGroupMetadata,
                            SequenceStatus)
 from vllm.transformers_utils.detokenizer import Detokenizer
 from vllm.transformers_utils.tokenizer_group import (BaseTokenizerGroup,
@@ -583,12 +583,16 @@ def step(self) -> List[RequestOutput]:
         seq_group_metadata_list, scheduler_outputs = self.scheduler.schedule()
 
         if not scheduler_outputs.is_empty():
-            output = self.model_executor.execute_model(
+            execute_model_req = ExecuteModelRequest(
                 seq_group_metadata_list=seq_group_metadata_list,
                 blocks_to_swap_in=scheduler_outputs.blocks_to_swap_in,
                 blocks_to_swap_out=scheduler_outputs.blocks_to_swap_out,
                 blocks_to_copy=scheduler_outputs.blocks_to_copy,
-                num_lookahead_slots=scheduler_outputs.num_lookahead_slots)
+                num_lookahead_slots=scheduler_outputs.num_lookahead_slots,
+                running_queue_size=scheduler_outputs.running_queue_size,
+            )
+            output = self.model_executor.execute_model(
+                execute_model_req=execute_model_req)
         else:
             output = []
 
diff --git a/vllm/executor/cpu_executor.py b/vllm/executor/cpu_executor.py
index 733eef828adc4..a2212459f034e 100644
--- a/vllm/executor/cpu_executor.py
+++ b/vllm/executor/cpu_executor.py
@@ -1,4 +1,4 @@
-from typing import Dict, List, Set, Tuple
+from typing import List, Set, Tuple
 
 import torch
 
@@ -7,7 +7,7 @@
 from vllm.executor.executor_base import ExecutorAsyncBase, ExecutorBase
 from vllm.logger import init_logger
 from vllm.lora.request import LoRARequest
-from vllm.sequence import SamplerOutput, SequenceGroupMetadata
+from vllm.sequence import ExecuteModelRequest, SamplerOutput
 from vllm.utils import (get_distributed_init_method, get_ip, get_open_port,
                         make_async)
 
@@ -72,18 +72,10 @@ def initialize_cache(self, num_gpu_blocks: int,
         logger.info("# CPU blocks: %d", num_gpu_blocks)
         self.driver_worker.initialize_cache(num_gpu_blocks, num_cpu_blocks)
 
-    def execute_model(self,
-                      seq_group_metadata_list: List[SequenceGroupMetadata],
-                      blocks_to_swap_in: Dict[int, int],
-                      blocks_to_swap_out: Dict[int, int],
-                      blocks_to_copy: Dict[int, List[int]],
-                      num_lookahead_slots: int) -> List[SamplerOutput]:
-        output = self.driver_worker.execute_model(
-            seq_group_metadata_list=seq_group_metadata_list,
-            blocks_to_swap_in=blocks_to_swap_in,
-            blocks_to_swap_out=blocks_to_swap_out,
-            blocks_to_copy=blocks_to_copy,
-        )
+    def execute_model(
+            self,
+            execute_model_req: ExecuteModelRequest) -> List[SamplerOutput]:
+        output = self.driver_worker.execute_model(execute_model_req)
         return output
 
     def add_lora(self, lora_request: LoRARequest) -> bool:
@@ -104,19 +96,10 @@ def check_health(self) -> None:
 class CPUExecutorAsync(CPUExecutor, ExecutorAsyncBase):
 
     async def execute_model_async(
-        self,
-        seq_group_metadata_list: List[SequenceGroupMetadata],
-        blocks_to_swap_in: Dict[int, int],
-        blocks_to_swap_out: Dict[int, int],
-        blocks_to_copy: Dict[int, List[int]],
-        num_lookahead_slots: int,
-    ) -> List[SamplerOutput]:
-        output = await make_async(self.driver_worker.execute_model)(
-            seq_group_metadata_list=seq_group_metadata_list,
-            blocks_to_swap_in=blocks_to_swap_in,
-            blocks_to_swap_out=blocks_to_swap_out,
-            blocks_to_copy=blocks_to_copy,
-            num_lookahead_slots=num_lookahead_slots)
+            self,
+            execute_model_req: ExecuteModelRequest) -> List[SamplerOutput]:
+        output = await make_async(self.driver_worker.execute_model
+                                  )(execute_model_req=execute_model_req, )
         return output
 
     async def check_health_async(self) -> None:
diff --git a/vllm/executor/executor_base.py b/vllm/executor/executor_base.py
index 96cd18250bb37..08aa58999b1ec 100644
--- a/vllm/executor/executor_base.py
+++ b/vllm/executor/executor_base.py
@@ -1,11 +1,11 @@
 from abc import ABC, abstractmethod
-from typing import Dict, List, Optional, Set, Tuple
+from typing import List, Optional, Set, Tuple
 
 from vllm.config import (CacheConfig, DeviceConfig, LoadConfig, LoRAConfig,
                          ModelConfig, ParallelConfig, SchedulerConfig,
                          SpeculativeConfig, VisionLanguageConfig)
 from vllm.lora.request import LoRARequest
-from vllm.sequence import SamplerOutput, SequenceGroupMetadata
+from vllm.sequence import ExecuteModelRequest, SamplerOutput
 
 
 class ExecutorBase(ABC):
@@ -68,12 +68,9 @@ def initialize_cache(self, num_gpu_blocks: int,
         raise NotImplementedError
 
     @abstractmethod
-    def execute_model(self,
-                      seq_group_metadata_list: List[SequenceGroupMetadata],
-                      blocks_to_swap_in: Dict[int, int],
-                      blocks_to_swap_out: Dict[int, int],
-                      blocks_to_copy: Dict[int, List[int]],
-                      num_lookahead_slots: int) -> List[SamplerOutput]:
+    def execute_model(
+            self,
+            execute_model_req: ExecuteModelRequest) -> List[SamplerOutput]:
         """Executes at least one model step on the given sequences."""
         raise NotImplementedError
 
@@ -107,13 +104,8 @@ class ExecutorAsyncBase(ExecutorBase):
 
     @abstractmethod
     async def execute_model_async(
-        self,
-        seq_group_metadata_list: List[SequenceGroupMetadata],
-        blocks_to_swap_in: Dict[int, int],
-        blocks_to_swap_out: Dict[int, int],
-        blocks_to_copy: Dict[int, List[int]],
-        num_lookahead_slots: int,
-    ) -> List[SamplerOutput]:
+            self,
+            execute_model_req: ExecuteModelRequest) -> List[SamplerOutput]:
         """Executes one model step on the given sequences."""
         raise NotImplementedError
 
diff --git a/vllm/executor/gpu_executor.py b/vllm/executor/gpu_executor.py
index a58856a12f0c8..1af3bcf380843 100644
--- a/vllm/executor/gpu_executor.py
+++ b/vllm/executor/gpu_executor.py
@@ -3,7 +3,7 @@
 from vllm.executor.executor_base import ExecutorAsyncBase, ExecutorBase
 from vllm.logger import init_logger
 from vllm.lora.request import LoRARequest
-from vllm.sequence import SamplerOutput, SequenceGroupMetadata
+from vllm.sequence import ExecuteModelRequest, SamplerOutput
 from vllm.utils import (get_distributed_init_method, get_ip, get_open_port,
                         make_async)
 from vllm.worker.worker_base import WorkerWrapperBase
@@ -117,20 +117,9 @@ def initialize_cache(self, num_gpu_blocks: int, num_cpu_blocks) -> None:
         self.driver_worker.initialize_cache(num_gpu_blocks, num_cpu_blocks)
 
     def execute_model(
-        self,
-        seq_group_metadata_list: List[SequenceGroupMetadata],
-        blocks_to_swap_in: Dict[int, int],
-        blocks_to_swap_out: Dict[int, int],
-        blocks_to_copy: Dict[int, List[int]],
-        num_lookahead_slots: int,
-    ) -> List[SamplerOutput]:
-        output = self.driver_worker.execute_model(
-            seq_group_metadata_list=seq_group_metadata_list,
-            blocks_to_swap_in=blocks_to_swap_in,
-            blocks_to_swap_out=blocks_to_swap_out,
-            blocks_to_copy=blocks_to_copy,
-            num_lookahead_slots=num_lookahead_slots,
-        )
+            self,
+            execute_model_req: ExecuteModelRequest) -> List[SamplerOutput]:
+        output = self.driver_worker.execute_model(execute_model_req)
         return output
 
     def add_lora(self, lora_request: LoRARequest) -> bool:
@@ -154,16 +143,8 @@ class GPUExecutorAsync(GPUExecutor, ExecutorAsyncBase):
 
     async def execute_model_async(
         self,
-        seq_group_metadata_list: List[SequenceGroupMetadata],
-        blocks_to_swap_in: Dict[int, int],
-        blocks_to_swap_out: Dict[int, int],
-        blocks_to_copy: Dict[int, List[int]],
-        num_lookahead_slots: int,
+        execute_model_req: ExecuteModelRequest,
     ) -> List[SamplerOutput]:
-        output = await make_async(self.driver_worker.execute_model)(
-            seq_group_metadata_list=seq_group_metadata_list,
-            blocks_to_swap_in=blocks_to_swap_in,
-            blocks_to_swap_out=blocks_to_swap_out,
-            blocks_to_copy=blocks_to_copy,
-            num_lookahead_slots=num_lookahead_slots)
+        output = await make_async(self.driver_worker.execute_model
+                                  )(execute_model_req=execute_model_req, )
         return output
diff --git a/vllm/executor/neuron_executor.py b/vllm/executor/neuron_executor.py
index 8a3b9cde84311..e7f0e887921b7 100644
--- a/vllm/executor/neuron_executor.py
+++ b/vllm/executor/neuron_executor.py
@@ -1,9 +1,9 @@
-from typing import Dict, List, Set, Tuple
+from typing import List, Set, Tuple
 
 from vllm.executor.executor_base import ExecutorAsyncBase, ExecutorBase
 from vllm.logger import init_logger
 from vllm.lora.request import LoRARequest
-from vllm.sequence import SamplerOutput, SequenceGroupMetadata
+from vllm.sequence import ExecuteModelRequest, SamplerOutput
 from vllm.utils import make_async
 
 logger = init_logger(__name__)
@@ -45,20 +45,18 @@ def initialize_cache(self, num_gpu_blocks: int,
         """
         self.driver_worker.initialize_cache(num_gpu_blocks, num_cpu_blocks)
 
-    def execute_model(self,
-                      seq_group_metadata_list: List[SequenceGroupMetadata],
-                      blocks_to_swap_in: Dict[int, int],
-                      blocks_to_swap_out: Dict[int, int],
-                      blocks_to_copy: Dict[int, List[int]],
-                      num_lookahead_slots: int) -> List[SamplerOutput]:
-        assert (blocks_to_swap_in == {} and blocks_to_swap_out == {}
-                and blocks_to_copy == {}), (
+    def execute_model(
+            self,
+            execute_model_req: ExecuteModelRequest) -> List[SamplerOutput]:
+        assert (execute_model_req.blocks_to_swap_in == {}
+                and execute_model_req.blocks_to_swap_out == {}
+                and execute_model_req.blocks_to_copy == {}), (
                     "Cache operations are not supported for Neuron backend.")
-        assert num_lookahead_slots == 0, (
+        assert execute_model_req.num_lookahead_slots == 0, (
             "lookahead not supported for Neuron backend.")
 
         output = self.driver_worker.execute_model(
-            seq_group_metadata_list=seq_group_metadata_list)
+            execute_model_req.seq_group_metadata_list)
         return output
 
     def add_lora(self, lora_request: LoRARequest) -> bool:
@@ -80,14 +78,11 @@ class NeuronExecutorAsync(NeuronExecutor, ExecutorAsyncBase):
 
     async def execute_model_async(
         self,
-        seq_group_metadata_list: List[SequenceGroupMetadata],
-        blocks_to_swap_in: Dict[int, int],
-        blocks_to_swap_out: Dict[int, int],
-        blocks_to_copy: Dict[int, List[int]],
-        num_lookahead_slots: int,
+        execute_model_req: ExecuteModelRequest,
     ) -> List[SamplerOutput]:
-        output = await make_async(self.driver_worker.execute_model)(
-            seq_group_metadata_list=seq_group_metadata_list, )
+        output = await make_async(
+            self.driver_worker.execute_model
+        )(seq_group_metadata_list=execute_model_req.seq_group_metadata_list, )
         return output
 
     async def check_health_async(self) -> None:
diff --git a/vllm/executor/ray_gpu_executor.py b/vllm/executor/ray_gpu_executor.py
index 4684b857ccd39..afc1c886722e6 100644
--- a/vllm/executor/ray_gpu_executor.py
+++ b/vllm/executor/ray_gpu_executor.py
@@ -10,7 +10,7 @@
     DistributedGPUExecutor, DistributedGPUExecutorAsync)
 from vllm.executor.ray_utils import RayWorkerWrapper, ray
 from vllm.logger import init_logger
-from vllm.sequence import SamplerOutput, SequenceGroupMetadata
+from vllm.sequence import ExecuteModelRequest, SamplerOutput
 from vllm.utils import (get_distributed_init_method, get_ip, get_open_port,
                         get_vllm_instance_id, make_async)
 
@@ -166,21 +166,12 @@ def _init_workers_ray(self, placement_group: "PlacementGroup",
                           max_concurrent_workers=self.parallel_config.
                           max_parallel_loading_workers)
 
-    def execute_model(self,
-                      seq_group_metadata_list: List[SequenceGroupMetadata],
-                      blocks_to_swap_in: Dict[int, int],
-                      blocks_to_swap_out: Dict[int, int],
-                      blocks_to_copy: Dict[int, List[int]],
-                      num_lookahead_slots: int = 0) -> List[SamplerOutput]:
+    def execute_model(
+            self,
+            execute_model_req: ExecuteModelRequest) -> List[SamplerOutput]:
         all_outputs = self._run_workers(
             "execute_model",
-            driver_kwargs={
-                "seq_group_metadata_list": seq_group_metadata_list,
-                "blocks_to_swap_in": blocks_to_swap_in,
-                "blocks_to_swap_out": blocks_to_swap_out,
-                "blocks_to_copy": blocks_to_copy,
-                "num_lookahead_slots": num_lookahead_slots,
-            },
+            driver_kwargs={"execute_model_req": execute_model_req},
             use_ray_compiled_dag=USE_RAY_COMPILED_DAG)
 
         # Only the driver worker returns the sampling results.
diff --git a/vllm/sequence.py b/vllm/sequence.py
index 35ac59d69f117..f2939eff7959b 100644
--- a/vllm/sequence.py
+++ b/vllm/sequence.py
@@ -1,7 +1,7 @@
 """Sequence and its related classes."""
 import copy
 import enum
-from dataclasses import dataclass
+from dataclasses import dataclass, field
 from typing import TYPE_CHECKING, Dict, List, Optional, Union
 
 from vllm.block import LogicalTokenBlock
@@ -734,3 +734,33 @@ def __repr__(self) -> str:
             f"sampled_token_probs={sampled_token_probs_repr}, "
             f"sampled_token_ids={sampled_token_ids_repr}, "
             f"spec_decode_worker_metrics={self.spec_decode_worker_metrics})")
+
+
+@dataclass
+class ExecuteModelRequest:
+    """The model execution request."""
+    # The sequence group metadata list.
+    seq_group_metadata_list: List[SequenceGroupMetadata]
+    # Blocks to swap in. Dict of CPU -> GPU block number.
+    blocks_to_swap_in: Dict[int, int] = field(default_factory=dict)
+    # Blocks to swap out. Dict of GPU -> CPU block number.
+    blocks_to_swap_out: Dict[int, int] = field(default_factory=dict)
+    # Blocks to copy. Source to a list of dest blocks.
+    blocks_to_copy: Dict[int, List[int]] = field(default_factory=dict)
+    # The number of slots for lookahead decoding.
+    num_lookahead_slots: int = 0
+    # The number of requests in the running queue.
+    running_queue_size: int = 0
+
+    def clone(
+        self, seq_group_metadata_list: List[SequenceGroupMetadata]
+    ) -> "ExecuteModelRequest":
+        """Clone the request with a new sequence group metadata list."""
+        return ExecuteModelRequest(
+            seq_group_metadata_list=seq_group_metadata_list,
+            blocks_to_swap_in=self.blocks_to_swap_in.copy(),
+            blocks_to_swap_out=self.blocks_to_swap_out.copy(),
+            blocks_to_copy=self.blocks_to_copy.copy(),
+            num_lookahead_slots=self.num_lookahead_slots,
+            running_queue_size=self.running_queue_size,
+        )
diff --git a/vllm/spec_decode/batch_expansion.py b/vllm/spec_decode/batch_expansion.py
index 8b302ba1aabeb..d5fd96907ddd7 100644
--- a/vllm/spec_decode/batch_expansion.py
+++ b/vllm/spec_decode/batch_expansion.py
@@ -1,9 +1,10 @@
 from itertools import chain, count
-from typing import Dict, Iterator, List, Optional, Tuple
+from typing import Iterator, List, Tuple
 
 import torch
 
-from vllm.sequence import SamplerOutput, SequenceData, SequenceGroupMetadata
+from vllm.sequence import (ExecuteModelRequest, SamplerOutput, SequenceData,
+                           SequenceGroupMetadata)
 from vllm.spec_decode.interfaces import (SpeculativeProposals,
                                          SpeculativeScorer, SpeculativeScores)
 from vllm.spec_decode.util import (get_all_seq_ids, nvtx_range,
@@ -40,11 +41,7 @@ def __init__(self, scorer_worker: WorkerBase, device: str,
     @nvtx_range("BatchExpansionTop1Scorer.score_proposals")
     def score_proposals(
         self,
-        seq_group_metadata_list: List[SequenceGroupMetadata],
-        blocks_to_swap_in: Optional[Dict[int, int]],
-        blocks_to_swap_out: Optional[Dict[int, int]],
-        blocks_to_copy: Optional[Dict[int, List[int]]],
-        k: int,
+        execute_model_req: ExecuteModelRequest,
         proposals: SpeculativeProposals,
     ) -> SpeculativeScores:
         """Score the proposed tokens via the scorer model.
@@ -57,11 +54,7 @@ def score_proposals(
         no speculation is produced for that sequence.
 
         Args:
-            seq_group_metadata_list: The input sequence group metadata.
-            blocks_to_swap_in: This is passed to the worker during scoring.
-            blocks_to_swap_out: This is passed to the worker during scoring.
-            blocks_to_copy: This is passed to the worker during scoring.
-            k: The fixed proposal length.
+            execute_model_req: The execution request.
             proposals: The speculative proposals to score.
         Returns:
             SpeculativeScores: The scores of each speculative token, along with
@@ -80,28 +73,25 @@ def score_proposals(
 
         (spec_indices, non_spec_indices, target_seq_group_metadata_list,
          num_scoring_tokens) = self._expand_batch(
-             seq_group_metadata_list=seq_group_metadata_list,
+             seq_group_metadata_list=execute_model_req.seq_group_metadata_list,
              proposal_token_ids_list=proposal_token_ids_list_without_skips,
              proposal_lens_list=proposal_lens_list,
          )
 
         target_sampler_output = self._scorer_worker.execute_model(
-            seq_group_metadata_list=target_seq_group_metadata_list,
-            blocks_to_swap_in=blocks_to_swap_in,
-            blocks_to_swap_out=blocks_to_swap_out,
-            blocks_to_copy=blocks_to_copy,
-        )
+            execute_model_req=execute_model_req.clone(
+                seq_group_metadata_list=target_seq_group_metadata_list, ))
         assert len(target_sampler_output) == 1, "expected single-step output"
         target_sampler_output = target_sampler_output[0]
 
         all_tokens, all_probs, spec_logprobs = self._contract_batch(
-            contracted_bs=len(seq_group_metadata_list),
+            contracted_bs=len(execute_model_req.seq_group_metadata_list),
             target_sampler_output=target_sampler_output,
             proposals=proposals,
             num_scoring_tokens=num_scoring_tokens,
             non_spec_indices=non_spec_indices,
             spec_indices=spec_indices,
-            k=k,
+            k=execute_model_req.num_lookahead_slots,
         )
 
         return SpeculativeScores(
diff --git a/vllm/spec_decode/interfaces.py b/vllm/spec_decode/interfaces.py
index 489d940a88856..d311bfe984cbc 100644
--- a/vllm/spec_decode/interfaces.py
+++ b/vllm/spec_decode/interfaces.py
@@ -1,10 +1,9 @@
 from abc import ABC, abstractmethod
 from dataclasses import dataclass
-from typing import Dict, List, Optional
 
 import torch
 
-from vllm.sequence import SequenceGroupMetadata
+from vllm.sequence import ExecuteModelRequest
 
 
 @dataclass
@@ -58,11 +57,7 @@ class SpeculativeProposer(ABC):
     @abstractmethod
     def get_proposals(
         self,
-        seq_group_metadata_list: List[SequenceGroupMetadata],
-        blocks_to_swap_in: Dict[int, int],
-        blocks_to_swap_out: Dict[int, int],
-        blocks_to_copy: Dict[int, List[int]],
-        max_proposal_len: int,
+        execute_model_req: ExecuteModelRequest,
     ) -> SpeculativeProposals:
         raise NotImplementedError
 
@@ -72,11 +67,7 @@ class SpeculativeScorer(ABC):
     @abstractmethod
     def score_proposals(
         self,
-        seq_group_metadata_list: List[SequenceGroupMetadata],
-        blocks_to_swap_in: Optional[Dict[int, int]],
-        blocks_to_swap_out: Optional[Dict[int, int]],
-        blocks_to_copy: Optional[Dict[int, List[int]]],
-        k: int,
+        execute_model_req: ExecuteModelRequest,
         proposals: SpeculativeProposals,
     ) -> SpeculativeScores:
         raise NotImplementedError
diff --git a/vllm/spec_decode/multi_step_worker.py b/vllm/spec_decode/multi_step_worker.py
index d031bc85af160..5044cc1ef85fd 100644
--- a/vllm/spec_decode/multi_step_worker.py
+++ b/vllm/spec_decode/multi_step_worker.py
@@ -1,9 +1,10 @@
 import copy
-from typing import Dict, List, Tuple
+from typing import List, Tuple
 
 import torch
 
-from vllm.sequence import SamplerOutput, SequenceGroupMetadata
+from vllm.sequence import (ExecuteModelRequest, SamplerOutput,
+                           SequenceGroupMetadata)
 from vllm.spec_decode.interfaces import SpeculativeProposals
 from vllm.spec_decode.top1_proposer import Top1Proposer
 from vllm.worker.worker import Worker
@@ -44,10 +45,7 @@ def set_include_gpu_probs_tensor(self):
     @torch.inference_mode()
     def sampler_output(
         self,
-        seq_group_metadata_list: List[SequenceGroupMetadata],
-        blocks_to_swap_in: Dict[int, int],
-        blocks_to_swap_out: Dict[int, int],
-        blocks_to_copy: Dict[int, List[int]],
+        execute_model_req: ExecuteModelRequest,
         sample_len: int,
     ) -> Tuple[List[SamplerOutput], bool]:
         """Run the model forward pass sample_len times. Returns the list of
@@ -57,26 +55,24 @@ def sampler_output(
 
         For multi step worker, this indicator shall be True.
         """
-        self._raise_if_unsupported(seq_group_metadata_list, blocks_to_swap_in,
-                                   blocks_to_swap_out, blocks_to_copy)
+        self._raise_if_unsupported(execute_model_req)
 
         # Shallow copy input data so modifications (such as appending tokens)
         # do not cause side-effects.
         copied_seq_group_metadata_list = self._shallow_copy_inputs(
-            seq_group_metadata_list)
+            execute_model_req.seq_group_metadata_list)
+        copied_execute_model_req = execute_model_req.clone(
+            copied_seq_group_metadata_list)
 
         # Assert enough KV space for sample_len tokens per sequence.
-        self._assert_enough_kv_space(seq_group_metadata_list, sample_len)
+        self._assert_enough_kv_space(execute_model_req.seq_group_metadata_list,
+                                     sample_len)
 
         # Run model sample_len times.
         model_outputs = []
         for _ in range(sample_len):
             model_output = super().execute_model(
-                seq_group_metadata_list=copied_seq_group_metadata_list,
-                blocks_to_swap_in=blocks_to_swap_in,
-                blocks_to_swap_out=blocks_to_swap_out,
-                blocks_to_copy=blocks_to_copy,
-            )
+                execute_model_req=copied_execute_model_req)
             assert (len(model_output) == 1
                     ), "composing multistep workers not supported"
             model_output = model_output[0]
@@ -89,23 +85,13 @@ def sampler_output(
 
     def get_spec_proposals(
         self,
-        seq_group_metadata_list: List[SequenceGroupMetadata],
-        blocks_to_swap_in: Dict[int, int],
-        blocks_to_swap_out: Dict[int, int],
-        blocks_to_copy: Dict[int, List[int]],
-        max_proposal_len: int,
+        execute_model_req: ExecuteModelRequest,
     ) -> SpeculativeProposals:
         """Produce speculations given an input batch of sequences. The number of
-        speculative tokens per sequence is determined by max_proposal_len.
+        speculative tokens per sequence is the request's num_lookahead_slots.
         """
 
-        return self._proposer.get_proposals(
-            seq_group_metadata_list,
-            blocks_to_swap_in,
-            blocks_to_swap_out,
-            blocks_to_copy,
-            max_proposal_len,
-        )
+        return self._proposer.get_proposals(execute_model_req)
 
     def _append_new_tokens(
             self, model_output: SamplerOutput,
@@ -196,20 +182,22 @@ def _assert_enough_kv_space(
 
     def _raise_if_unsupported(
         self,
-        seq_group_metadata_list: List[SequenceGroupMetadata],
-        blocks_to_swap_in: Dict[int, int],
-        blocks_to_swap_out: Dict[int, int],
-        blocks_to_copy: Dict[int, List[int]],
+        execute_model_req: ExecuteModelRequest,
     ) -> None:
         """MultiStepWorker does not yet implement support for cache swap
         operations or beam search.
         """
-        if any([blocks_to_swap_in, blocks_to_swap_out, blocks_to_copy]):
+        if any([
+                execute_model_req.blocks_to_swap_in,
+                execute_model_req.blocks_to_swap_out,
+                execute_model_req.blocks_to_copy
+        ]):
             raise NotImplementedError(
                 "MultiStepWorker does not support cache operations")
 
         if any(
                 len(seq_group_metadata.seq_data.keys()) != 1
-                for seq_group_metadata in seq_group_metadata_list):
+                for seq_group_metadata in
+                execute_model_req.seq_group_metadata_list):
             raise NotImplementedError(
                 "MultiStepWorker does not support beam search.")
diff --git a/vllm/spec_decode/ngram_worker.py b/vllm/spec_decode/ngram_worker.py
index cacaca697526c..fed8be42054a5 100644
--- a/vllm/spec_decode/ngram_worker.py
+++ b/vllm/spec_decode/ngram_worker.py
@@ -1,8 +1,8 @@
-from typing import Dict, List, Optional, Tuple
+from typing import List, Optional, Tuple
 
 import torch
 
-from vllm.sequence import SamplerOutput, SequenceGroupMetadata
+from vllm.sequence import ExecuteModelRequest, SamplerOutput
 from vllm.spec_decode.interfaces import SpeculativeProposals
 from vllm.spec_decode.top1_proposer import Top1Proposer
 from vllm.worker.worker_base import LoraNotSupportedWorkerBase
@@ -46,13 +46,7 @@ def set_include_gpu_probs_tensor(self):
         # NGram don't need gpu sampler
         pass
 
-    def execute_model(
-        self,
-        seq_group_metadata_list: List[SequenceGroupMetadata],
-        blocks_to_swap_in: Optional[Dict[int, int]],
-        blocks_to_swap_out: Optional[Dict[int, int]],
-        blocks_to_copy: Optional[Dict[int, List[int]]],
-    ) -> None:
+    def execute_model(self, execute_model_req: ExecuteModelRequest) -> None:
         """NGram doesn't depend on model execution, just pass this function"""
         pass
 
@@ -71,10 +65,7 @@ def get_cache_block_size_bytes(self):
 
     def sampler_output(
         self,
-        seq_group_metadata_list: List[SequenceGroupMetadata],
-        blocks_to_swap_in: Dict[int, int],
-        blocks_to_swap_out: Dict[int, int],
-        blocks_to_copy: Dict[int, List[int]],
+        execute_model_req: ExecuteModelRequest,
         sample_len: int,
     ) -> Tuple[Optional[List[SamplerOutput]], bool]:
         """NGram match algo to pick proposal candidate. Returns the list of
@@ -83,16 +74,11 @@ def sampler_output(
         For ngram worker, we already done needed transposed internal, so the
         indicator pass to sampler_output_to_torch shall be False.
         """
-        self._raise_if_unsupported(
-            seq_group_metadata_list,
-            blocks_to_swap_in,
-            blocks_to_swap_out,
-            blocks_to_copy,
-        )
+        self._raise_if_unsupported(execute_model_req)
 
         arr = []
         has_spec_out = False
-        for seq_group_metadata in seq_group_metadata_list:
+        for seq_group_metadata in execute_model_req.seq_group_metadata_list:
             seq_data = next(iter(seq_group_metadata.seq_data.values()))
 
             input_ids = torch.as_tensor(seq_data.get_token_ids(),
@@ -135,17 +121,19 @@ def sampler_output(
         indices = token_ids.unsqueeze(2)
 
         token_probs = torch.zeros(
-            (len(seq_group_metadata_list), sample_len, self.vocab_size),
+            (len(execute_model_req.seq_group_metadata_list), sample_len,
+             self.vocab_size),
             dtype=torch.float32,
             device=self.device,
         )
         token_probs.scatter_(2, indices, 1)
         token_logprobs = torch.zeros(
-            (len(seq_group_metadata_list), sample_len, self.vocab_size),
+            (len(execute_model_req.seq_group_metadata_list), sample_len,
+             self.vocab_size),
             dtype=torch.float32,
             device=self.device,
         )
-        for i in range(len(seq_group_metadata_list)):
+        for i in range(len(execute_model_req.seq_group_metadata_list)):
             outputs.append(
                 SamplerOutput(
                     outputs=None,
@@ -157,40 +145,32 @@ def sampler_output(
 
     def get_spec_proposals(
         self,
-        seq_group_metadata_list: List[SequenceGroupMetadata],
-        blocks_to_swap_in: Dict[int, int],
-        blocks_to_swap_out: Dict[int, int],
-        blocks_to_copy: Dict[int, List[int]],
-        max_proposal_len: int,
+        execute_model_req: ExecuteModelRequest,
     ) -> SpeculativeProposals:
         """Produce speculations given an input batch of sequences. The number of
-        speculative tokens per sequence is determined by max_proposal_len.
+        speculative tokens per sequence is the request's num_lookahead_slots.
         """
 
-        return self._proposer.get_proposals(
-            seq_group_metadata_list,
-            blocks_to_swap_in,
-            blocks_to_swap_out,
-            blocks_to_copy,
-            max_proposal_len,
-        )
+        return self._proposer.get_proposals(execute_model_req)
 
     def _raise_if_unsupported(
         self,
-        seq_group_metadata_list: List[SequenceGroupMetadata],
-        blocks_to_swap_in: Dict[int, int],
-        blocks_to_swap_out: Dict[int, int],
-        blocks_to_copy: Dict[int, List[int]],
+        execute_model_req: ExecuteModelRequest,
     ) -> None:
         """NGramWorker does not yet implement support for cache swap
         operations or beam search.
         """
-        if any([blocks_to_swap_in, blocks_to_swap_out, blocks_to_copy]):
+        if any([
+                execute_model_req.blocks_to_swap_in,
+                execute_model_req.blocks_to_swap_out,
+                execute_model_req.blocks_to_copy
+        ]):
             raise NotImplementedError(
                 "NGramWorker does not support cache operations")
 
         if any(
                 len(seq_group_metadata.seq_data.keys()) != 1
-                for seq_group_metadata in seq_group_metadata_list):
+                for seq_group_metadata in
+                execute_model_req.seq_group_metadata_list):
             raise NotImplementedError(
                 "NGramWorker does not support beam search.")
diff --git a/vllm/spec_decode/spec_decode_worker.py b/vllm/spec_decode/spec_decode_worker.py
index 503519a0dfc4b..c2b119fbd5036 100644
--- a/vllm/spec_decode/spec_decode_worker.py
+++ b/vllm/spec_decode/spec_decode_worker.py
@@ -1,11 +1,12 @@
 from functools import cached_property
-from typing import Dict, List, Optional, Tuple
+from typing import List, Optional, Tuple
 
 import torch
 
 from vllm.logger import init_logger
 from vllm.model_executor.layers.rejection_sampler import RejectionSampler
-from vllm.sequence import SamplerOutput, SequenceGroupMetadata
+from vllm.sequence import (ExecuteModelRequest, SamplerOutput,
+                           SequenceGroupMetadata)
 from vllm.spec_decode.batch_expansion import BatchExpansionTop1Scorer
 from vllm.spec_decode.interfaces import (SpeculativeProposals,
                                          SpeculativeScorer, SpeculativeScores)
@@ -189,69 +190,37 @@ def initialize_cache(self, num_gpu_blocks: int,
 
     @torch.inference_mode()
     def execute_model(
-        self,
-        seq_group_metadata_list: List[SequenceGroupMetadata],
-        blocks_to_swap_in: Optional[Dict[int, int]],
-        blocks_to_swap_out: Optional[Dict[int, int]],
-        blocks_to_copy: Optional[Dict[int, List[int]]],
-        num_lookahead_slots: int,
-    ) -> List[SamplerOutput]:
+            self,
+            execute_model_req: ExecuteModelRequest) -> List[SamplerOutput]:
         """Perform speculative decoding on the input batch.
         """
 
-        assert seq_group_metadata_list is not None, (
+        assert execute_model_req.seq_group_metadata_list is not None, (
             "speculative decoding "
             "requires non-None seq_group_metadata_list")
 
-        #logger.info("spec_decode_worker.execute_model num_lookahead_slots=%d",
-        #            num_lookahead_slots)
-
         # If no spec tokens, call the proposer and scorer workers normally.
         # Used for prefill.
-        if num_lookahead_slots == 0 or len(seq_group_metadata_list) == 0:
-            return self._run_no_spec(
-                seq_group_metadata_list=seq_group_metadata_list,
-                blocks_to_swap_in=blocks_to_swap_in,
-                blocks_to_swap_out=blocks_to_swap_out,
-                blocks_to_copy=blocks_to_copy,
-            )
-
-        return self._run_speculative_decoding_step(
-            seq_group_metadata_list=seq_group_metadata_list,
-            blocks_to_swap_in=blocks_to_swap_in,
-            blocks_to_swap_out=blocks_to_swap_out,
-            blocks_to_copy=blocks_to_copy,
-            k=num_lookahead_slots,
-        )
+        if execute_model_req.num_lookahead_slots == 0 or len(
+                execute_model_req.seq_group_metadata_list) == 0:
+            return self._run_no_spec(execute_model_req)
+
+        return self._run_speculative_decoding_step(execute_model_req)
 
     @nvtx_range("spec_decode_worker._run_no_spec")
     def _run_no_spec(
-        self,
-        seq_group_metadata_list: List[SequenceGroupMetadata],
-        blocks_to_swap_in: Optional[Dict[int, int]],
-        blocks_to_swap_out: Optional[Dict[int, int]],
-        blocks_to_copy: Optional[Dict[int, List[int]]],
-    ) -> List[SamplerOutput]:
+            self,
+            execute_model_req: ExecuteModelRequest) -> List[SamplerOutput]:
         """Run a prefill step, without any speculation. The input is sent to the
         proposer and scorer model so that the KV cache is consistent between the
         two.
         """
         #logger.info("run proposer worker no spec")
 
-        self.proposer_worker.execute_model(
-            seq_group_metadata_list=seq_group_metadata_list,
-            blocks_to_swap_in=blocks_to_swap_in,
-            blocks_to_swap_out=blocks_to_swap_out,
-            blocks_to_copy=blocks_to_copy,
-        )
+        self.proposer_worker.execute_model(execute_model_req)
 
         #logger.info("run target worker no spec")
-        sampler_output = self.scorer_worker.execute_model(
-            seq_group_metadata_list=seq_group_metadata_list,
-            blocks_to_swap_in=blocks_to_swap_in,
-            blocks_to_swap_out=blocks_to_swap_out,
-            blocks_to_copy=blocks_to_copy,
-        )
+        sampler_output = self.scorer_worker.execute_model(execute_model_req)
         assert len(sampler_output) == 1
         sampler_output = sampler_output[0]
 
@@ -264,13 +233,8 @@ def _run_no_spec(
 
     @nvtx_range("spec_decode_worker._run_speculative_decoding_step")
     def _run_speculative_decoding_step(
-        self,
-        seq_group_metadata_list: List[SequenceGroupMetadata],
-        blocks_to_swap_in: Optional[Dict[int, int]],
-        blocks_to_swap_out: Optional[Dict[int, int]],
-        blocks_to_copy: Optional[Dict[int, List[int]]],
-        k: int,
-    ) -> List[SamplerOutput]:
+            self,
+            execute_model_req: ExecuteModelRequest) -> List[SamplerOutput]:
         """Execute a single step of speculative decoding.
 
         This invokes the proposer worker to get k speculative tokens for each
@@ -282,33 +246,25 @@ def _run_speculative_decoding_step(
 
         #logger.info("get spec proposals")
         # Generate proposals using draft worker.
-        assert blocks_to_swap_in is not None
-        assert blocks_to_swap_out is not None
-        assert blocks_to_copy is not None
-        proposals = self.proposer_worker.get_spec_proposals(
-            seq_group_metadata_list, blocks_to_swap_in, blocks_to_swap_out,
-            blocks_to_copy, k)
+        proposals = self.proposer_worker.get_spec_proposals(execute_model_req)
 
         #logger.info("score proposals")
         proposal_scores = self.scorer.score_proposals(
-            seq_group_metadata_list,
-            blocks_to_swap_in,
-            blocks_to_swap_out,
-            blocks_to_copy,
-            k,
+            execute_model_req,
             proposals,
         )
 
         #logger.info("verify proposals")
         accepted_token_ids, target_logprobs = self._verify_tokens(
-            seq_group_metadata_list, proposal_scores, proposals, k)
+            execute_model_req.seq_group_metadata_list, proposal_scores,
+            proposals, execute_model_req.num_lookahead_slots)
 
         #logger.info("create output list")
         return self._create_output_sampler_list(
-            seq_group_metadata_list,
+            execute_model_req.seq_group_metadata_list,
             accepted_token_ids,
             target_logprobs=target_logprobs,
-            k=k)
+            k=execute_model_req.num_lookahead_slots)
 
     @nvtx_range("spec_decode_worker._verify_tokens")
     def _verify_tokens(
diff --git a/vllm/spec_decode/top1_proposer.py b/vllm/spec_decode/top1_proposer.py
index 56c63887b0315..eb622a0e2e7f4 100644
--- a/vllm/spec_decode/top1_proposer.py
+++ b/vllm/spec_decode/top1_proposer.py
@@ -1,8 +1,9 @@
-from typing import Dict, List, Optional, Tuple
+from typing import List, Optional, Tuple
 
 import torch
 
-from vllm.sequence import SamplerOutput, SequenceGroupMetadata
+from vllm.sequence import (ExecuteModelRequest, SamplerOutput,
+                           SequenceGroupMetadata)
 from vllm.spec_decode.interfaces import (SpeculativeProposals,
                                          SpeculativeProposer)
 from vllm.spec_decode.util import sampler_output_to_torch
@@ -40,17 +41,15 @@ def __init__(
 
     def get_proposals(
         self,
-        seq_group_metadata_list: List[SequenceGroupMetadata],
-        blocks_to_swap_in: Dict[int, int],
-        blocks_to_swap_out: Dict[int, int],
-        blocks_to_copy: Dict[int, List[int]],
-        proposal_len: int,
+        execute_model_req: ExecuteModelRequest,
     ) -> SpeculativeProposals:
         """Get speculative proposals given the input batch.
 
         Sequences which would exceed the max model length are skipped during
         speculation.
         """
+        proposal_len = execute_model_req.num_lookahead_slots
+        seq_group_metadata_list = execute_model_req.seq_group_metadata_list
 
         # Split speculative- and non-speculative- sequences.
         (
@@ -66,11 +65,12 @@ def get_proposals(
             # token_ids is like [batch] format in proposal_len size list,
             # while if it is false, the format would be [proposal_len]
             # in batch size list
-            maybe_sampler_output, transposed = self._worker.sampler_output(
+            nonzero_execute_model_req = ExecuteModelRequest(
                 seq_group_metadata_list=nonzero_proposal_len_seqs,
-                blocks_to_swap_in=blocks_to_swap_in,
-                blocks_to_swap_out=blocks_to_swap_out,
-                blocks_to_copy=blocks_to_copy,
+                num_lookahead_slots=proposal_len,
+            )
+            maybe_sampler_output, transposed = self._worker.sampler_output(
+                execute_model_req=nonzero_execute_model_req,
                 sample_len=proposal_len,
             )
         else:
diff --git a/vllm/worker/cpu_worker.py b/vllm/worker/cpu_worker.py
index 83ededd742533..4420d4cc9e12f 100644
--- a/vllm/worker/cpu_worker.py
+++ b/vllm/worker/cpu_worker.py
@@ -13,7 +13,7 @@
                               init_distributed_environment)
 from vllm.logger import init_logger
 from vllm.model_executor import set_random_seed
-from vllm.sequence import SamplerOutput, SequenceGroupMetadata
+from vllm.sequence import ExecuteModelRequest, SamplerOutput
 from vllm.utils import STR_DTYPE_TO_TORCH_DTYPE
 from vllm.worker.cpu_model_runner import CPUModelRunner
 from vllm.worker.worker_base import LoraNotSupportedWorkerBase
@@ -256,22 +256,24 @@ def cache_copy(
     @torch.inference_mode()
     def execute_model(
         self,
-        seq_group_metadata_list: Optional[List[SequenceGroupMetadata]] = None,
-        blocks_to_swap_in: Optional[Dict[int, int]] = None,
-        blocks_to_swap_out: Optional[Dict[int, int]] = None,
-        blocks_to_copy: Optional[Dict[int, List[int]]] = None,
+        execute_model_req: Optional[ExecuteModelRequest] = None,
     ) -> List[SamplerOutput]:
+
+        if execute_model_req is None:
+            seq_group_metadata_list = None
+        else:
+            seq_group_metadata_list = execute_model_req.seq_group_metadata_list
+
         if self.is_driver_worker:
             assert seq_group_metadata_list is not None
             num_seq_groups: int = len(seq_group_metadata_list)
-            assert blocks_to_swap_in is not None
-            assert blocks_to_swap_out is not None
-            assert blocks_to_copy is not None
-            assert len(blocks_to_swap_in) == 0
-            assert len(blocks_to_swap_out) == 0
+            assert execute_model_req is not None
+            blocks_to_copy = execute_model_req.blocks_to_copy
+            assert len(execute_model_req.blocks_to_swap_in) == 0
+            assert len(execute_model_req.blocks_to_swap_out) == 0
             data: Dict[str, Any] = {
                 "num_seq_groups": num_seq_groups,
-                "blocks_to_copy": blocks_to_copy,
+                "blocks_to_copy": execute_model_req.blocks_to_copy,
             }
             broadcast_tensor_dict(data, src=0)
         else:
@@ -279,7 +281,6 @@ def execute_model(
             num_seq_groups = data["num_seq_groups"]
             blocks_to_copy = data["blocks_to_copy"]
 
-        assert blocks_to_copy is not None
         self.cache_copy(blocks_to_copy)
 
         # If there is no input, we don't need to execute the model.
diff --git a/vllm/worker/worker.py b/vllm/worker/worker.py
index 808261e47318b..4add36e94f723 100644
--- a/vllm/worker/worker.py
+++ b/vllm/worker/worker.py
@@ -18,7 +18,7 @@
     init_custom_ar)
 from vllm.lora.request import LoRARequest
 from vllm.model_executor import set_random_seed
-from vllm.sequence import SamplerOutput, SequenceGroupMetadata
+from vllm.sequence import ExecuteModelRequest, SamplerOutput
 from vllm.worker.cache_engine import CacheEngine
 from vllm.worker.model_runner import ModelRunner
 from vllm.worker.worker_base import WorkerBase
@@ -211,19 +211,21 @@ def cache_swap(
     @torch.inference_mode()
     def execute_model(
         self,
-        seq_group_metadata_list: Optional[List[SequenceGroupMetadata]] = None,
-        blocks_to_swap_in: Optional[Dict[int, int]] = None,
-        blocks_to_swap_out: Optional[Dict[int, int]] = None,
-        blocks_to_copy: Optional[Dict[int, List[int]]] = None,
-        num_lookahead_slots: int = 0,
+        execute_model_req: Optional[ExecuteModelRequest] = None
     ) -> List[SamplerOutput]:
 
+        if execute_model_req is None:
+            seq_group_metadata_list = None
+        else:
+            seq_group_metadata_list = execute_model_req.seq_group_metadata_list
+
         if self.is_driver_worker:
             assert seq_group_metadata_list is not None
+            assert execute_model_req is not None
             num_seq_groups = len(seq_group_metadata_list)
-            assert blocks_to_swap_in is not None
-            assert blocks_to_swap_out is not None
-            assert blocks_to_copy is not None
+            blocks_to_swap_in = execute_model_req.blocks_to_swap_in
+            blocks_to_swap_out = execute_model_req.blocks_to_swap_out
+            blocks_to_copy = execute_model_req.blocks_to_copy
             data: Dict[str, Any] = {
                 "num_seq_groups": num_seq_groups,
                 "blocks_to_swap_in": blocks_to_swap_in,
@@ -238,9 +240,6 @@ def execute_model(
             blocks_to_swap_out = data["blocks_to_swap_out"]
             blocks_to_copy = data["blocks_to_copy"]
 
-        assert blocks_to_swap_in is not None
-        assert blocks_to_swap_out is not None
-        assert blocks_to_copy is not None
         self.cache_swap(blocks_to_swap_in, blocks_to_swap_out, blocks_to_copy)
 
         # If there is no input, we don't need to execute the model.
diff --git a/vllm/worker/worker_base.py b/vllm/worker/worker_base.py
index 0a89e3a79769f..fb32feaca0c94 100644
--- a/vllm/worker/worker_base.py
+++ b/vllm/worker/worker_base.py
@@ -5,7 +5,7 @@
 
 from vllm.logger import init_logger
 from vllm.lora.request import LoRARequest
-from vllm.sequence import SamplerOutput, SequenceGroupMetadata
+from vllm.sequence import ExecuteModelRequest, SamplerOutput
 from vllm.utils import (enable_trace_function_call_for_thread,
                         update_environment_variables)
 
@@ -48,10 +48,8 @@ def initialize_cache(self, num_gpu_blocks: int,
 
     @abstractmethod
     def execute_model(
-            self, seq_group_metadata_list: List[SequenceGroupMetadata],
-            blocks_to_swap_in: Dict[int, int], blocks_to_swap_out: Dict[int,
-                                                                        int],
-            blocks_to_copy: Dict[int, List[int]]) -> List[SamplerOutput]:
+            self,
+            execute_model_req: ExecuteModelRequest) -> List[SamplerOutput]:
         """Executes at least one model step on the given sequences, unless no
         sequences are provided."""
         raise NotImplementedError

From 36fb68f94792a8cec8df5b58bab7ab4d4d6158b4 Mon Sep 17 00:00:00 2001
From: SangBin Cho <rkooo567@gmail.com>
Date: Sat, 4 May 2024 16:18:00 +0900
Subject: [PATCH 53/65] [Doc] Chunked Prefill Documentation (#4580)

---
 docs/source/index.rst              |  1 +
 docs/source/models/performance.rst | 38 ++++++++++++++++++++++++++++++
 vllm/config.py                     |  5 ++--
 3 files changed, 42 insertions(+), 2 deletions(-)
 create mode 100644 docs/source/models/performance.rst

diff --git a/docs/source/index.rst b/docs/source/index.rst
index 5cc28a2d70139..4022c590843e6 100644
--- a/docs/source/index.rst
+++ b/docs/source/index.rst
@@ -87,6 +87,7 @@ Documentation
    models/adding_model
    models/engine_args
    models/lora
+   models/performance
 
 .. toctree::
    :maxdepth: 1
diff --git a/docs/source/models/performance.rst b/docs/source/models/performance.rst
new file mode 100644
index 0000000000000..067757699f32a
--- /dev/null
+++ b/docs/source/models/performance.rst
@@ -0,0 +1,38 @@
+.. _performance:
+
+Performance and Tuning
+======================
+
+Chunked Prefill
+---------------
+vLLM supports an experimental feature called chunked prefill. Chunked prefill splits large prefills into smaller chunks and batches them together with decode requests.
+
+You can enable the feature by specifying
+
+.. code-block:: python
+
+    llm = LLM(model="meta-llama/Llama-2-7b-hf", enable_chunked_prefill=True)
+    # Set max_num_batched_tokens to tune performance.
+    # NOTE: 512 is the default max_num_batched_tokens for chunked prefill.
+    # llm = LLM(model="meta-llama/Llama-2-7b-hf", enable_chunked_prefill=True, max_num_batched_tokens=512)
+
+By default, the vLLM scheduler prioritizes prefills and doesn't batch prefill and decode requests into the same batch. This policy optimizes the TTFT (time to the first token), but incurs slower ITL (inter-token latency) and inefficient GPU utilization.
+
+Once chunked prefill is enabled, the policy is changed to
+
+- Prioritize decode requests. It batches all pending decode requests into the batch before scheduling any prefill.
+- When there is remaining token budget (`max_num_batched_tokens`), it schedules pending prefills. If the last pending prefill request cannot fit into `max_num_batched_tokens`, it chunks it.
+
+This policy has two benefits.
+
+- It improves ITL (inter-token latency) and decode performance because decode requests are prioritized.
+- It helps achieve better GPU utilization by co-locating compute-bound (prefill) and memory-bound (decode) requests in the same batch.
+
+You can tune the performance by changing `max_num_batched_tokens`.
+By default, it is set to 512, which has the best ITL on A100 in the initial benchmark.
+A smaller batch size achieves better ITL because there are fewer prefills interrupting decodes.
+A larger batch size achieves better TTFT because you can put more prefills into the batch.
+If `max_num_batched_tokens` is the same as `max_model_len`, that's almost equivalent to the default scheduling policy (except that it still prioritizes decodes).
+Note that the default batch size (512) is optimized for ITL, and it may have lower throughput than the default scheduler. We recommend you set `max_num_batched_tokens > 2048` for throughput.
+
+See related papers for more details (https://arxiv.org/pdf/2401.08671 or https://arxiv.org/pdf/2308.16369).
diff --git a/vllm/config.py b/vllm/config.py
index fe54c54bed48e..6c65bbe247f84 100644
--- a/vllm/config.py
+++ b/vllm/config.py
@@ -607,8 +607,9 @@ def __init__(
             self.max_num_batched_tokens = max_num_batched_tokens
         else:
             if enable_chunked_prefill:
-                # For chunked prefill, choose the well-tuned batch size.
-                self.max_num_batched_tokens = 768
+                # It is the values that have the best balance between ITL
+                # and TTFT on A100. Note it is not optimized for throughput.
+                self.max_num_batched_tokens = 512
             else:
                 # If max_model_len is too short, use 2048 as the default value
                 # for higher throughput.

From 2a052011ca473a9dc8160f3daa1f5f63a2ad1fe3 Mon Sep 17 00:00:00 2001
From: Michael Goin <michael@neuralmagic.com>
Date: Sat, 4 May 2024 14:45:16 -0400
Subject: [PATCH 54/65] [Kernel] Support MoE Fp8 Checkpoints for Mixtral
 (Static Weights with Dynamic/Static Activations) (#4527)

Follow on to #4332 to enable FP8 checkpoint loading for Mixtral and supersedes #4436.

This PR enables the following checkpoint loading features for Mixtral:

Supports loading fp8 checkpoints for Mixtral, such as this "nm-testing/Mixtral-8x7B-Instruct-v0.1-FP8" test model
Supports static or dynamic activation quantization with static weight quantization (all per tensor)
Supports different scales for each expert weight
Supports Fp8 in QKV layer
Notes:

The Expert Gate/Router always runs at half / full precision for now.
If there are different weight scales between QKV layer (for separate QKV weights), they are re-quantized using layer.weight_scale.max() so we can have a single gemm for performance.
---
 tests/kernels/test_moe.py             |   4 +-
 vllm/model_executor/models/mixtral.py | 171 ++++++++++++++++++--------
 2 files changed, 122 insertions(+), 53 deletions(-)

diff --git a/tests/kernels/test_moe.py b/tests/kernels/test_moe.py
index 046f11d957bdd..2356b9ec18b0d 100644
--- a/tests/kernels/test_moe.py
+++ b/tests/kernels/test_moe.py
@@ -77,8 +77,8 @@ def test_mixtral_moe(dtype: torch.dtype):
     for i in range(config.num_local_experts):
         weights = (hf_moe.experts[i].w1.weight.data,
                    hf_moe.experts[i].w3.weight.data)
-        vllm_moe.ws[i][:] = torch.cat(weights, dim=0)
-        vllm_moe.w2s[i][:] = hf_moe.experts[i].w2.weight.data
+        vllm_moe.w13_weight[i][:] = torch.cat(weights, dim=0)
+        vllm_moe.w2_weight[i][:] = hf_moe.experts[i].w2.weight.data
 
     # Generate input batch of dimensions [batch_size, seq_len, hidden_dim]
     hf_inputs = torch.randn((1, 64, config.hidden_size)).to(dtype).to("cuda")
diff --git a/vllm/model_executor/models/mixtral.py b/vllm/model_executor/models/mixtral.py
index 9ff9ba298588a..efa4de7516212 100644
--- a/vllm/model_executor/models/mixtral.py
+++ b/vllm/model_executor/models/mixtral.py
@@ -78,6 +78,8 @@ def __init__(
         self.top_k = top_k
         self.hidden_size = hidden_size
         self.intermediate_size = intermediate_size // self.tp_size
+        self.quant_config = quant_config
+
         # FIXME(pcmoritz): Make this more general to support different
         # quantization schemes
         self.use_fp8 = isinstance(quant_config, Fp8Config)
@@ -86,55 +88,79 @@ def __init__(
             params_dtype = torch.get_default_dtype()
         self.params_dtype = params_dtype
 
+        # Gate always runs at half / full precision for now.
         self.gate = ReplicatedLinear(self.hidden_size,
                                      self.num_total_experts,
                                      bias=False,
                                      params_dtype=self.params_dtype,
                                      quant_config=None)
 
-        self.ws = nn.Parameter(
+        if self.use_fp8:
+            params_dtype = torch.float8_e4m3fn
+
+        self.w13_weight = nn.Parameter(
             torch.empty(self.num_total_experts,
                         2 * self.intermediate_size,
                         self.hidden_size,
-                        dtype=self.params_dtype))
-        self.w2s = nn.Parameter(
+                        dtype=params_dtype))
+        self.w2_weight = nn.Parameter(
             torch.empty(self.num_total_experts,
                         self.hidden_size,
                         self.intermediate_size,
-                        dtype=self.params_dtype))
+                        dtype=params_dtype))
 
-        set_weight_attrs(self.ws, {
+        set_weight_attrs(self.w13_weight, {
             "weight_loader": self.weight_loader,
         })
-        set_weight_attrs(self.w2s, {
+        set_weight_attrs(self.w2_weight, {
             "weight_loader": self.weight_loader,
         })
 
-        # Scaling factors for FP8 weights
-        self.ws_scale = nn.Parameter(
-            torch.ones(self.num_total_experts, dtype=torch.float32),
-            requires_grad=False) if self.use_fp8 else None
-        self.w2s_scale = nn.Parameter(
-            torch.ones(self.num_total_experts, dtype=torch.float32),
-            requires_grad=False) if self.use_fp8 else None
-
-        # Scaling factors for FP8 activations
-        need_act_scales = (self.use_fp8
-                           and quant_config.activation_scheme == "static")
-        self.as_scale = nn.Parameter(
-            torch.zeros(1, dtype=torch.float32),
-            requires_grad=False) if need_act_scales else None
-        self.a2s_scale = nn.Parameter(
-            torch.zeros(1, dtype=torch.float32),
-            requires_grad=False) if need_act_scales else None
-
-        if need_act_scales:
-            set_weight_attrs(self.as_scale, {
-                "weight_loader": self.weight_loader,
-            })
-            set_weight_attrs(self.a2s_scale, {
-                "weight_loader": self.weight_loader,
-            })
+        # Used for fp8.
+        self.w13_scale = None
+        self.w2_scale = None
+        self.a13_scale = None
+        self.a2_scale = None
+
+        if self.use_fp8:
+            # WEIGHT_SCALE (for fp8)
+            self.w13_scale = nn.Parameter(torch.ones(self.num_total_experts,
+                                                     dtype=torch.float32),
+                                          requires_grad=False)
+            self.w2_scale = nn.Parameter(torch.ones(self.num_total_experts,
+                                                    dtype=torch.float32),
+                                         requires_grad=False)
+
+            # If loading fp8 checkpoint, pass the weight loaders.
+            # If loading an fp16 checkpoint, do not (we will quantize in
+            #   process_weights_after_loading()
+            if quant_config.is_checkpoint_fp8_serialized:
+                set_weight_attrs(self.w13_scale, {
+                    "weight_loader": self.weight_loader,
+                })
+                set_weight_attrs(self.w2_scale, {
+                    "weight_loader": self.weight_loader,
+                })
+
+            # ACT_SCALE (for fp8)
+            if quant_config.activation_scheme == "static":
+                if not quant_config.is_checkpoint_fp8_serialized:
+                    raise ValueError(
+                        "Found static activation scheme for checkpoint that "
+                        "was not serialized fp8.")
+                self.a13_scale = nn.Parameter(torch.zeros(
+                    self.num_total_experts, dtype=torch.float32),
+                                              requires_grad=False)
+                self.a2_scale = nn.Parameter(torch.zeros(
+                    self.num_total_experts, dtype=torch.float32),
+                                             requires_grad=False)
+
+                set_weight_attrs(self.a13_scale, {
+                    "weight_loader": self.weight_loader,
+                })
+                set_weight_attrs(self.a2_scale, {
+                    "weight_loader": self.weight_loader,
+                })
 
     def weight_loader(self, param: nn.Parameter, loaded_weight: torch.Tensor,
                       weight_name: str, expert_id: int):
@@ -149,20 +175,49 @@ def weight_loader(self, param: nn.Parameter, loaded_weight: torch.Tensor,
                        shard_size:2 * shard_size, :] = loaded_weight[shard, :]
         if weight_name.endswith("w2.weight"):
             param_data[expert_id, :, :] = loaded_weight[:, shard]
-        if "act_scale" in weight_name:
-            param_data[:] = param_data[:].max(loaded_weight)
+        if "act_scale" in weight_name or "weight_scale" in weight_name:
+            param_data[expert_id] = loaded_weight
 
     def process_weights_after_loading(self):
-        if self.use_fp8:
-            ws = torch.empty_like(self.ws.data, dtype=torch.float8_e4m3fn)
-            w2s = torch.empty_like(self.w2s.data, dtype=torch.float8_e4m3fn)
+        # Fp8 is the only case where we need to process after loading.
+        if not self.use_fp8:
+            return
+
+        # If checkpoint is fp16, quantize here.
+        if not self.quant_config.is_checkpoint_fp8_serialized:
+            w13_weight = torch.empty_like(self.w13_weight.data,
+                                          dtype=torch.float8_e4m3fn)
+            w2_weight = torch.empty_like(self.w2_weight.data,
+                                         dtype=torch.float8_e4m3fn)
             for expert in range(self.num_total_experts):
-                ws[expert, :, :], self.ws_scale[expert] = ops.scaled_fp8_quant(
-                    self.ws.data[expert, :, :])
-                w2s[expert, :, :], self.w2s_scale[
-                    expert] = ops.scaled_fp8_quant(self.w2s.data[expert, :, :])
-            self.ws = nn.Parameter(ws, requires_grad=False)
-            self.w2s = nn.Parameter(w2s, requires_grad=False)
+                w13_weight[expert, :, :], self.w13_scale[
+                    expert] = ops.scaled_fp8_quant(
+                        self.w13_weight.data[expert, :, :])
+                w2_weight[expert, :, :], self.w2_scale[
+                    expert] = ops.scaled_fp8_quant(
+                        self.w2_weight.data[expert, :, :])
+            self.w13_weight = nn.Parameter(w13_weight, requires_grad=False)
+            self.w2_weight = nn.Parameter(w2_weight, requires_grad=False)
+
+        # If checkpoint is fp8 + static, cleanup act_scales.
+        #   Since state_dict has an act_scale per expert but our kernels
+        #   are passed one act_scale shared across all experts.
+        elif self.quant_config.activation_scheme == "static":
+            if self.a13_scale is None or self.a2_scale is None:
+                raise ValueError(
+                    "QuantConfig has static quantization, but found "
+                    "activation scales are None.")
+
+            if (not all_close_1d(self.a13_scale)
+                    or not all_close_1d(self.a2_scale)):
+                print_warning_once(
+                    "Found act_scales that are not equal for fp8 MoE layer. "
+                    "Using the maximum across experts for each layer. ")
+
+            self.a13_scale = nn.Parameter(self.a13_scale.max(),
+                                          requires_grad=False)
+            self.a2_scale = nn.Parameter(self.a2_scale.max(),
+                                         requires_grad=False)
 
     def forward(self, hidden_states: torch.Tensor) -> torch.Tensor:
         num_tokens, hidden_size = hidden_states.shape
@@ -170,17 +225,17 @@ def forward(self, hidden_states: torch.Tensor) -> torch.Tensor:
         # router_logits: (num_tokens, n_experts)
         router_logits, _ = self.gate(hidden_states)
         final_hidden_states = fused_moe(hidden_states,
-                                        self.ws,
-                                        self.w2s,
+                                        self.w13_weight,
+                                        self.w2_weight,
                                         router_logits,
                                         self.top_k,
                                         renormalize=True,
                                         inplace=True,
                                         use_fp8=self.use_fp8,
-                                        w1_scale=self.ws_scale,
-                                        w2_scale=self.w2s_scale,
-                                        a1_scale=self.as_scale,
-                                        a2_scale=self.a2s_scale)
+                                        w1_scale=self.w13_scale,
+                                        w2_scale=self.w2_scale,
+                                        a1_scale=self.a13_scale,
+                                        a2_scale=self.a2_scale)
 
         if self.tp_size > 1:
             final_hidden_states = tensor_model_parallel_all_reduce(
@@ -222,7 +277,9 @@ def __init__(self,
         self.rope_theta = rope_theta
         self.sliding_window = sliding_window
 
-        if isinstance(quant_config, Fp8Config):
+        if isinstance(
+                quant_config,
+                Fp8Config) and not quant_config.is_checkpoint_fp8_serialized:
             print_warning_once(
                 "For Mixtral FP8 quantization, we currently do not quantize "
                 "the attention layers until their FP8 performance is improved."
@@ -461,16 +518,23 @@ def load_weights(self, weights: Iterable[Tuple[str, torch.Tensor]]):
         ]
 
         expert_params_mapping = [
+            # These are the weight scales for the experts
+            # (param_name, weight_name, expert_id)
+            ("w13_scale" if weight_name in ["w1", "w3"] else "w2_scale",
+             f"experts.{expert_id}.{weight_name}.weight_scale", expert_id)
+            for expert_id in range(self.config.num_local_experts)
+            for weight_name in ["w1", "w2", "w3"]
+        ] + [
             # These are the weights for the experts
             # (param_name, weight_name, expert_id)
-            ("ws" if weight_name in ["w1", "w3"] else "w2s",
+            ("w13_weight" if weight_name in ["w1", "w3"] else "w2_weight",
              f"experts.{expert_id}.{weight_name}.weight", expert_id)
             for expert_id in range(self.config.num_local_experts)
             for weight_name in ["w1", "w2", "w3"]
         ] + [
             # These are the activation scales for the experts
             # (param_name, weight_name, expert_id)
-            ("as_scale" if weight_name in ["w1", "w3"] else "a2s_scale",
+            ("a13_scale" if weight_name in ["w1", "w3"] else "a2_scale",
              f"experts.{expert_id}.{weight_name}.act_scale", expert_id)
             for expert_id in range(self.config.num_local_experts)
             for weight_name in ["w1", "w2", "w3"]
@@ -512,3 +576,8 @@ def load_weights(self, weights: Iterable[Tuple[str, torch.Tensor]]):
                     weight_loader = getattr(param, "weight_loader",
                                             default_weight_loader)
                     weight_loader(param, loaded_weight)
+
+
+def all_close_1d(x: torch.Tensor) -> bool:
+    assert len(x.shape) == 1
+    return all(torch.allclose(x[0], x[i]) for i in range(x.shape[0]))

From 021b1a2ab7497769dae8a67ea3467e4bafb474c5 Mon Sep 17 00:00:00 2001
From: Simon Mo <simon.mo@hey.com>
Date: Sat, 4 May 2024 13:44:36 -0700
Subject: [PATCH 55/65] [CI] check size of the wheels (#4319)

---
 .buildkite/check-wheel-size.py | 33 +++++++++++++++++++++++++++++++++
 Dockerfile                     | 12 ++++++++----
 2 files changed, 41 insertions(+), 4 deletions(-)
 create mode 100644 .buildkite/check-wheel-size.py

diff --git a/.buildkite/check-wheel-size.py b/.buildkite/check-wheel-size.py
new file mode 100644
index 0000000000000..8178fba552c46
--- /dev/null
+++ b/.buildkite/check-wheel-size.py
@@ -0,0 +1,33 @@
+import os
+import zipfile
+
+MAX_SIZE_MB = 100
+
+
+def print_top_10_largest_files(zip_file):
+    with zipfile.ZipFile(zip_file, 'r') as z:
+        file_sizes = [(f, z.getinfo(f).file_size) for f in z.namelist()]
+        file_sizes.sort(key=lambda x: x[1], reverse=True)
+        for f, size in file_sizes[:10]:
+            print(f"{f}: {size/(1024*1024)} MBs uncompressed.")
+
+
+def check_wheel_size(directory):
+    for root, _, files in os.walk(directory):
+        for f in files:
+            if f.endswith(".whl"):
+                wheel_path = os.path.join(root, f)
+                wheel_size = os.path.getsize(wheel_path)
+                wheel_size_mb = wheel_size / (1024 * 1024)
+                if wheel_size_mb > MAX_SIZE_MB:
+                    print(
+                        f"Wheel {wheel_path} is too large ({wheel_size_mb} MB) "
+                        f"compare to the allowed size ({MAX_SIZE_MB} MB).")
+                    print_top_10_largest_files(wheel_path)
+                    return 1
+    return 0
+
+
+if __name__ == "__main__":
+    import sys
+    sys.exit(check_wheel_size(sys.argv[1]))
diff --git a/Dockerfile b/Dockerfile
index e8a9842c089dd..90be3a30f89b1 100644
--- a/Dockerfile
+++ b/Dockerfile
@@ -7,7 +7,7 @@
 
 #################### BASE BUILD IMAGE ####################
 # prepare basic build environment
-FROM nvidia/cuda:12.1.0-devel-ubuntu22.04 AS dev
+FROM nvidia/cuda:12.4.1-devel-ubuntu22.04 AS dev
 
 RUN apt-get update -y \
     && apt-get install -y python3-pip git
@@ -16,7 +16,7 @@ RUN apt-get update -y \
 # https://github.com/pytorch/pytorch/issues/107960 -- hopefully
 # this won't be needed for future versions of this docker image
 # or future versions of triton.
-RUN ldconfig /usr/local/cuda-12.1/compat/
+RUN ldconfig /usr/local/cuda-12.4/compat/
 
 WORKDIR /workspace
 
@@ -75,6 +75,10 @@ RUN --mount=type=cache,target=/root/.cache/ccache \
     --mount=type=cache,target=/root/.cache/pip \
     python3 setup.py bdist_wheel --dist-dir=dist
 
+# check the size of the wheel, we cannot upload wheels larger than 100MB
+COPY .buildkite/check-wheel-size.py check-wheel-size.py
+RUN python3 check-wheel-size.py dist
+
 # the `vllm_nccl` package must be installed from source distribution
 # pip is too smart to store a wheel in the cache, and other CI jobs
 # will directly use the wheel from the cache, which is not what we want.
@@ -102,7 +106,7 @@ RUN pip --verbose wheel flash-attn==${FLASH_ATTN_VERSION} \
 
 #################### vLLM installation IMAGE ####################
 # image with vLLM installed
-FROM nvidia/cuda:12.1.0-base-ubuntu22.04 AS vllm-base
+FROM nvidia/cuda:12.4.1-base-ubuntu22.04 AS vllm-base
 WORKDIR /vllm-workspace
 
 RUN apt-get update -y \
@@ -112,7 +116,7 @@ RUN apt-get update -y \
 # https://github.com/pytorch/pytorch/issues/107960 -- hopefully
 # this won't be needed for future versions of this docker image
 # or future versions of triton.
-RUN ldconfig /usr/local/cuda-12.1/compat/
+RUN ldconfig /usr/local/cuda-12.4/compat/
 
 # install vllm wheel first, so that torch etc will be installed
 RUN --mount=type=bind,from=build,src=/workspace/dist,target=/vllm-workspace/dist \

From 43029870694de0789a10ab49f181f1cba6ec741a Mon Sep 17 00:00:00 2001
From: DearPlanet <149305930+DearPlanet@users.noreply.github.com>
Date: Sun, 5 May 2024 06:39:34 +0800
Subject: [PATCH 56/65] [Bugfix] Fix inappropriate content of model_name tag in
 Prometheus metrics (#3937)

---
 tests/metrics/test_metrics.py       | 30 +++++++++++++++++++++++++++++
 vllm/config.py                      | 25 ++++++++++++++++++++++++
 vllm/engine/arg_utils.py            | 20 +++++++++++++++++--
 vllm/engine/llm_engine.py           |  5 +++--
 vllm/entrypoints/openai/cli_args.py | 10 ----------
 5 files changed, 76 insertions(+), 14 deletions(-)

diff --git a/tests/metrics/test_metrics.py b/tests/metrics/test_metrics.py
index 311e60ba60f61..e0aa14f165c2d 100644
--- a/tests/metrics/test_metrics.py
+++ b/tests/metrics/test_metrics.py
@@ -1,3 +1,5 @@
+from typing import List
+
 import pytest
 from prometheus_client import REGISTRY
 
@@ -76,6 +78,34 @@ def test_metric_counter_generation_tokens(
         f"metric: {metric_count!r}")
 
 
+@pytest.mark.parametrize("model", MODELS)
+@pytest.mark.parametrize("dtype", ["float"])
+@pytest.mark.parametrize(
+    "served_model_name",
+    [None, [], ["ModelName0"], ["ModelName0", "ModelName1", "ModelName2"]])
+def test_metric_set_tag_model_name(vllm_runner, model: str, dtype: str,
+                                   served_model_name: List[str]) -> None:
+    vllm_model = vllm_runner(model,
+                             dtype=dtype,
+                             disable_log_stats=False,
+                             gpu_memory_utilization=0.3,
+                             served_model_name=served_model_name)
+    stat_logger = vllm_model.model.llm_engine.stat_logger
+    metrics_tag_content = stat_logger.labels["model_name"]
+
+    del vllm_model
+
+    if served_model_name is None or served_model_name == []:
+        assert metrics_tag_content == model, (
+            f"Metrics tag model_name is wrong! expect: {model!r}\n"
+            f"actual: {metrics_tag_content!r}")
+    else:
+        assert metrics_tag_content == served_model_name[0], (
+            f"Metrics tag model_name is wrong! expect: "
+            f"{served_model_name[0]!r}\n"
+            f"actual: {metrics_tag_content!r}")
+
+
 @pytest.mark.parametrize("model", MODELS)
 @pytest.mark.parametrize("dtype", ["half"])
 @pytest.mark.parametrize("max_tokens", [4])
diff --git a/vllm/config.py b/vllm/config.py
index 6c65bbe247f84..13bb294591725 100644
--- a/vllm/config.py
+++ b/vllm/config.py
@@ -31,6 +31,8 @@ class ModelConfig:
 
     Args:
         model: Name or path of the huggingface model to use.
+            It is also used as the content for `model_name` tag in metrics 
+            output when `served_model_name` is not specified. 
         tokenizer: Name or path of the huggingface tokenizer to use.
         tokenizer_mode: Tokenizer mode. "auto" will use the fast tokenizer if
             available, and "slow" will always use the slow tokenizer.
@@ -69,6 +71,10 @@ class ModelConfig:
             to eager mode
         skip_tokenizer_init: If true, skip initialization of tokenizer and
             detokenizer.
+        served_model_name: The model name used in metrics tag `model_name`,
+            matches the model name exposed via the APIs. If multiple model 
+            names provided, the first name will be used. If not specified, 
+            the model name will be the same as `model`.
     """
 
     def __init__(
@@ -90,6 +96,7 @@ def __init__(
         max_seq_len_to_capture: Optional[int] = None,
         max_logprobs: int = 5,
         skip_tokenizer_init: bool = False,
+        served_model_name: Optional[Union[str, List[str]]] = None,
     ) -> None:
         self.model = model
         self.tokenizer = tokenizer
@@ -117,6 +124,8 @@ def __init__(
         self.dtype = _get_and_verify_dtype(self.hf_text_config, dtype)
         self.max_model_len = _get_and_verify_max_len(self.hf_text_config,
                                                      max_model_len)
+        self.served_model_name = get_served_model_name(model,
+                                                       served_model_name)
         if not self.skip_tokenizer_init:
             self._verify_tokenizer_mode()
         self._verify_quantization()
@@ -1150,6 +1159,22 @@ def _get_and_verify_max_len(
     return int(max_model_len)
 
 
+def get_served_model_name(model: str,
+                          served_model_name: Optional[Union[str, List[str]]]):
+    """
+    If the input is a non-empty list, the first model_name in 
+    `served_model_name` is taken. 
+    If the input is a non-empty string, it is used directly. 
+    If the input is `None`, an empty string, or an empty list, the
+    fallback is to use `model`.
+    """
+    if not served_model_name:
+        return model
+    if isinstance(served_model_name, list):
+        return served_model_name[0]
+    return served_model_name
+
+
 @dataclass
 class DecodingConfig:
     """Dataclass which contains the decoding strategy of the engine"""
diff --git a/vllm/engine/arg_utils.py b/vllm/engine/arg_utils.py
index 78cd07575f17d..bb8245eb307f7 100644
--- a/vllm/engine/arg_utils.py
+++ b/vllm/engine/arg_utils.py
@@ -1,7 +1,7 @@
 import argparse
 import dataclasses
 from dataclasses import dataclass
-from typing import Optional
+from typing import List, Optional, Union
 
 from vllm.config import (CacheConfig, DecodingConfig, DeviceConfig,
                          EngineConfig, LoadConfig, LoRAConfig, ModelConfig,
@@ -21,6 +21,7 @@ def nullable_str(val: str):
 class EngineArgs:
     """Arguments for vLLM engine."""
     model: str
+    served_model_name: Optional[Union[List[str]]] = None
     tokenizer: Optional[str] = None
     skip_tokenizer_init: bool = False
     tokenizer_mode: str = 'auto'
@@ -489,6 +490,21 @@ def add_cli_args(
                             'This should be a JSON string that will be '
                             'parsed into a dictionary.')
 
+        parser.add_argument(
+            "--served-model-name",
+            nargs="+",
+            type=str,
+            default=None,
+            help="The model name(s) used in the API. If multiple "
+            "names are provided, the server will respond to any "
+            "of the provided names. The model name in the model "
+            "field of a response will be the first name in this "
+            "list. If not specified, the model name will be the "
+            "same as the `--model` argument. Noted that this name(s)"
+            "will also be used in `model_name` tag content of "
+            "prometheus metrics, if multiple names provided, metrics"
+            "tag will take the first one.")
+
         return parser
 
     @classmethod
@@ -508,7 +524,7 @@ def create_engine_config(self, ) -> EngineConfig:
             self.quantization, self.quantization_param_path,
             self.enforce_eager, self.max_context_len_to_capture,
             self.max_seq_len_to_capture, self.max_logprobs,
-            self.skip_tokenizer_init)
+            self.skip_tokenizer_init, self.served_model_name)
         cache_config = CacheConfig(self.block_size,
                                    self.gpu_memory_utilization,
                                    self.swap_space, self.kv_cache_dtype,
diff --git a/vllm/engine/llm_engine.py b/vllm/engine/llm_engine.py
index 342f2c796d6fb..b9938b045ba2b 100644
--- a/vllm/engine/llm_engine.py
+++ b/vllm/engine/llm_engine.py
@@ -106,7 +106,7 @@ def __init__(
             "tensor_parallel_size=%d, disable_custom_all_reduce=%s, "
             "quantization=%s, enforce_eager=%s, kv_cache_dtype=%s, "
             "quantization_param_path=%s, device_config=%s, "
-            "decoding_config=%r, seed=%d)",
+            "decoding_config=%r, seed=%d, served_model_name=%s)",
             vllm.__version__,
             model_config.model,
             speculative_config,
@@ -129,6 +129,7 @@ def __init__(
             device_config.device,
             decoding_config,
             model_config.seed,
+            model_config.served_model_name,
         )
         # TODO(woosuk): Print more configs in debug mode.
 
@@ -219,7 +220,7 @@ def __init__(
         if self.log_stats:
             self.stat_logger = StatLogger(
                 local_interval=_LOCAL_LOGGING_INTERVAL_SEC,
-                labels=dict(model_name=model_config.model),
+                labels=dict(model_name=model_config.served_model_name),
                 max_model_len=self.model_config.max_model_len)
             self.stat_logger.info("cache_config", self.cache_config)
 
diff --git a/vllm/entrypoints/openai/cli_args.py b/vllm/entrypoints/openai/cli_args.py
index 2b57ab26bfd31..4c0cb1e4f3e49 100644
--- a/vllm/entrypoints/openai/cli_args.py
+++ b/vllm/entrypoints/openai/cli_args.py
@@ -56,16 +56,6 @@ def make_arg_parser():
                         default=None,
                         help="If provided, the server will require this key "
                         "to be presented in the header.")
-    parser.add_argument("--served-model-name",
-                        nargs="+",
-                        type=nullable_str,
-                        default=None,
-                        help="The model name(s) used in the API. If multiple "
-                        "names are provided, the server will respond to any "
-                        "of the provided names. The model name in the model "
-                        "field of a response will be the first name in this "
-                        "list. If not specified, the model name will be the "
-                        "same as the `--model` argument.")
     parser.add_argument(
         "--lora-modules",
         type=nullable_str,

From 8d8357c8ed1f3ddb6a0e8f3287ec669a13d77df1 Mon Sep 17 00:00:00 2001
From: Simon Mo <simon.mo@hey.com>
Date: Sat, 4 May 2024 17:09:49 -0700
Subject: [PATCH 57/65] bump version to v0.4.2 (#4600)

---
 .github/workflows/scripts/create_release.js | 2 +-
 vllm/__init__.py                            | 2 +-
 2 files changed, 2 insertions(+), 2 deletions(-)

diff --git a/.github/workflows/scripts/create_release.js b/.github/workflows/scripts/create_release.js
index 0f25624b4c21c..475742118afeb 100644
--- a/.github/workflows/scripts/create_release.js
+++ b/.github/workflows/scripts/create_release.js
@@ -8,7 +8,7 @@ module.exports = async (github, context, core) => {
 			generate_release_notes: true,
 			name: process.env.RELEASE_TAG,
 			owner: context.repo.owner,
-			prerelease: false,
+			prerelease: true,
 			repo: context.repo.repo,
 			tag_name: process.env.RELEASE_TAG,
 		});
diff --git a/vllm/__init__.py b/vllm/__init__.py
index ca454efd44b24..59810da3ca411 100644
--- a/vllm/__init__.py
+++ b/vllm/__init__.py
@@ -9,7 +9,7 @@
 from vllm.outputs import CompletionOutput, RequestOutput
 from vllm.sampling_params import SamplingParams
 
-__version__ = "0.4.1"
+__version__ = "0.4.2"
 
 __all__ = [
     "LLM",

From c7f2cf2b7f67bce5842fedfdba508440fe257375 Mon Sep 17 00:00:00 2001
From: Simon Mo <simon.mo@hey.com>
Date: Sat, 4 May 2024 21:28:58 -0700
Subject: [PATCH 58/65] [CI] Reduce wheel size by not shipping debug symbols
 (#4602)

---
 .buildkite/check-wheel-size.py | 3 +++
 .github/workflows/publish.yml  | 2 ++
 2 files changed, 5 insertions(+)

diff --git a/.buildkite/check-wheel-size.py b/.buildkite/check-wheel-size.py
index 8178fba552c46..90a5e54736cf3 100644
--- a/.buildkite/check-wheel-size.py
+++ b/.buildkite/check-wheel-size.py
@@ -25,6 +25,9 @@ def check_wheel_size(directory):
                         f"compare to the allowed size ({MAX_SIZE_MB} MB).")
                     print_top_10_largest_files(wheel_path)
                     return 1
+                else:
+                    print(f"Wheel {wheel_path} is within the allowed size "
+                          f"({wheel_size_mb} MB).")
     return 0
 
 
diff --git a/.github/workflows/publish.yml b/.github/workflows/publish.yml
index d79681f03b003..ac60ce0fed14a 100644
--- a/.github/workflows/publish.yml
+++ b/.github/workflows/publish.yml
@@ -79,6 +79,8 @@ jobs:
 
       - name: Build wheel
         shell: bash
+        env:
+          CMAKE_BUILD_TYPE: Release # do not compile with debug symbol to reduce wheel size
         run: |
           bash -x .github/workflows/scripts/build.sh ${{ matrix.python-version }} ${{ matrix.cuda-version }}
           wheel_name=$(ls dist/*whl | xargs -n 1 basename)

From 0650e5935b0f6af35fb2acf71769982c47b804d7 Mon Sep 17 00:00:00 2001
From: zhaoyang-star <zhaoyangstar@foxmail.com>
Date: Mon, 6 May 2024 07:58:55 +0800
Subject: [PATCH 59/65] Disable cuda version check in vllm-openai image (#4530)

---
 vllm/config.py | 11 +----------
 vllm/utils.py  | 24 +-----------------------
 2 files changed, 2 insertions(+), 33 deletions(-)

diff --git a/vllm/config.py b/vllm/config.py
index 13bb294591725..5c3a8615eefb4 100644
--- a/vllm/config.py
+++ b/vllm/config.py
@@ -4,15 +4,13 @@
 from typing import TYPE_CHECKING, ClassVar, List, Optional, Union
 
 import torch
-from packaging.version import Version
 from transformers import PretrainedConfig
 
 from vllm.logger import init_logger
 from vllm.model_executor.layers.quantization import (QUANTIZATION_METHODS,
                                                      get_quantization_config)
 from vllm.transformers_utils.config import get_config, get_hf_text_config
-from vllm.utils import (get_cpu_memory, get_nvcc_cuda_version, is_cpu, is_hip,
-                        is_neuron)
+from vllm.utils import get_cpu_memory, is_cpu, is_hip, is_neuron
 
 GPTQMarlinConfig = get_quantization_config("gptq_marlin")
 
@@ -369,13 +367,6 @@ def _verify_cache_dtype(self) -> None:
         if self.cache_dtype == "auto":
             pass
         elif self.cache_dtype == "fp8":
-            if not is_hip():
-                nvcc_cuda_version = get_nvcc_cuda_version()
-                if nvcc_cuda_version is not None \
-                        and nvcc_cuda_version < Version("11.8"):
-                    raise ValueError(
-                        "FP8 is not supported when cuda version is"
-                        "lower than 11.8.")
             logger.info(
                 "Using fp8 data type to store kv cache. It reduces the GPU "
                 "memory footprint and boosts the performance. "
diff --git a/vllm/utils.py b/vllm/utils.py
index b06c8508757c5..6479a8dab320a 100644
--- a/vllm/utils.py
+++ b/vllm/utils.py
@@ -19,7 +19,6 @@
 
 import psutil
 import torch
-from packaging.version import Version, parse
 
 import vllm.envs as envs
 from vllm.logger import enable_trace_function_call, init_logger
@@ -314,27 +313,6 @@ def cdiv(a: int, b: int) -> int:
     return -(a // -b)
 
 
-@lru_cache(maxsize=None)
-def get_nvcc_cuda_version() -> Optional[Version]:
-    cuda_home = envs.CUDA_HOME
-    if not cuda_home:
-        cuda_home = '/usr/local/cuda'
-        if os.path.isfile(cuda_home + '/bin/nvcc'):
-            logger.info(
-                'CUDA_HOME is not found in the environment. '
-                'Using %s as CUDA_HOME.', cuda_home)
-        else:
-            logger.warning('Not found nvcc in %s. Skip cuda version check!',
-                           cuda_home)
-            return None
-    nvcc_output = subprocess.check_output([cuda_home + "/bin/nvcc", "-V"],
-                                          universal_newlines=True)
-    output = nvcc_output.split()
-    release_idx = output.index("release") + 1
-    nvcc_cuda_version = parse(output[release_idx].split(",")[0])
-    return nvcc_cuda_version
-
-
 def _generate_random_fp8(
     tensor: torch.tensor,
     low: float,
@@ -560,7 +538,7 @@ def maybe_expand_dim(tensor: torch.Tensor,
 def merge_dicts(dict1: Dict[Any, List[Any]],
                 dict2: Dict[Any, List[Any]]) -> Dict[Any, List[Any]]:
     """Merge 2 dicts that have key -> List of items.
-    
+
     When a key conflicts, the values in dict1 is prioritized.
     """
     merged_dict = defaultdict(list)

From 323f27b9048713cdbab31995265975842a937167 Mon Sep 17 00:00:00 2001
From: Cyrus Leung <tlleungac@connect.ust.hk>
Date: Tue, 7 May 2024 00:31:05 +0800
Subject: [PATCH 60/65] [Bugfix] Fix `asyncio.Task` not being subscriptable
 (#4623)

---
 vllm/engine/async_llm_engine.py       | 6 +++---
 vllm/entrypoints/openai/api_server.py | 4 ++--
 2 files changed, 5 insertions(+), 5 deletions(-)

diff --git a/vllm/engine/async_llm_engine.py b/vllm/engine/async_llm_engine.py
index 9f72a0d11974f..37a2dc77a3b50 100644
--- a/vllm/engine/async_llm_engine.py
+++ b/vllm/engine/async_llm_engine.py
@@ -1,8 +1,8 @@
 import asyncio
 import time
 from functools import partial
-from typing import (Any, AsyncIterator, Callable, Dict, Iterable, List,
-                    Optional, Set, Tuple, Type, Union)
+from typing import (AsyncIterator, Callable, Dict, Iterable, List, Optional,
+                    Set, Tuple, Type, Union)
 
 from transformers import PreTrainedTokenizer
 
@@ -327,7 +327,7 @@ def __init__(self,
         # We need to keep a reference to unshielded
         # task as well to prevent it from being garbage
         # collected
-        self._background_loop_unshielded: Optional[asyncio.Task[Any]] = None
+        self._background_loop_unshielded: Optional[asyncio.Task] = None
         self.start_engine_loop = start_engine_loop
         self._errored_with: Optional[BaseException] = None
 
diff --git a/vllm/entrypoints/openai/api_server.py b/vllm/entrypoints/openai/api_server.py
index f9e294af47253..44a946f2e32d4 100644
--- a/vllm/entrypoints/openai/api_server.py
+++ b/vllm/entrypoints/openai/api_server.py
@@ -4,7 +4,7 @@
 import re
 from contextlib import asynccontextmanager
 from http import HTTPStatus
-from typing import Any, Set
+from typing import Set
 
 import fastapi
 import uvicorn
@@ -34,7 +34,7 @@
 openai_serving_completion: OpenAIServingCompletion
 logger = init_logger(__name__)
 
-_running_tasks: Set[asyncio.Task[Any]] = set()
+_running_tasks: Set[asyncio.Task] = set()
 
 
 @asynccontextmanager

From e186d37cb135107a09cd684e4fa2cf30c0ce6f28 Mon Sep 17 00:00:00 2001
From: Simon Mo <simon.mo@hey.com>
Date: Mon, 6 May 2024 15:23:36 -0700
Subject: [PATCH 61/65] [CI] use ccache actions properly in release workflow
 (#4629)

---
 .github/workflows/publish.yml | 3 +++
 1 file changed, 3 insertions(+)

diff --git a/.github/workflows/publish.yml b/.github/workflows/publish.yml
index ac60ce0fed14a..9c35ede5f6781 100644
--- a/.github/workflows/publish.yml
+++ b/.github/workflows/publish.yml
@@ -58,6 +58,9 @@ jobs:
 
       - name: Setup ccache
         uses: hendrikmuhs/ccache-action@v1.2
+        with:
+          create-symlink: true
+          key: ${{ github.job }}-${{ matrix.python-version }}-${{ matrix.cuda-version }}
 
       - name: Set up Linux Env
         if: ${{ runner.os == 'Linux' }}

From 19cb4716ee700e5d8baa64d7cf14fb5da3737f6d Mon Sep 17 00:00:00 2001
From: Cade Daniel <edacih@gmail.com>
Date: Mon, 6 May 2024 16:18:57 -0700
Subject: [PATCH 62/65] [CI] Add retry for agent lost (#4633)

---
 .buildkite/test-template.j2 | 4 ++++
 1 file changed, 4 insertions(+)

diff --git a/.buildkite/test-template.j2 b/.buildkite/test-template.j2
index ea02b6b1e9c9e..919a09e1cc064 100644
--- a/.buildkite/test-template.j2
+++ b/.buildkite/test-template.j2
@@ -14,6 +14,8 @@ steps:
       automatic:
         - exit_status: -1  # Agent was lost
           limit: 5
+        - exit_status: -10  # Agent was lost
+          limit: 5
   - wait
 
   - group: "AMD Tests"
@@ -53,6 +55,8 @@ steps:
       automatic:
         - exit_status: -1  # Agent was lost
           limit: 5
+        - exit_status: -10  # Agent was lost
+          limit: 5
     plugins:
       - kubernetes:
           podSpec:

From bd99d226295776011f4ea4831498a7103bc4e43b Mon Sep 17 00:00:00 2001
From: Noam Gat <noamgat@gmail.com>
Date: Tue, 7 May 2024 02:51:59 +0300
Subject: [PATCH 63/65] Update lm-format-enforcer to 0.10.1 (#4631)

---
 requirements-common.txt | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/requirements-common.txt b/requirements-common.txt
index 3abb828116680..bd779d5acb68e 100644
--- a/requirements-common.txt
+++ b/requirements-common.txt
@@ -14,7 +14,7 @@ pydantic >= 2.0  # Required for OpenAI server.
 prometheus_client >= 0.18.0
 prometheus-fastapi-instrumentator >= 7.0.0
 tiktoken == 0.6.0  # Required for DBRX tokenizer
-lm-format-enforcer == 0.9.8
+lm-format-enforcer == 0.10.1
 outlines == 0.0.34 # Requires torch >= 2.1.0
 typing_extensions
 filelock >= 3.10.4 # filelock starts to support `mode` argument from 3.10.4

From a98187cf7227695819e199e2e3ad35be0a9a84f3 Mon Sep 17 00:00:00 2001
From: Philipp Moritz <pcmoritz@gmail.com>
Date: Mon, 6 May 2024 17:39:28 -0700
Subject: [PATCH 64/65] [Kernel] Make static FP8 scaling more robust (#4570)
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

Previously, FP8 static scaling worked only if the scales overestimated the maxima of all activation tensors during computation. However, this is not always the case, even if the scales were calibrated very carefully. For example, with the activations in my checkpoint

https://huggingface.co/pcmoritz/Mixtral-8x7B-v0.1-fp8-act-scale

(which was calibrated on https://huggingface.co/datasets/HuggingFaceH4/ultrachat_200k), I'm getting the following mostly random performance on MMLU:

|      Groups      |Version|Filter|n-shot|Metric|Value |   |Stderr|
|------------------|-------|------|-----:|------|-----:|---|-----:|
|mmlu              |N/A    |none  |     0|acc   |0.2295|±  |0.0035|
| - humanities     |N/A    |none  |     5|acc   |0.2421|±  |0.0062|
| - other          |N/A    |none  |     5|acc   |0.2398|±  |0.0076|
| - social_sciences|N/A    |none  |     5|acc   |0.2171|±  |0.0074|
| - stem           |N/A    |none  |     5|acc   |0.2125|±  |0.0073|

With the fix in this PR, where the scaled activations are clamped to [-std::numeric_limits<c10::Float8_e4m3fn>::max(), std::numeric_limits<c10::Float8_e4m3fn>::max()] to make sure there are no NaNs, the performance is:

|      Groups      |Version|Filter|n-shot|Metric|Value |   |Stderr|
|------------------|-------|------|-----:|------|-----:|---|-----:|
|mmlu              |N/A    |none  |     0|acc   |0.7008|±  |0.0036|
| - humanities     |N/A    |none  |     5|acc   |0.6453|±  |0.0065|
| - other          |N/A    |none  |     5|acc   |0.7692|±  |0.0072|
| - social_sciences|N/A    |none  |     5|acc   |0.8083|±  |0.0070|
| - stem           |N/A    |none  |     5|acc   |0.6115|±  |0.0083|

This is not perfect yet, but it is getting very close to the FP16 / dynamic activation scale performance.
---
 csrc/quantization/fp8/fp8_cuda_kernels.cu | 11 ++++++++++-
 1 file changed, 10 insertions(+), 1 deletion(-)

diff --git a/csrc/quantization/fp8/fp8_cuda_kernels.cu b/csrc/quantization/fp8/fp8_cuda_kernels.cu
index 2477051eb60d7..b9c5d39277ca5 100644
--- a/csrc/quantization/fp8/fp8_cuda_kernels.cu
+++ b/csrc/quantization/fp8/fp8_cuda_kernels.cu
@@ -17,6 +17,15 @@ __device__ __forceinline__ float atomicMaxFloat(float* addr, float value) {
     return old;
 }
 
+#define FP8_E4M3_MAX std::numeric_limits<c10::Float8_e4m3fn>::max()
+
+template<typename scalar_t>
+__device__ __forceinline__ c10::Float8_e4m3fn scaled_fp8_conversion(const scalar_t val, const float scale) {
+  float x = static_cast<float>(val) / scale;
+  float r = fmax(-FP8_E4M3_MAX, fmin(x, FP8_E4M3_MAX));
+  return static_cast<c10::Float8_e4m3fn>(r);
+}
+
 // Compute the absolute maximum m of the input tensor and store
 // m / float8_e4m3::max() in *scale. Each thread block performs a
 // reduction tree and the memory in scale is atomically updated.
@@ -67,7 +76,7 @@ __global__ void scaled_fp8_quant_kernel(
   int64_t num_elems) {
   int i = blockDim.x * blockIdx.x + threadIdx.x;
   while (i < num_elems) {
-    out[i] = static_cast<c10::Float8_e4m3fn>(input[i] / *scale);
+    out[i] = scaled_fp8_conversion(input[i], *scale);
     i += blockDim.x * gridDim.x;
   }
 }

From 63575bc2e197b85ce1c911421ff30c5459e35e9c Mon Sep 17 00:00:00 2001
From: youkaichao <youkaichao@gmail.com>
Date: Mon, 6 May 2024 21:30:27 -0700
Subject: [PATCH 65/65] [Core][Optimization] change python dict to pytorch
 tensor (#4607)

---
 csrc/cache.h                               |  2 +-
 csrc/cache_kernels.cu                      | 20 +++--------
 csrc/cpu/cache.cpp                         | 20 ++++-------
 tests/core/test_scheduler.py               |  8 ++---
 tests/kernels/test_cache.py                | 21 ++++++-----
 tests/worker/test_swap.py                  |  2 +-
 vllm/attention/backends/abstract.py        |  2 +-
 vllm/attention/backends/flash_attn.py      |  2 +-
 vllm/attention/backends/flashinfer.py      |  2 +-
 vllm/attention/backends/rocm_flash_attn.py |  2 +-
 vllm/attention/backends/torch_sdpa.py      |  2 +-
 vllm/attention/backends/xformers.py        |  2 +-
 vllm/attention/ops/paged_attn.py           |  2 +-
 vllm/core/scheduler.py                     | 41 +++++++++++-----------
 vllm/distributed/communication_op.py       |  7 ++++
 vllm/sequence.py                           |  6 ++--
 vllm/worker/cache_engine.py                |  2 +-
 vllm/worker/cpu_worker.py                  |  7 ++--
 vllm/worker/worker.py                      |  8 +++--
 19 files changed, 77 insertions(+), 81 deletions(-)

diff --git a/csrc/cache.h b/csrc/cache.h
index 4c142ce17f1b9..10871b3670bac 100644
--- a/csrc/cache.h
+++ b/csrc/cache.h
@@ -13,7 +13,7 @@ void swap_blocks(
 void copy_blocks(
   std::vector<torch::Tensor>& key_caches,
   std::vector<torch::Tensor>& value_caches,
-  const std::map<int64_t, std::vector<int64_t>>& block_mapping);
+  torch::Tensor& block_mapping);
 
 void reshape_and_cache(
   torch::Tensor& key,
diff --git a/csrc/cache_kernels.cu b/csrc/cache_kernels.cu
index 42f884c76c620..1e02f7fcbae4c 100644
--- a/csrc/cache_kernels.cu
+++ b/csrc/cache_kernels.cu
@@ -97,7 +97,7 @@ __global__ void copy_blocks_kernel(
 void copy_blocks(
   std::vector<torch::Tensor>& key_caches,
   std::vector<torch::Tensor>& value_caches,
-  const std::map<int64_t, std::vector<int64_t>>& block_mapping) {
+  torch::Tensor& block_mapping) {
   int num_layers = key_caches.size();
   TORCH_CHECK(num_layers == value_caches.size());
   if (num_layers == 0) {
@@ -114,17 +114,9 @@ void copy_blocks(
     key_cache_ptrs[layer_idx] = reinterpret_cast<int64_t>(key_caches[layer_idx].data_ptr());
     value_cache_ptrs[layer_idx] = reinterpret_cast<int64_t>(value_caches[layer_idx].data_ptr());
   }
-  // Create block mapping array.
-  std::vector<int64_t> block_mapping_vec;
-  for (const auto& pair : block_mapping) {
-    int64_t src_block_number = pair.first;
-    for (int64_t dst_block_number : pair.second) {
-      block_mapping_vec.push_back(src_block_number);
-      block_mapping_vec.push_back(dst_block_number);
-    }
-  }
-  int64_t* block_mapping_array = block_mapping_vec.data();
-  int num_pairs = block_mapping_vec.size() / 2;
+
+  // block_mapping is a 2D tensor with shape (num_pairs, 2).
+  int num_pairs = block_mapping.size(0);
 
   // Move the data structures to the GPU.
   // NOTE: This synchronizes the CPU and GPU.
@@ -132,8 +124,6 @@ void copy_blocks(
     key_cache_ptrs, {num_layers}, torch::kInt64).to(cache_device);
   torch::Tensor value_cache_ptrs_tensor = torch::from_blob(
     value_cache_ptrs, {num_layers}, torch::kInt64).to(cache_device);
-  torch::Tensor block_mapping_tensor = torch::from_blob(
-    block_mapping_array, {2 * num_pairs}, torch::kInt64).to(cache_device);
 
   // Launch the kernel.
   const int numel_per_block = key_caches[0][0].numel();
@@ -146,7 +136,7 @@ void copy_blocks(
       vllm::copy_blocks_kernel<scalar_t><<<grid, block, 0, stream>>>(
         key_cache_ptrs_tensor.data_ptr<int64_t>(),
         value_cache_ptrs_tensor.data_ptr<int64_t>(),
-        block_mapping_tensor.data_ptr<int64_t>(),
+        block_mapping.data_ptr<int64_t>(),
         numel_per_block);
     }));
 }
diff --git a/csrc/cpu/cache.cpp b/csrc/cpu/cache.cpp
index 7849a5df991b1..95e3f11900fde 100644
--- a/csrc/cpu/cache.cpp
+++ b/csrc/cpu/cache.cpp
@@ -8,16 +8,16 @@ template <typename scalar_t>
 void copy_blocks_cpu_impl(
     std::vector<torch::Tensor> &key_caches,
     std::vector<torch::Tensor> &value_caches,
-    const std::vector<std::pair<int64_t, int64_t>> mapping_pairs,
+    const torch::Tensor& mapping_pairs,
     const int element_num_per_block, const int layer_num) {
-  const size_t pair_num = mapping_pairs.size();
+  const size_t pair_num = mapping_pairs.size(0);
   const size_t block_bytes = sizeof(scalar_t) * element_num_per_block;
 #pragma omp parallel for collapse(2)
   for (int layer = 0; layer < layer_num; ++layer) {
     for (size_t pair = 0; pair < pair_num; ++pair) {
-      int64_t source_offset = element_num_per_block * mapping_pairs[pair].first;
+      int64_t source_offset = element_num_per_block * mapping_pairs[pair][0].item<int64_t>();
       int64_t target_offset =
-          element_num_per_block * mapping_pairs[pair].second;
+          element_num_per_block * mapping_pairs[pair][1].item<int64_t>();
       scalar_t *key_cache_ptr = key_caches[layer].data_ptr<scalar_t>();
       scalar_t *source_ptr = key_cache_ptr + source_offset;
       scalar_t *target_ptr = key_cache_ptr + target_offset;
@@ -83,26 +83,18 @@ void reshape_and_cache_cpu_impl(
 
 void copy_blocks(std::vector<torch::Tensor> &key_caches,
                  std::vector<torch::Tensor> &value_caches,
-                 const std::map<int64_t, std::vector<int64_t>> &block_mapping) {
+                 torch::Tensor& block_mapping) {
   int num_layers = key_caches.size();
   TORCH_CHECK(num_layers == value_caches.size());
   if (num_layers == 0) {
     return;
   }
 
-  std::vector<std::pair<int64_t, int64_t>> mapping_pairs;
-  mapping_pairs.reserve(block_mapping.size());
-  for (const auto &pair : block_mapping) {
-    for (const auto &dst : pair.second) {
-      mapping_pairs.emplace_back(pair.first, dst);
-    }
-  }
-
   const int element_num_per_block = key_caches[0][0].numel();
   VLLM_DISPATCH_FLOATING_TYPES(
       key_caches[0].scalar_type(), "copy_blocks_cpu_impl", [&] {
         CPU_KERNEL_GUARD_IN(copy_blocks_cpu_impl)
-        copy_blocks_cpu_impl<scalar_t>(key_caches, value_caches, mapping_pairs,
+        copy_blocks_cpu_impl<scalar_t>(key_caches, value_caches, block_mapping,
                                        element_num_per_block, num_layers);
         CPU_KERNEL_GUARD_OUT(copy_blocks_cpu_impl)
       });
diff --git a/tests/core/test_scheduler.py b/tests/core/test_scheduler.py
index 1358dffec8104..348169035ae97 100644
--- a/tests/core/test_scheduler.py
+++ b/tests/core/test_scheduler.py
@@ -568,7 +568,7 @@ def cannot_append_second_group(seq_group, num_lookahead_slots):
     # Both should be preempted, not swapped.
     assert output.blocks_to_swap_out == {}
     # Nothing is copied.
-    assert output.blocks_to_copy == {}
+    assert output.blocks_to_copy == []
 
 
 def test_decode_swap_beam_search():
@@ -618,7 +618,7 @@ def cannot_append_second_group(seq_group, num_lookahead_slots):
     # Both should be preempted, not swapped.
     assert output.blocks_to_swap_out == expected_swap_mapping
     # Nothing is copied.
-    assert output.blocks_to_copy == {}
+    assert output.blocks_to_copy == []
 
 
 def test_schedule_decode_blocks_to_copy_update():
@@ -650,7 +650,7 @@ def test_schedule_decode_blocks_to_copy_update():
     assert output.blocks_to_swap_out == {}
     # Since append_slot returns the source -> dist mapping, it should
     # applied.
-    assert output.blocks_to_copy == {2: [3]}
+    assert output.blocks_to_copy == [(2, 3)]
 
 
 def test_schedule_swapped_simple():
@@ -853,7 +853,7 @@ def test_schedule_swapped_blocks_to_copy():
     assert len(remaining_swapped) == 0
     assert len(output.decode_seq_groups) == 1
     assert len(output.prefill_seq_groups) == 0
-    assert output.blocks_to_copy == {2: [3]}
+    assert output.blocks_to_copy == [(2, 3)]
 
 
 def test_scheduling_budget():
diff --git a/tests/kernels/test_cache.py b/tests/kernels/test_cache.py
index ca215bb75837a..94a577139596e 100644
--- a/tests/kernels/test_cache.py
+++ b/tests/kernels/test_cache.py
@@ -63,12 +63,13 @@ def test_copy_blocks(
     src_blocks = random.sample(range(num_blocks), num_mappings)
     remainig_blocks = list(set(range(num_blocks)) - set(src_blocks))
     dst_blocks = random.sample(remainig_blocks, 2 * num_mappings)
-    block_mapping = {}
+    block_mapping = []
     for i in range(num_mappings):
         src = src_blocks[i]
         dst1 = dst_blocks[2 * i]
         dst2 = dst_blocks[2 * i + 1]
-        block_mapping[src] = [dst1, dst2]
+        block_mapping.append((src, dst1))
+        block_mapping.append((src, dst2))
 
     # Create the KV caches.
     key_caches, value_caches = kv_cache_factory(num_blocks, block_size,
@@ -81,15 +82,17 @@ def test_copy_blocks(
     cloned_value_caches = [value_cache.clone() for value_cache in value_caches]
 
     # Call the copy blocks kernel.
-    ops.copy_blocks(key_caches, value_caches, block_mapping)
+    block_mapping_tensor = torch.tensor(block_mapping,
+                                        dtype=torch.int64,
+                                        device=device).view(-1, 2)
+    ops.copy_blocks(key_caches, value_caches, block_mapping_tensor)
 
     # Run the reference implementation.
-    for src, dsts in block_mapping.items():
-        for dst in dsts:
-            for cloned_key_cache in cloned_key_caches:
-                cloned_key_cache[dst].copy_(cloned_key_cache[src])
-            for cloned_value_cache in cloned_value_caches:
-                cloned_value_cache[dst].copy_(cloned_value_cache[src])
+    for src, dst in block_mapping:
+        for cloned_key_cache in cloned_key_caches:
+            cloned_key_cache[dst].copy_(cloned_key_cache[src])
+        for cloned_value_cache in cloned_value_caches:
+            cloned_value_cache[dst].copy_(cloned_value_cache[src])
 
     # Compare the results.
     for key_cache, cloned_key_cache in zip(key_caches, cloned_key_caches):
diff --git a/tests/worker/test_swap.py b/tests/worker/test_swap.py
index 07bcd343a96a6..4d2d3add27d59 100644
--- a/tests/worker/test_swap.py
+++ b/tests/worker/test_swap.py
@@ -59,7 +59,7 @@ def test_swap() -> None:
         seq_group_metadata_list=[],
         blocks_to_swap_in={},
         blocks_to_swap_out=blocks_to_swap_out,
-        blocks_to_copy={},
+        blocks_to_copy=[],
     )
     worker.execute_model(execute_model_req=execute_model_req)
 
diff --git a/vllm/attention/backends/abstract.py b/vllm/attention/backends/abstract.py
index 61c9c81d8a7b8..b2b6e7ac810e3 100644
--- a/vllm/attention/backends/abstract.py
+++ b/vllm/attention/backends/abstract.py
@@ -42,7 +42,7 @@ def swap_blocks(
     @abstractmethod
     def copy_blocks(
         kv_caches: List[torch.Tensor],
-        src_to_dists: Dict[int, List[int]],
+        src_to_dists: torch.Tensor,
     ) -> None:
         raise NotImplementedError
 
diff --git a/vllm/attention/backends/flash_attn.py b/vllm/attention/backends/flash_attn.py
index fc7501ed5e91f..da672d5df6161 100644
--- a/vllm/attention/backends/flash_attn.py
+++ b/vllm/attention/backends/flash_attn.py
@@ -48,7 +48,7 @@ def swap_blocks(
     @staticmethod
     def copy_blocks(
         kv_caches: List[torch.Tensor],
-        src_to_dists: Dict[int, List[int]],
+        src_to_dists: torch.Tensor,
     ) -> None:
         PagedAttention.copy_blocks(kv_caches, src_to_dists)
 
diff --git a/vllm/attention/backends/flashinfer.py b/vllm/attention/backends/flashinfer.py
index 8ab4b1f12ee36..2851cbe2396b2 100644
--- a/vllm/attention/backends/flashinfer.py
+++ b/vllm/attention/backends/flashinfer.py
@@ -48,7 +48,7 @@ def swap_blocks(
     @staticmethod
     def copy_blocks(
         kv_caches: List[torch.Tensor],
-        src_to_dists: Dict[int, List[int]],
+        src_to_dists: torch.Tensor,
     ) -> None:
         raise NotImplementedError
 
diff --git a/vllm/attention/backends/rocm_flash_attn.py b/vllm/attention/backends/rocm_flash_attn.py
index c411b3971b8f1..c3b522e63b4b8 100644
--- a/vllm/attention/backends/rocm_flash_attn.py
+++ b/vllm/attention/backends/rocm_flash_attn.py
@@ -46,7 +46,7 @@ def swap_blocks(
     @staticmethod
     def copy_blocks(
         kv_caches: List[torch.Tensor],
-        src_to_dists: Dict[int, List[int]],
+        src_to_dists: torch.Tensor,
     ) -> None:
         PagedAttention.copy_blocks(kv_caches, src_to_dists)
 
diff --git a/vllm/attention/backends/torch_sdpa.py b/vllm/attention/backends/torch_sdpa.py
index f75a279086a26..03825f6023f4c 100644
--- a/vllm/attention/backends/torch_sdpa.py
+++ b/vllm/attention/backends/torch_sdpa.py
@@ -44,7 +44,7 @@ def swap_blocks(
     @staticmethod
     def copy_blocks(
         kv_caches: List[torch.Tensor],
-        src_to_dists: Dict[int, List[int]],
+        src_to_dists: torch.Tensor,
     ) -> None:
         PagedAttention.copy_blocks(kv_caches, src_to_dists)
 
diff --git a/vllm/attention/backends/xformers.py b/vllm/attention/backends/xformers.py
index 60f6d43f2eaa4..4c7fa71a2c78e 100644
--- a/vllm/attention/backends/xformers.py
+++ b/vllm/attention/backends/xformers.py
@@ -49,7 +49,7 @@ def swap_blocks(
     @staticmethod
     def copy_blocks(
         kv_caches: List[torch.Tensor],
-        src_to_dists: Dict[int, List[int]],
+        src_to_dists: torch.Tensor,
     ) -> None:
         PagedAttention.copy_blocks(kv_caches, src_to_dists)
 
diff --git a/vllm/attention/ops/paged_attn.py b/vllm/attention/ops/paged_attn.py
index 00a0f10c0950b..6f7fd51c774f8 100644
--- a/vllm/attention/ops/paged_attn.py
+++ b/vllm/attention/ops/paged_attn.py
@@ -209,7 +209,7 @@ def swap_blocks(
     @staticmethod
     def copy_blocks(
         kv_caches: List[torch.Tensor],
-        src_to_dists: Dict[int, List[int]],
+        src_to_dists: torch.Tensor,
     ) -> None:
         key_caches = [kv_cache[0] for kv_cache in kv_caches]
         value_caches = [kv_cache[1] for kv_cache in kv_caches]
diff --git a/vllm/core/scheduler.py b/vllm/core/scheduler.py
index a9e0b05b8db67..de3ecd24e52db 100644
--- a/vllm/core/scheduler.py
+++ b/vllm/core/scheduler.py
@@ -13,7 +13,6 @@
 from vllm.lora.request import LoRARequest
 from vllm.sequence import (Sequence, SequenceData, SequenceGroup,
                            SequenceGroupMetadata, SequenceStatus)
-from vllm.utils import merge_dicts
 
 logger = init_logger(__name__)
 
@@ -122,8 +121,8 @@ class SchedulerOutputs:
     blocks_to_swap_in: Dict[int, int]
     # Blocks to swap out. Dict of GPU -> CPU block number.
     blocks_to_swap_out: Dict[int, int]
-    # Blocks to copy. Source to a list of dest blocks.
-    blocks_to_copy: Dict[int, List[int]]
+    # Blocks to copy. Source to dest block.
+    blocks_to_copy: List[Tuple[int, int]]
     # Sequence groups that are going to be ignored.
     ignored_seq_groups: List[SequenceGroup]
     # The number of slots for lookahead decoding.
@@ -177,7 +176,7 @@ class SchedulerRunningOutputs:
     # The blocks to swap out.
     blocks_to_swap_out: Dict[int, int]
     # The blocks to copy.
-    blocks_to_copy: Dict[int, List[int]]
+    blocks_to_copy: List[Tuple[int, int]]
     # The number of slots for lookahead decoding.
     num_lookahead_slots: int
 
@@ -189,7 +188,7 @@ def create_empty(cls) -> "SchedulerRunningOutputs":
             preempted=[],
             swapped_out=[],
             blocks_to_swap_out={},
-            blocks_to_copy={},
+            blocks_to_copy=[],
             num_lookahead_slots=0,
         )
 
@@ -209,7 +208,7 @@ class SchedulerSwappedInOutputs:
     # The blocks to swap in.
     blocks_to_swap_in: Dict[int, int]
     # The blocks to copy.
-    blocks_to_copy: Dict[int, List[int]]
+    blocks_to_copy: List[Tuple[int, int]]
     # The number of slots for lookahead decoding.
     num_lookahead_slots: int
     # Infeasible sequence groups.
@@ -221,7 +220,7 @@ def create_empty(cls) -> "SchedulerSwappedInOutputs":
             decode_seq_groups=[],
             prefill_seq_groups=[],
             blocks_to_swap_in={},
-            blocks_to_copy={},
+            blocks_to_copy=[],
             num_lookahead_slots=0,
             infeasible_seq_groups=[],
         )
@@ -394,7 +393,7 @@ def _schedule_running(
         """
         # Blocks that need to be swapped or copied before model execution.
         blocks_to_swap_out: Dict[int, int] = {}
-        blocks_to_copy: Dict[int, List[int]] = {}
+        blocks_to_copy: List[Tuple[int, int]] = []
 
         decode_seq_groups: List[ScheduledSequenceGroup] = []
         prefill_seq_groups: List[ScheduledSequenceGroup] = []
@@ -511,7 +510,7 @@ def _schedule_swapped(
         """
         # Blocks that need to be swapped or copied before model execution.
         blocks_to_swap_in: Dict[int, int] = {}
-        blocks_to_copy: Dict[int, List[int]] = {}
+        blocks_to_copy: List[Tuple[int, int]] = []
         decode_seq_groups: List[ScheduledSequenceGroup] = []
         prefill_seq_groups: List[ScheduledSequenceGroup] = []
         now = time.time()
@@ -794,8 +793,8 @@ def _schedule_default(self) -> SchedulerOutputs:
             num_batched_tokens=budget.num_batched_tokens,
             blocks_to_swap_in=swapped_in.blocks_to_swap_in,
             blocks_to_swap_out=running_scheduled.blocks_to_swap_out,
-            blocks_to_copy=merge_dicts(running_scheduled.blocks_to_copy,
-                                       swapped_in.blocks_to_copy),
+            blocks_to_copy=running_scheduled.blocks_to_copy +
+            swapped_in.blocks_to_copy,
             ignored_seq_groups=prefills.ignored_seq_groups +
             swapped_in.infeasible_seq_groups,
             num_lookahead_slots=running_scheduled.num_lookahead_slots,
@@ -882,8 +881,8 @@ def _schedule_chunked_prefill(self):
             num_batched_tokens=budget.num_batched_tokens,
             blocks_to_swap_in=swapped_in.blocks_to_swap_in,
             blocks_to_swap_out=running_scheduled.blocks_to_swap_out,
-            blocks_to_copy=merge_dicts(running_scheduled.blocks_to_copy,
-                                       swapped_in.blocks_to_copy),
+            blocks_to_copy=running_scheduled.blocks_to_copy +
+            swapped_in.blocks_to_copy,
             ignored_seq_groups=prefills.ignored_seq_groups,
             num_lookahead_slots=running_scheduled.num_lookahead_slots,
             running_queue_size=len(self.running),
@@ -1011,17 +1010,18 @@ def _allocate_and_set_running(self, seq_group: SequenceGroup) -> None:
     def _append_slots(
         self,
         seq_group: SequenceGroup,
-        blocks_to_copy: Dict[int, List[int]],
+        blocks_to_copy: List[Tuple[int, int]],
     ) -> None:
         """Appends new slots to the sequences in the given sequence group.
 
         Args:
             seq_group (SequenceGroup): The sequence group containing the
                 sequences to append slots to.
-            blocks_to_copy (Dict[int, List[int]]): A dictionary mapping source
-                block indices to lists of destination block indices. This
-                dictionary is updated with the new source and destination block
-                indices for the appended slots.
+            blocks_to_copy (List[Tuple[int, int]]): A list of two-int
+                tuples, where the first int is the source block index and
+                the second int is the destination block index. This list is
+                extended in place with the new source and destination block
+                indices for the appended slots.
         """
         num_lookahead_slots = self._get_num_lookahead_slots(is_prefill=False)
 
@@ -1029,9 +1029,8 @@ def _append_slots(
             cows = self.block_manager.append_slots(seq, num_lookahead_slots)
 
             for src, dests in cows.items():
-                if src not in blocks_to_copy:
-                    blocks_to_copy[src] = []
-                blocks_to_copy[src].extend(dests)
+                for dest in dests:
+                    blocks_to_copy.append((src, dest))
 
     def _preempt(
         self,
diff --git a/vllm/distributed/communication_op.py b/vllm/distributed/communication_op.py
index b539a7beedbfe..817bd6d812e48 100644
--- a/vllm/distributed/communication_op.py
+++ b/vllm/distributed/communication_op.py
@@ -203,6 +203,9 @@ def broadcast_tensor_dict(
                                                 group=metadata_group)
         async_handles = []
         for tensor in tensor_list:
+            if tensor.numel() == 0:
+                # Skip broadcasting empty tensors.
+                continue
             async_handles.append(
                 torch.distributed.broadcast(tensor,
                                             src=src,
@@ -224,6 +227,10 @@ def broadcast_tensor_dict(
                 tensor = torch.empty(value.size,
                                      dtype=value.dtype,
                                      device="cuda")
+                if tensor.numel() == 0:
+                    # Skip broadcasting empty tensors.
+                    tensor_dict[key] = tensor
+                    continue
                 async_handle = torch.distributed.broadcast(tensor,
                                                            src=src,
                                                            async_op=True,
diff --git a/vllm/sequence.py b/vllm/sequence.py
index f2939eff7959b..b486d1fedebd3 100644
--- a/vllm/sequence.py
+++ b/vllm/sequence.py
@@ -2,7 +2,7 @@
 import copy
 import enum
 from dataclasses import dataclass, field
-from typing import TYPE_CHECKING, Dict, List, Optional, Union
+from typing import TYPE_CHECKING, Dict, List, Optional, Tuple, Union
 
 from vllm.block import LogicalTokenBlock
 from vllm.lora.request import LoRARequest
@@ -745,8 +745,8 @@ class ExecuteModelRequest:
     blocks_to_swap_in: Dict[int, int] = field(default_factory=dict)
     # Blocks to swap out. Dict of GPU -> CPU block number.
     blocks_to_swap_out: Dict[int, int] = field(default_factory=dict)
-    # Blocks to copy. Source to a list of dest blocks.
-    blocks_to_copy: Dict[int, List[int]] = field(default_factory=dict)
+    # Blocks to copy. Source to dest block.
+    blocks_to_copy: List[Tuple[int, int]] = field(default_factory=list)
     # The number of slots for lookahead decoding.
     num_lookahead_slots: int = 0
     # The number of requests in the running queue.
diff --git a/vllm/worker/cache_engine.py b/vllm/worker/cache_engine.py
index c34ee0648626b..26a60c652b6f4 100644
--- a/vllm/worker/cache_engine.py
+++ b/vllm/worker/cache_engine.py
@@ -77,7 +77,7 @@ def swap_out(self, src_to_dst: Dict[int, int]) -> None:
             self.attn_backend.swap_blocks(self.gpu_cache[i], self.cpu_cache[i],
                                           src_to_dst)
 
-    def copy(self, src_to_dsts: Dict[int, List[int]]) -> None:
+    def copy(self, src_to_dsts: torch.Tensor) -> None:
         self.attn_backend.copy_blocks(self.gpu_cache, src_to_dsts)
 
     @staticmethod
diff --git a/vllm/worker/cpu_worker.py b/vllm/worker/cpu_worker.py
index 4420d4cc9e12f..e1ef500ac07b8 100644
--- a/vllm/worker/cpu_worker.py
+++ b/vllm/worker/cpu_worker.py
@@ -248,9 +248,9 @@ def _init_cache_engine(self) -> None:
 
     def cache_copy(
         self,
-        blocks_to_copy: Dict[int, List[int]],
+        blocks_to_copy: torch.Tensor,
     ) -> None:
-        if blocks_to_copy:
+        if blocks_to_copy.numel() > 0:
             self.cache_engine.copy(blocks_to_copy)
 
     @torch.inference_mode()
@@ -269,6 +269,9 @@ def execute_model(
             num_seq_groups: int = len(seq_group_metadata_list)
             assert execute_model_req is not None
             blocks_to_copy = execute_model_req.blocks_to_copy
+            blocks_to_copy = torch.tensor(execute_model_req.blocks_to_copy,
+                                          device="cpu",
+                                          dtype=torch.int64).view(-1, 2)
             assert len(execute_model_req.blocks_to_swap_in) == 0
             assert len(execute_model_req.blocks_to_swap_out) == 0
             data: Dict[str, Any] = {
diff --git a/vllm/worker/worker.py b/vllm/worker/worker.py
index 4add36e94f723..538332ad003f1 100644
--- a/vllm/worker/worker.py
+++ b/vllm/worker/worker.py
@@ -197,7 +197,7 @@ def cache_swap(
         self,
         blocks_to_swap_in: Dict[int, int],
         blocks_to_swap_out: Dict[int, int],
-        blocks_to_copy: Dict[int, List[int]],
+        blocks_to_copy: torch.Tensor,
     ) -> None:
         # Issue cache operations.
         # TODO(woosuk): Profile swapping overhead and optimize if needed.
@@ -205,7 +205,7 @@ def cache_swap(
             self.cache_engine.swap_in(blocks_to_swap_in)
         if blocks_to_swap_out:
             self.cache_engine.swap_out(blocks_to_swap_out)
-        if blocks_to_copy:
+        if blocks_to_copy.numel() > 0:
             self.cache_engine.copy(blocks_to_copy)
 
     @torch.inference_mode()
@@ -225,7 +225,9 @@ def execute_model(
             num_seq_groups = len(seq_group_metadata_list)
             blocks_to_swap_in = execute_model_req.blocks_to_swap_in
             blocks_to_swap_out = execute_model_req.blocks_to_swap_out
-            blocks_to_copy = execute_model_req.blocks_to_copy
+            blocks_to_copy = torch.tensor(execute_model_req.blocks_to_copy,
+                                          device=self.device,
+                                          dtype=torch.int64).view(-1, 2)
             data: Dict[str, Any] = {
                 "num_seq_groups": num_seq_groups,
                 "blocks_to_swap_in": blocks_to_swap_in,