From 4578c2490b086ef786bfdcf3755074a1259af2c6 Mon Sep 17 00:00:00 2001
From: ByronHsu
Date: Fri, 1 Mar 2024 14:16:35 -0800
Subject: [PATCH 01/13] [zero++] Synchronize at the end of secondary partitioning and simplify the logic (#5216)

## 1. Why?

We have a very long thread investigating [the issue](https://github.com/microsoft/DeepSpeed/issues/5059). To summarize, it happens because:

a. The secondary partitioning is asynchronous, since it copies device-to-device from the full tensor to the secondary tensor.
b. With prefetching enabled, the all-gather of the secondary tensor can start before the secondary partitioning has finished, at which point the secondary tensor may still contain bad (stale) values.

![image](https://github.com/microsoft/DeepSpeed/assets/24364830/ad6ee6a2-8e1e-4214-a0d2-ee5314b252b8)

We also found that the copy logic was wrong and lengthy, so we simplified it to only two lines.

Kudos to @yundai424, Haowen Ning, @samadejacobs for the investigation effort.

## 2. What?

After multiple careful tests, we found that adding `get_accelerator().current_stream().synchronize()` at the end of the secondary partitioning, so that the partitioning copy is guaranteed to finish before any later all-gather reads the secondary tensor, prevents the issue.

## 3. Tests

I validated the correctness of the simplified secondary-partition logic. The loss is "exactly" the same before and after the simplification under the same random seed.

Before
```
[
 {"loss": 2.0731},
 {"loss": 2.0288},
 {"loss": 1.927},
 {"loss": 1.8347},
 {"loss": 1.8347},
 {"loss": 1.7896},
 {"loss": 1.602},
 {"loss": 1.766},
 {"loss": 1.8751},
 {"loss": 1.6776}
]
```

After
```
[
 {"loss": 2.0731},
 {"loss": 2.0288},
 {"loss": 1.927},
 {"loss": 1.8347},
 {"loss": 1.8347},
 {"loss": 1.7896},
 {"loss": 1.602},
 {"loss": 1.766},
 {"loss": 1.8751},
 {"loss": 1.6776}
]
```

## 4. TODO

Further investigation is needed (@samadejacobs):
1) Revisit the ZeRO-3 prefetch design.
2) Refactor hpz to reuse the primary tensor for the secondary partition.
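Condensed, the patched portion of `_partition_param_sec` amounts to the following sketch (same names as the diff below; `param` is assumed to be a ZeRO-3 parameter carrying the usual `ds_*` attributes):

```python
# Minimal sketch of the simplified secondary partitioning plus the new sync
# (mirrors the diff below; `param` is a ZeRO-3 parameter with ds_* attributes).
from deepspeed.accelerator import get_accelerator

def partition_param_sec(param, secondary_start, secondary_partition_size):
    one_dim_param = param.contiguous().view(-1)

    # ds_numel is unpadded, so the last chunk may be shorter than a full partition.
    secondary_end = secondary_start + secondary_partition_size
    sec_numel = (param.ds_numel - secondary_start
                 if secondary_end > param.ds_numel else secondary_partition_size)

    # Device-to-device copy from the full tensor into the secondary tensor; this
    # is asynchronous with respect to other CUDA streams.
    param.ds_secondary_tensor.narrow(0, 0, sec_numel).copy_(
        one_dim_param.narrow(0, secondary_start, sec_numel))

    # Without this, a prefetched all-gather can read ds_secondary_tensor before
    # the copy above has completed, producing the stale values described above.
    get_accelerator().current_stream().synchronize()
```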
--------- Signed-off-by: byhsu Co-authored-by: byhsu --- .../runtime/zero/partition_parameters.py | 21 ++++++++----------- 1 file changed, 9 insertions(+), 12 deletions(-) diff --git a/deepspeed/runtime/zero/partition_parameters.py b/deepspeed/runtime/zero/partition_parameters.py index 5cf655d8741a..142259c1b7df 100755 --- a/deepspeed/runtime/zero/partition_parameters.py +++ b/deepspeed/runtime/zero/partition_parameters.py @@ -1635,19 +1635,16 @@ def _partition_param_sec(self, param, buffer=None, has_been_updated=False): secondary_end = secondary_start + secondary_partition_size one_dim_param = param.contiguous().view(-1) - start = partition_size * self.rank - end = start + partition_size - if start < param.ds_numel and end <= param.ds_numel: - if secondary_start < param.ds_numel and secondary_end <= param.ds_numel: - sec_src_tensor = one_dim_param.narrow(0, secondary_start, secondary_partition_size) - param.ds_secondary_tensor.copy_(sec_src_tensor) - else: - if start < param.ds_numel: - elements_to_copy = param.ds_numel - start - elements_to_copy_sec = elements_to_copy * param.ds_secondary_tensor_num_of_groups - param.ds_secondary_tensor.narrow(0, 0, elements_to_copy_sec).copy_( - one_dim_param.narrow(0, secondary_start, elements_to_copy_sec)) + # ds_numel is unpadded, so the last chunk of the secondary tensor might not be secondary_partition_size + sec_numel = param.ds_numel - secondary_start if secondary_end > param.ds_numel else secondary_partition_size + + # copy from full tensor to secondary tensor + param.ds_secondary_tensor.narrow(0, 0, + sec_numel).copy_(one_dim_param.narrow(0, secondary_start, sec_numel)) + + # TODO: This is a temporary fix to avoid the issue that 2nd tensor all-gather happens before 2nd tensor partition is done + get_accelerator().current_stream().synchronize() print_rank_0(f"{param.ds_id} partitioned type {param.dtype} dev {param.device} shape {param.shape}", force=False) From bcc617a0009dd27b4e144de59979bd7770eaf57c Mon Sep 17 00:00:00 2001 From: Perry Zou Date: Sat, 2 Mar 2024 08:31:37 +0800 Subject: [PATCH 02/13] Add fp16 support of Qwen1.5 models (0.5B to 72B) to DeepSpeed-FastGen (#5219) MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Qwen1.5 is the beta version of Qwen2 This PR adds support for Qwen1.5 models 0.5B to 72B ### Test Code for mii pipeline: ```python import mii pipe = mii.pipeline("Qwen/Qwen1.5-0.5B") responses = pipe("DeepSpeed is", max_new_tokens=128, do_sample=False) if pipe.is_rank_0: print(responses[0]) ``` for huggingface: ```python from transformers import AutoModelForCausalLM, AutoTokenizer from transformers.generation import GenerationConfig import torch tokenizer = AutoTokenizer.from_pretrained("Qwen/Qwen1.5-0.5B") model = AutoModelForCausalLM.from_pretrained("Qwen/Qwen1.5-0.5B", device_map="auto", torch_dtype=torch.float16, trust_remote_code=True).eval() inputs = tokenizer('DeepSpeed is', return_tensors='pt') inputs = inputs.to(model.device) pred = model.generate(**inputs, max_new_tokens=128, do_sample=False, repetition_penalty=1.0) test = tokenizer.decode(pred.cpu()[0], skip_special_tokens=False) print(test) ``` ### Qwen1.5-0.5B Huggingface output with prompt "DeepSpeed is": ``` a new and innovative way to store and retrieve data in the cloud. It is a cloud-based data storage solution that allows you to store and retrieve data in the cloud. DeepSpeed is a cloud-based data storage solution that allows you to store and retrieve data in the cloud. 
It is a cloud-based data storage solution that allows you to store and retrieve data in the cloud. DeepSpeed is a cloud-based data storage solution that allows you to store and retrieve data in the cloud. It is a cloud-based data storage solution that allows you to store and retrieve data in the cloud. DeepSpeed is a cloud-based data storage solution that allows you ``` DeepSpeed-FastGen output with prompt "DeepSpeed is": ``` a new and innovative way to store and retrieve data in the cloud. It is a cloud-based data storage solution that allows you to store and retrieve data in the cloud. DeepSpeed is a cloud-based data storage solution that allows you to store and retrieve data in the cloud. It is a cloud-based data storage solution that allows you to store and retrieve data in the cloud. DeepSpeed is a cloud-based data storage solution that allows you to store and retrieve data in the cloud. It is a cloud-based data storage solution that allows you to store and retrieve data in the cloud. DeepSpeed is a cloud-based data storage solution that allows you ``` ### Qwen1.5-72B-Chat Huggingface output with prompt "DeepSpeed is" (for nice display, I use ''' to replace ```): ``` 为 PyTorch 提供的深度学习训练加速库,它集成了多种优化技术,包括混合精度训练、模型并行、数据并行、ZeRO 内存优化等,可以显著提高模型训练的速度和效率。以下是一个简单的使用 DeepSpeed 的例子: 首先,你需要安装 DeepSpeed。在你的终端中运行以下命令: '''bash pip install deepspeed ''' 然后,你可以使用 DeepSpeed 来训练你的 PyTorch 模型。以下是一个简单的例子,使用 ResNet-50 训练 CIFAR-10 数据集: '''python import torch ``` DeepSpeed-FastGen output with prompt "DeepSpeed is" with 8-way sharding: ``` 为 PyTorch 提供的深度学习训练加速库,它集成了多种优化技术,包括混合精度训练、模型并行、数据并行、ZeRO 内存优化等,可以显著提高模型训练的速度和效率。以下是一个简单的使用 DeepSpeed 的例子: 首先,你需要安装 DeepSpeed。在你的终端中运行以下命令: '''bash pip install deepspeed ''' 然后,你可以使用 DeepSpeed 来训练你的 PyTorch 模型。以下是一个简单的例子,使用 ResNet-50 训练 CIFAR-10 数据集: '''python import torch ``` Co-authored-by: Michael Wyatt --- deepspeed/inference/v2/engine_factory.py | 3 + .../v2/model_implementations/__init__.py | 1 + .../model_implementations/qwen_v2/__init__.py | 6 + .../qwen_v2/container.py | 82 +++++++ .../v2/model_implementations/qwen_v2/model.py | 221 ++++++++++++++++++ .../model_implementations/qwen_v2/policy.py | 31 +++ 6 files changed, 344 insertions(+) create mode 100644 deepspeed/inference/v2/model_implementations/qwen_v2/__init__.py create mode 100644 deepspeed/inference/v2/model_implementations/qwen_v2/container.py create mode 100644 deepspeed/inference/v2/model_implementations/qwen_v2/model.py create mode 100644 deepspeed/inference/v2/model_implementations/qwen_v2/policy.py diff --git a/deepspeed/inference/v2/engine_factory.py b/deepspeed/inference/v2/engine_factory.py index 9281640f844a..c320108f55e5 100644 --- a/deepspeed/inference/v2/engine_factory.py +++ b/deepspeed/inference/v2/engine_factory.py @@ -21,6 +21,7 @@ FalconPolicy, PhiPolicy, QwenPolicy, + Qwen2Policy, ) from .model_implementations.inference_policy_base import POLICIES, InferenceV2Policy from .model_implementations.flat_model_helpers import make_metadata_filename, ModelMetadata @@ -120,6 +121,8 @@ def build_hf_engine(path: str, policy = PhiPolicy(model_config, checkpoint_engine=checkpoint_engine) elif model_config.model_type == "qwen": policy = QwenPolicy(model_config, checkpoint_engine=checkpoint_engine) + elif model_config.model_type == "qwen2": + policy = Qwen2Policy(model_config, checkpoint_engine=checkpoint_engine) else: raise ValueError(f"Unsupported model type {model_config.model_type}") diff --git a/deepspeed/inference/v2/model_implementations/__init__.py 
b/deepspeed/inference/v2/model_implementations/__init__.py index 869c4316cdc7..14b0654a8c36 100644 --- a/deepspeed/inference/v2/model_implementations/__init__.py +++ b/deepspeed/inference/v2/model_implementations/__init__.py @@ -16,3 +16,4 @@ from .falcon import * from .phi import * from .qwen import * +from .qwen_v2 import * diff --git a/deepspeed/inference/v2/model_implementations/qwen_v2/__init__.py b/deepspeed/inference/v2/model_implementations/qwen_v2/__init__.py new file mode 100644 index 000000000000..80b09757c74d --- /dev/null +++ b/deepspeed/inference/v2/model_implementations/qwen_v2/__init__.py @@ -0,0 +1,6 @@ +# Copyright (c) Microsoft Corporation. +# SPDX-License-Identifier: Apache-2.0 + +# DeepSpeed Team + +from .policy import Qwen2Policy diff --git a/deepspeed/inference/v2/model_implementations/qwen_v2/container.py b/deepspeed/inference/v2/model_implementations/qwen_v2/container.py new file mode 100644 index 000000000000..6556d87d6afb --- /dev/null +++ b/deepspeed/inference/v2/model_implementations/qwen_v2/container.py @@ -0,0 +1,82 @@ +# Copyright (c) Microsoft Corporation. +# SPDX-License-Identifier: Apache-2.0 + +# DeepSpeed Team + +# Create a container object to save model-specific tensors using the policy file above. + +from ..common_parameters import * +from ..layer_container_base import LayerContainer +''' + # HF Qwen2 model looks like this: + +Qwen2ForCausalLM( + (model): Qwen2Model( + (embed_tokens): Embedding(151936, 1024) + (layers): ModuleList( + (0-23): 24 x Qwen2DecoderLayer( + (self_attn): Qwen2SdpaAttention( + (q_proj): Linear(in_features=1024, out_features=1024, bias=True) + (k_proj): Linear(in_features=1024, out_features=1024, bias=True) + (v_proj): Linear(in_features=1024, out_features=1024, bias=True) + (o_proj): Linear(in_features=1024, out_features=1024, bias=False) + (rotary_emb): Qwen2RotaryEmbedding() + ) + (mlp): Qwen2MLP( + (gate_proj): Linear(in_features=1024, out_features=2816, bias=False) + (up_proj): Linear(in_features=1024, out_features=2816, bias=False) + (down_proj): Linear(in_features=2816, out_features=1024, bias=False) + (act_fn): SiLU() + ) + (input_layernorm): Qwen2RMSNorm() + (post_attention_layernorm): Qwen2RMSNorm() + ) + ) + (norm): Qwen2RMSNorm() + ) + (lm_head): Linear(in_features=1024, out_features=151936, bias=False) +) +''' + + +class Qwen2TransformerContainer(LayerContainer): + """ + Transformer layer container for the Qwen2 model. + """ + qkv_w: UnfusedQKVParameter + qkv_b: UnfusedQKVParameter + attn_out_w: AttentionOutputParameter + mlp_1_w: GatedMLPParameter + mlp_2_w: MLP2Parameter + attn_norm_gamma: NormParameter + mlp_norm_gamma: NormParameter + + PARAM_MAPPING = { + "self_attn.q_proj.weight": "qkv_w.q_params", + "self_attn.k_proj.weight": "qkv_w.k_params", + "self_attn.v_proj.weight": "qkv_w.v_params", + "self_attn.q_proj.bias": "qkv_b.q_params", + "self_attn.k_proj.bias": "qkv_b.k_params", + "self_attn.v_proj.bias": "qkv_b.v_params", + "self_attn.o_proj.weight": "attn_out_w.params", + "mlp.gate_proj.weight": "mlp_1_w.gate_params", + "mlp.up_proj.weight": "mlp_1_w.up_params", + "mlp.down_proj.weight": "mlp_2_w.params", + "input_layernorm.weight": "attn_norm_gamma.params", + "post_attention_layernorm.weight": "mlp_norm_gamma.params", + } + + +class Qwen2NonTransformerContainer(LayerContainer): + """ + Non-Transformer layer container for the Qwen2 model. 
+ """ + word_emb: EmbeddingParameter + word_unembed: UnembedParameter + final_norm: NormParameter + + PARAM_MAPPING = { + "model.embed_tokens.weight": "word_emb.params", + "model.norm.weight": "final_norm.params", + "lm_head.weight": "word_unembed.params", + } diff --git a/deepspeed/inference/v2/model_implementations/qwen_v2/model.py b/deepspeed/inference/v2/model_implementations/qwen_v2/model.py new file mode 100644 index 000000000000..d535462a954d --- /dev/null +++ b/deepspeed/inference/v2/model_implementations/qwen_v2/model.py @@ -0,0 +1,221 @@ +# Copyright (c) Microsoft Corporation. +# SPDX-License-Identifier: Apache-2.0 + +# DeepSpeed Team + +from typing import Iterable, Optional, Tuple + +import torch + +import deepspeed.comm as dist + +from ...allocator import empty_from +from ...inference_utils import ActivationType, DtypeEnum +from .. import * +from ...modules.configs import * +from ...modules.interfaces import * +from ...modules import heuristics +from ...ragged import RaggedBatchWrapper + +from .container import Qwen2NonTransformerContainer, Qwen2TransformerContainer + + +class Qwen2InferenceModel(DSTransformerModelBase): + """ + Inference model implementation for ragged batching for Llama-2 models. + """ + + _non_transformer: Optional[Qwen2NonTransformerContainer] + """ + Embed + unembed container. Specializing the type annotation. + """ + + _transformer: Optional[Iterable[Qwen2TransformerContainer]] + """ + Per-layer transformer container. Specializing the type annotation. + """ + """ + Properties ineherited from `DSInferenceModelBase` + """ + + @property + def max_sequence_length(self) -> int: + return self._config.max_seq_length + + """ + Properties ineherited from `DSTransformerModelBase` + """ + + @property + def num_layers(self) -> int: + return self._config.num_hidden_layers + + @property + def model_dim(self) -> int: + return self._config.hidden_size + + @property + def vocab_size(self) -> int: + return self._config.vocab_size + + @property + def head_size(self) -> int: + return self.model_dim // self.n_heads + + @property + def n_heads(self) -> int: + return self._config.num_attention_heads + + @property + def intermediate_dim(self) -> int: + return self._config.intermediate_size + + @property + def n_heads_kv(self) -> int: + return self._config.num_key_value_heads + + @property + def activation_dtype(self) -> DtypeEnum: + # TODO(ZonePG): bf16 inference results may be different from huggingface bf16, + # because in rms_norm, Qwen still use float() instead of bf16 + # if self._config.torch_dtype == torch.float16: + # return DtypeEnum.fp16 + # elif self._config.torch_dtype == torch.bfloat16: + # return DtypeEnum.bf16 + # else: + # raise NotImplementedError("Only fp16 and bf16 are supported") + return DtypeEnum.fp16 + + @property + def mlp_activation_fn(self) -> ActivationType: + return ActivationType.SiGLU + + @property + def norm_type(self) -> NormTypeEnum: + return NormTypeEnum.RMSNorm + + @property + def positional_embedding_type(self) -> PositionalEmbeddingType: + return PositionalEmbeddingType.rotate_half + + @property + def positional_embedding_config(self) -> Optional[RotateHalfConfig]: + return RotateHalfConfig(theta_base=self._config.rope_theta) + + def make_norm_layer(self) -> None: + """ + Instantiates the normalization layer for the model. This sets the `self.norm` attribute. + + TODO(cmikeh2): In the future we'll distinguish between the different norm objects, + but for now we'll just use the same one for all of them. 
+ """ + norm_config = DSNormConfig( + max_tokens=self._engine_config.state_manager.max_ragged_batch_size, + type=self.norm_type, + channels=self.model_dim, + residual_dtype=self.activation_dtype, + input_dtype=self.activation_dtype, + output_dtype=self.activation_dtype, + eps=self._config.rms_norm_eps, + ) + + self.norm = heuristics.instantiate_pre_norm(norm_config, self._engine_config) + + """ + Forward implementations + """ + + def _forward_embed(self, ragged_batch: RaggedBatchWrapper) -> torch.Tensor: + """ + Performs the embedding lookup prior to running the transformer of the model. + + Arguments: + ragged_batch (RaggedBatchWrapper): The batch to embed. + + Returns: + torch.Tensor: The embedded batch. + """ + embed = self.embed(ragged_batch, self._non_transformer.word_emb) + + if embed.shape[-1] != self.model_dim: + raise ValueError(f"Embedding output shape {embed.shape} does not match model_dim {self.model_dim}") + + return embed + + def _forward_transformer_layer(self, layer_idx: int, residual: torch.Tensor, hidden_states: torch.Tensor, + ragged_batch_info: RaggedBatchWrapper) -> Tuple[torch.Tensor, torch.Tensor]: + """ + Executes one (slightly offset) layer of the transformer. This implementation does a peak-ahead + optimization to fuse the layer norm of the next layer into the current layer. + + Arguments: + layer_idx (int): The index of the layer to execute. + residual (torch.Tensor): The residual tensor from the previous layer. + hidden_states (torch.Tensor): The hidden states from the previous layer. This is the + hidden states after pre normalization. + ragged_batch_info (RaggedBatchWrapper): The batch metadata. + """ + # TODO(cmikeh2): Distribute ragged_batch_info to all modules + + cur_params = self._transformer[layer_idx] + kv_cache = self.state_manager.get_cache(layer_idx) + + hidden_states = self.qkv(hidden_states, cur_params.qkv_w, b=cur_params.qkv_b) + hidden_states = self.attn(hidden_states, kv_cache, ragged_batch_info) + hidden_states = self.attn_out(hidden_states, cur_params.attn_out_w, b=None) + + if self.tp_size > 1: + dist.all_reduce(hidden_states, group=self._base_mp_group) + + residual, hidden_states = self.norm(residual, hidden_states, cur_params.mlp_norm_gamma, beta=None) + + # Should be configurable in the future + hidden_states = self.mlp_1(hidden_states, cur_params.mlp_1_w, b=None) + hidden_states = self.mlp_2(hidden_states, cur_params.mlp_2_w, b=None) + + if self.tp_size > 1: + dist.all_reduce(hidden_states, group=self._base_mp_group) + + if layer_idx != self.num_layers - 1: + next_params = self._transformer[layer_idx + 1] + residual, hidden_states = self.norm(residual, hidden_states, next_params.attn_norm_gamma, beta=None) + else: + # On last layer, we just need to perform the residual add. Adding into the residual + # here is safe. + residual.add_(hidden_states) + + return residual, hidden_states + + def _forward_unembed(self, hidden_states: torch.Tensor, ragged_batch_info: RaggedBatchWrapper) -> torch.Tensor: + """ + Performs unembedding of the hidden states to logits. This will only sample the final + token of each sequence. 
+ """ + logits = self.unembed(hidden_states, + self._non_transformer.word_unembed, + ragged_batch_info, + gamma=self._non_transformer.final_norm) + + if self.tp_size > 1: + comm_buffer = empty_from(self._comm_logits, (self.tp_size, logits.shape[0], logits.shape[1])) + full_logits = empty_from(self._return_logits, (logits.shape[0], self.vocab_size)) + + dist.all_gather_into_tensor(comm_buffer, logits, group=self._base_mp_group) + + full_logits.copy_(comm_buffer.permute(1, 0, 2).reshape(logits.shape[0], self.vocab_size)) + + return full_logits + else: + return logits + + def forward(self, wrapped_batch: RaggedBatchWrapper) -> torch.Tensor: + + residual = self._forward_embed(wrapped_batch) + + residual, hidden_states = self.norm(residual, None, self._transformer[0].attn_norm_gamma, beta=None) + + for layer_idx in range(self.num_layers): + residual, hidden_states = self._forward_transformer_layer(layer_idx, residual, hidden_states, + wrapped_batch) + + return self._forward_unembed(residual, wrapped_batch) diff --git a/deepspeed/inference/v2/model_implementations/qwen_v2/policy.py b/deepspeed/inference/v2/model_implementations/qwen_v2/policy.py new file mode 100644 index 000000000000..9c5db2ba0065 --- /dev/null +++ b/deepspeed/inference/v2/model_implementations/qwen_v2/policy.py @@ -0,0 +1,31 @@ +# Copyright (c) Microsoft Corporation. +# SPDX-License-Identifier: Apache-2.0 + +# DeepSpeed Team + +from typing import Any + +from ...config_v2 import RaggedInferenceEngineConfig +from ..inference_policy_base import ContainerMap, InferenceV2Policy +from .container import Qwen2NonTransformerContainer, Qwen2TransformerContainer +from .model import Qwen2InferenceModel + + +class Qwen2Policy(InferenceV2Policy): + + def instantiate_model(self, engine_config: RaggedInferenceEngineConfig, mp_group: Any) -> Qwen2InferenceModel: + return Qwen2InferenceModel(config=self._model_config, engine_config=engine_config, base_mp_group=mp_group) + + def build_container_map(self) -> ContainerMap: + map = ContainerMap() + + transformer_containers = [Qwen2TransformerContainer(self.model) for _ in range(self.model.num_layers)] + + map.set_transformer_params(['model.layers'], transformer_containers) + + map.set_non_transformer_params(Qwen2NonTransformerContainer(self.model)) + + map.set_unmapped_params( + [f'model.layers.{i}.self_attn.rotary_emb.inv_freq' for i in range(self.model.num_layers)]) + + return map From 3e06a154b4d6a2f93c2c5504f52a932184def112 Mon Sep 17 00:00:00 2001 From: Logan Adams <114770087+loadams@users.noreply.github.com> Date: Mon, 4 Mar 2024 09:49:45 -0800 Subject: [PATCH 03/13] Rename nv-torch-latest-cpu workflow to cpu-torch-latest (#5226) Rename nv-torch-latest-cpu workflow to remove the nv- prefix as it does not run on nvidia hardware. 
FYI @delock --- .../workflows/{nv-torch-latest-cpu.yml => cpu-torch-latest.yml} | 2 +- README.md | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) rename .github/workflows/{nv-torch-latest-cpu.yml => cpu-torch-latest.yml} (98%) diff --git a/.github/workflows/nv-torch-latest-cpu.yml b/.github/workflows/cpu-torch-latest.yml similarity index 98% rename from .github/workflows/nv-torch-latest-cpu.yml rename to .github/workflows/cpu-torch-latest.yml index 60f9332f835d..ba4906db15c9 100644 --- a/.github/workflows/nv-torch-latest-cpu.yml +++ b/.github/workflows/cpu-torch-latest.yml @@ -1,4 +1,4 @@ -name: nv-torch-latest-cpu +name: cpu-torch-latest on: workflow_dispatch: diff --git a/README.md b/README.md index c78c08e65498..02dc3aec7f7f 100755 --- a/README.md +++ b/README.md @@ -130,7 +130,7 @@ DeepSpeed has been integrated with several different popular open-source DL fram | ----------- | ------ | | NVIDIA | [![nv-torch110-p40](https://github.com/microsoft/DeepSpeed/actions/workflows/nv-torch110-p40.yml/badge.svg?branch=master)](https://github.com/microsoft/DeepSpeed/actions/workflows/nv-torch110-p40.yml) [![nv-torch110-v100](https://github.com/microsoft/DeepSpeed/actions/workflows/nv-torch110-v100.yml/badge.svg?branch=master)](https://github.com/microsoft/DeepSpeed/actions/workflows/nv-torch110-v100.yml) [![nv-torch-latest-v100](https://github.com/microsoft/DeepSpeed/actions/workflows/nv-torch-latest-v100.yml/badge.svg?branch=master)](https://github.com/microsoft/DeepSpeed/actions/workflows/nv-torch-latest-v100.yml) [![nv-h100](https://github.com/microsoft/DeepSpeed/actions/workflows/nv-h100.yml/badge.svg?branch=master)](https://github.com/microsoft/DeepSpeed/actions/workflows/nv-h100.yml) [![nv-inference](https://github.com/microsoft/DeepSpeed/actions/workflows/nv-inference.yml/badge.svg?branch=master)](https://github.com/microsoft/DeepSpeed/actions/workflows/nv-inference.yml) [![nv-nightly](https://github.com/microsoft/DeepSpeed/actions/workflows/nv-nightly.yml/badge.svg?branch=master)](https://github.com/microsoft/DeepSpeed/actions/workflows/nv-nightly.yml) | | AMD | [![amd-mi200](https://github.com/microsoft/DeepSpeed/actions/workflows/amd-mi200.yml/badge.svg?branch=master)](https://github.com/microsoft/DeepSpeed/actions/workflows/amd-mi200.yml) | -| CPU | [![nv-torch-latest-cpu](https://github.com/microsoft/DeepSpeed/actions/workflows/nv-torch-latest-cpu.yml/badge.svg?branch=master)](https://github.com/microsoft/DeepSpeed/actions/workflows/nv-torch-latest-cpu.yml) | +| CPU | [![nv-torch-latest-cpu](https://github.com/microsoft/DeepSpeed/actions/workflows/cpu-torch-latest.yml/badge.svg?branch=master)](https://github.com/microsoft/DeepSpeed/actions/workflows/cpu-torch-latest.yml) | | PyTorch Nightly | [![nv-torch-nightly-v100](https://github.com/microsoft/DeepSpeed/actions/workflows/nv-torch-nightly-v100.yml/badge.svg?branch=master)](https://github.com/microsoft/DeepSpeed/actions/workflows/nv-torch-nightly-v100.yml) | | Integrations | [![nv-transformers-v100](https://github.com/microsoft/DeepSpeed/actions/workflows/nv-transformers-v100.yml/badge.svg?branch=master)](https://github.com/microsoft/DeepSpeed/actions/workflows/nv-transformers-v100.yml) [![nv-lightning-v100](https://github.com/microsoft/DeepSpeed/actions/workflows/nv-lightning-v100.yml/badge.svg?branch=master)](https://github.com/microsoft/DeepSpeed/actions/workflows/nv-lightning-v100.yml) 
[![nv-accelerate-v100](https://github.com/microsoft/DeepSpeed/actions/workflows/nv-accelerate-v100.yml/badge.svg?branch=master)](https://github.com/microsoft/DeepSpeed/actions/workflows/nv-accelerate-v100.yml) [![nv-mii](https://github.com/microsoft/DeepSpeed/actions/workflows/nv-mii.yml/badge.svg?branch=master)](https://github.com/microsoft/DeepSpeed/actions/workflows/nv-mii.yml) [![nv-ds-chat](https://github.com/microsoft/DeepSpeed/actions/workflows/nv-ds-chat.yml/badge.svg?branch=master)](https://github.com/microsoft/DeepSpeed/actions/workflows/nv-ds-chat.yml) [![nv-sd](https://github.com/microsoft/DeepSpeed/actions/workflows/nv-sd.yml/badge.svg)](https://github.com/microsoft/DeepSpeed/actions/workflows/nv-sd.yml) | | Misc | [![Formatting](https://github.com/microsoft/DeepSpeed/actions/workflows/formatting.yml/badge.svg?branch=master)](https://github.com/microsoft/DeepSpeed/actions/workflows/formatting.yml) [![pages-build-deployment](https://github.com/microsoft/DeepSpeed/actions/workflows/pages/pages-build-deployment/badge.svg)](https://github.com/microsoft/DeepSpeed/actions/workflows/pages/pages-build-deployment) [![Documentation Status](https://readthedocs.org/projects/deepspeed/badge/?version=latest)](https://deepspeed.readthedocs.io/en/latest/?badge=latest)[![python](https://github.com/microsoft/DeepSpeed/actions/workflows/python.yml/badge.svg?branch=master)](https://github.com/microsoft/DeepSpeed/actions/workflows/python.yml) | From e6e8c1378de035df59034d09373b44af3319b6d7 Mon Sep 17 00:00:00 2001 From: Reza Yazdani <44502768+RezaYazdaniAminabadi@users.noreply.github.com> Date: Mon, 4 Mar 2024 10:35:12 -0800 Subject: [PATCH 04/13] Fix moe cpu offload (#5220) The MoE- param gradients norms don't need to be averaged when created on CPU only when using 1-DP training. However, I just moved the tensor back to GPU to get average when having data-parallel on the MoE parameters and using CPU-offload. 
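The gist of the change is small; here is a hedged sketch (not the exact method) of the pattern now used in `_average_expert_grad_norms`, where `norm` is a scalar tensor holding the group's gradient norm and `device` is 'cpu' when gradients are offloaded:

```python
# Hedged sketch of the CPU-offload fix for averaging expert gradient norms
# (see the diff below); dp_group is the expert data-parallel process group.
from deepspeed import comm as dist
from deepspeed.accelerator import get_accelerator

def average_expert_grad_norm(norm, dp_group, device):
    scaled_norm_tensor = norm * 1.0 / dist.get_world_size(group=dp_group)
    if device == 'cpu':
        # all_reduce expects an accelerator tensor, so move the norm to the GPU first.
        scaled_norm_tensor = scaled_norm_tensor.to(get_accelerator().current_device_name())
    dist.all_reduce(scaled_norm_tensor, group=dp_group)
    # Return the averaged norm on the original (possibly CPU) device.
    return scaled_norm_tensor.to(device)
```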
This PR addresses https://github.com/microsoft/DeepSpeed/issues/5203 --------- Co-authored-by: Reza Yazdani --- deepspeed/runtime/zero/stage_1_and_2.py | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/deepspeed/runtime/zero/stage_1_and_2.py b/deepspeed/runtime/zero/stage_1_and_2.py index e4009f6ac883..71a01b2391f8 100755 --- a/deepspeed/runtime/zero/stage_1_and_2.py +++ b/deepspeed/runtime/zero/stage_1_and_2.py @@ -1946,8 +1946,10 @@ def _average_expert_grad_norms(self, norm_groups): for i, norm in enumerate(norm_groups): if self.is_moe_param_group[i]: scaled_norm_tensor = norm * 1.0 / dist.get_world_size(group=self.real_dp_process_group[i]) + if self.device == 'cpu': + scaled_norm_tensor = scaled_norm_tensor.to(get_accelerator().current_device_name()) dist.all_reduce(scaled_norm_tensor, group=self.real_dp_process_group[i]) - norm_groups[i] = scaled_norm_tensor + norm_groups[i] = scaled_norm_tensor.to(self.device) def unscale_and_clip_grads(self, grad_groups_flat, total_norm): # compute combined scale factor for this group From acf07398b7887f70ab5ffcff2733747f8440b276 Mon Sep 17 00:00:00 2001 From: iLeGend Date: Tue, 5 Mar 2024 07:11:57 +0800 Subject: [PATCH 05/13] Use `deepspeed.comm` instead of `torch.distributed` (#5225) --- deepspeed/moe/sharded_moe.py | 6 +----- deepspeed/runtime/comm/coalesced_collectives.py | 3 +-- 2 files changed, 2 insertions(+), 7 deletions(-) diff --git a/deepspeed/moe/sharded_moe.py b/deepspeed/moe/sharded_moe.py index d92211b9d220..e6a5292d7e4f 100644 --- a/deepspeed/moe/sharded_moe.py +++ b/deepspeed/moe/sharded_moe.py @@ -95,11 +95,7 @@ def gumbel_rsample(shape: Tuple, device: torch.device) -> Tensor: class _AllToAll(torch.autograd.Function): @staticmethod - def forward( - ctx: Any, - # TODO: replace with DS process group - group: torch.distributed.ProcessGroup, - input: Tensor) -> Tensor: # type: ignore + def forward(ctx: Any, group: dist.ProcessGroup, input: Tensor) -> Tensor: # type: ignore ctx.group = group input = input.contiguous() output = torch.empty_like(input) diff --git a/deepspeed/runtime/comm/coalesced_collectives.py b/deepspeed/runtime/comm/coalesced_collectives.py index d63d7e985e07..543795126fab 100644 --- a/deepspeed/runtime/comm/coalesced_collectives.py +++ b/deepspeed/runtime/comm/coalesced_collectives.py @@ -12,8 +12,7 @@ import torch from torch import Tensor from deepspeed import comm as dist -# NOTE: Use torch.distributed's ProcessGroup class until we have our own. -from torch.distributed import ProcessGroup, all_to_all_single +from deepspeed.comm import ProcessGroup, all_to_all_single from deepspeed.accelerator import get_accelerator from deepspeed.utils import instrument_w_nvtx from deepspeed.ops import op_builder From bc0d24651d7d40d5b78ff5a2f1702c7169852433 Mon Sep 17 00:00:00 2001 From: Yejing-Lai Date: Wed, 6 Mar 2024 06:31:46 +0800 Subject: [PATCH 06/13] fix fused_qkv model accuracy issue (#5217) Fused_qkv model can not correctly choose the fused_qkv type. Need to update the module_name_matches. 
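A toy illustration of why the substring test direction matters (the dict values and the module string are hypothetical; the real mapping is `fused_type_dict` in `fusedqkv_utils.py`):

```python
# Hypothetical keys/values for illustration only.
fused_type_dict = {"DecoderLayer": "glmtype", "FalconDecoderLayer": "falcontype"}
module_str = "FalconDecoderLayer(\n  (self_attention): ...)"  # e.g. the stringified module

# Before: asks whether the long module string is contained in each short key -> never matches.
old_matches = [k for k in fused_type_dict.keys() if module_str in k]  # []

# After: asks whether each key is contained in the module string -> correct matches.
new_matches = [k for k in fused_type_dict.keys() if k in module_str]  # ['DecoderLayer', 'FalconDecoderLayer']
# The surrounding code then takes the longest match ("FalconDecoderLayer").
```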
Co-authored-by: Olatunji Ruwase --- deepspeed/module_inject/fusedqkv_utils.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/deepspeed/module_inject/fusedqkv_utils.py b/deepspeed/module_inject/fusedqkv_utils.py index ba238cba7508..cf087c16da8a 100644 --- a/deepspeed/module_inject/fusedqkv_utils.py +++ b/deepspeed/module_inject/fusedqkv_utils.py @@ -113,7 +113,7 @@ def _transpose_fused_qkvw(src, mp_size, fused_qkv_type=None, module=None): raise ValueError("unknown fused_qkv_type") - module_name_matches = [k for k in fused_type_dict.keys() if module_str in k] + module_name_matches = [k for k in fused_type_dict.keys() if k in module_str] if module_name_matches: # There can be overlap with matches (e.g., "DecoderLayer" and "FalconDecoderLayer"). # We take the longest matching module_name From db70c183d8780e429886e6bb2fe06133e70f63be Mon Sep 17 00:00:00 2001 From: Michael Wyatt Date: Tue, 5 Mar 2024 17:18:03 -0800 Subject: [PATCH 07/13] Update version.txt after 0.13.5 release (#5229) **Auto-generated PR to update version.txt after a DeepSpeed release** Released version - 0.13.5 Author - @mrwyattii Co-authored-by: mrwyattii --- version.txt | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/version.txt b/version.txt index c37136a84824..ebf55b3d7679 100644 --- a/version.txt +++ b/version.txt @@ -1 +1 @@ -0.13.5 +0.13.6 From 5a2e705b888bbcb41beef32b7f58a0e9010d287a Mon Sep 17 00:00:00 2001 From: Moshe Island Date: Thu, 7 Mar 2024 16:10:13 +0200 Subject: [PATCH 08/13] MOE gate fixes and enhancements (#5156) Fixes the following issues: - Fix capacity when using TP for non-MoE by aligning the capacity to TP - Fix TopKGate.wg (gate weight) when using ZeRO with fp16 or bf16 - Fix top2 aux loss to be similar to top1 aux loss Following are few configurable enhancements: - Support top2 with disable token dropping - Support disable top2 2nd expert sampling --------- Signed-off-by: Moshe Island Co-authored-by: Moshe Island Co-authored-by: Logan Adams <114770087+loadams@users.noreply.github.com> --- deepspeed/moe/layer.py | 7 +++-- deepspeed/moe/sharded_moe.py | 61 ++++++++++++++++++++++++------------ 2 files changed, 46 insertions(+), 22 deletions(-) diff --git a/deepspeed/moe/layer.py b/deepspeed/moe/layer.py index 46f7924ac038..dfa9fcf4f464 100644 --- a/deepspeed/moe/layer.py +++ b/deepspeed/moe/layer.py @@ -32,6 +32,7 @@ class MoE(nn.Module): use_rts (bool, optional): default=True, whether to use Random Token Selection. use_tutel (bool, optional): default=False, whether to use Tutel optimizations (if installed). 
enable_expert_tensor_parallelism (bool, optional): default=False, whether to use tensor parallelism for experts + top2_2nd_expert_sampling (bool, optional): default=True, whether to perform sampling for 2nd expert """ def __init__(self, @@ -48,7 +49,8 @@ def __init__(self, drop_tokens: bool = True, use_rts: bool = True, use_tutel: bool = False, - enable_expert_tensor_parallelism: bool = False) -> None: + enable_expert_tensor_parallelism: bool = False, + top2_2nd_expert_sampling: bool = True) -> None: super(MoE, self).__init__() @@ -69,7 +71,8 @@ def __init__(self, experts = Experts(expert, self.num_local_experts, self.expert_group_name) self.deepspeed_moe = MOELayer(TopKGate(hidden_size, num_experts, k, capacity_factor, eval_capacity_factor, - min_capacity, noisy_gate_policy, drop_tokens, use_rts), + min_capacity, noisy_gate_policy, drop_tokens, use_rts, + top2_2nd_expert_sampling), experts, self.expert_group_name, self.ep_size, diff --git a/deepspeed/moe/sharded_moe.py b/deepspeed/moe/sharded_moe.py index e6a5292d7e4f..d6c023ec11d3 100644 --- a/deepspeed/moe/sharded_moe.py +++ b/deepspeed/moe/sharded_moe.py @@ -210,6 +210,11 @@ def top1gating(logits: Tensor, if not drop_tokens: new_capacity = torch.max(exp_counts).to(logits.device) dist.all_reduce(new_capacity, op=dist.ReduceOp.MAX, group=dist.get_world_group()) + if groups._get_expert_model_parallel_world_size() == 1: + # If the non-expert is tensor-parallel, we need to pad the capacity to 'tp'. + # This is since we are going to activate drop_tokens() to drop duplicate tokens. + tp = 1 if groups.mpu is None else groups.mpu.get_tensor_model_parallel_world_size() + new_capacity = torch.ceil(new_capacity / tp).mul(tp).to(new_capacity.dtype) capacity = new_capacity # Compute l_aux @@ -275,23 +280,27 @@ def top1gating(logits: Tensor, return l_aux, combine_weights, dispatch_mask, exp_counts -def top2gating(logits: Tensor, capacity_factor: float, min_capacity: int) -> Tuple[Tensor, Tensor, Tensor, Tensor]: +def top2gating(logits: Tensor, + capacity_factor: float, + min_capacity: int, + drop_tokens: bool = True, + top2_2nd_expert_sampling: bool = True) -> Tuple[Tensor, Tensor, Tensor, Tensor]: """Implements Top2Gating on logits.""" # everything is in fp32 in this function gates = F.softmax(logits, dim=1) - capacity = _capacity(gates, torch.tensor(capacity_factor * 2), torch.tensor(min_capacity)) - # Create a mask for 1st's expert per token indices1_s = torch.argmax(gates, dim=1) num_experts = int(gates.shape[1]) mask1 = F.one_hot(indices1_s, num_classes=num_experts) - # Create a mask for 2nd's expert per token using Gumbel-max trick - # https://timvieira.github.io/blog/post/2014/07/31/gumbel-max-trick/ - logits_w_noise = logits + gumbel_rsample(logits.shape, device=logits.device) + if top2_2nd_expert_sampling: + # Create a mask for 2nd's expert per token using Gumbel-max trick + # https://timvieira.github.io/blog/post/2014/07/31/gumbel-max-trick/ + logits += gumbel_rsample(logits.shape, device=logits.device) + # Replace top-expert with min value - logits_except1 = logits_w_noise.masked_fill(mask1.bool(), float("-inf")) + logits_except1 = logits.masked_fill(mask1.bool(), float("-inf")) indices2_s = torch.argmax(logits_except1, dim=1) mask2 = F.one_hot(indices2_s, num_classes=num_experts) @@ -301,17 +310,29 @@ def top2gating(logits: Tensor, capacity_factor: float, min_capacity: int) -> Tup # Update 2nd's location by accounting for locations of 1st locations2 += torch.sum(mask1, dim=0, keepdim=True) - # gating decisions - exp_counts = 
torch.sum(mask1, dim=0).detach().to('cpu') - # Compute l_aux me = torch.mean(gates, dim=0) ce = torch.mean(mask1.float(), dim=0) l_aux = torch.mean(me * ce) * num_experts * num_experts - # Remove locations outside capacity from mask - mask1 *= torch.lt(locations1, capacity) - mask2 *= torch.lt(locations2, capacity) + # gating decisions + exp_counts = torch.sum(mask1 + mask2, dim=0) + + if drop_tokens: + # Calculate configured capacity and remove locations outside capacity from mask + capacity = _capacity(gates, torch.tensor(capacity_factor * 2), torch.tensor(min_capacity)) + mask1 *= torch.lt(locations1, capacity) + mask2 *= torch.lt(locations2, capacity) + else: + # Do not drop tokens - set capacity according to current expert assignments + new_capacity = torch.max(exp_counts) + dist.all_reduce(new_capacity, op=dist.ReduceOp.MAX, group=dist.get_world_group()) + if groups._get_expert_model_parallel_world_size() == 1: + # If the non-expert is tensor-parallel, we need to pad the capacity to 'tp'. + # This is since we are going to activate drop_tokens() to drop duplicate tokens. + tp = 1 if groups.mpu is None else groups.mpu.get_tensor_model_parallel_world_size() + new_capacity = torch.ceil(new_capacity / tp).mul(tp).to(new_capacity.dtype) + capacity = new_capacity # Store the capacity location for each token locations1_s = torch.sum(locations1 * mask1, dim=1) @@ -338,7 +359,7 @@ def top2gating(logits: Tensor, capacity_factor: float, min_capacity: int) -> Tup combine_weights = combine1_sec + combine2_sec dispatch_mask = combine_weights.bool() - return l_aux, combine_weights, dispatch_mask, exp_counts + return l_aux, combine_weights, dispatch_mask, exp_counts.detach().to('cpu') class TopKGate(Module): @@ -368,13 +389,14 @@ def __init__(self, min_capacity: int = 8, noisy_gate_policy: Optional[str] = None, drop_tokens: bool = True, - use_rts: bool = True) -> None: + use_rts: bool = True, + top2_2nd_expert_sampling: bool = True) -> None: super().__init__() # Only top-1 and top-2 are supported at the moment. 
if k != 1 and k != 2: raise ValueError('Only top-1 and top-2 gatings are supported.') - self.wg = torch.nn.Linear(model_dim, num_experts, bias=False).float() + self.wg = torch.nn.Linear(model_dim, num_experts, bias=False) self.k = k self.capacity_factor = capacity_factor self.eval_capacity_factor = eval_capacity_factor @@ -385,6 +407,7 @@ def __init__(self, self.gate_time = 0.0 self.drop_tokens = drop_tokens self.use_rts = use_rts + self.top2_2nd_expert_sampling = top2_2nd_expert_sampling def forward(self, input: torch.Tensor, @@ -394,13 +417,11 @@ def forward(self, if self.wall_clock_breakdown: self.timers(TOPK_GATE_TIMER).start() - if self.wg.weight.dtype != torch.float32: - self.wg = self.wg.float() input_fp32 = input.float() # input jittering if self.noisy_gate_policy == 'Jitter' and self.training: input_fp32 = multiplicative_jitter(input_fp32, device=input.device) - logits = self.wg(input_fp32) + logits = torch.nn.functional.linear(input_fp32, weight=self.wg.weight.float(), bias=None) if self.k == 1: gate_output = top1gating(logits, self.capacity_factor if self.training else self.eval_capacity_factor, @@ -409,7 +430,7 @@ def forward(self, else: gate_output = top2gating(logits, self.capacity_factor if self.training else self.eval_capacity_factor, - self.min_capacity) + self.min_capacity, self.drop_tokens, self.top2_2nd_expert_sampling) if self.wall_clock_breakdown: self.timers(TOPK_GATE_TIMER).stop() From ccfdb84e2a4a373ac657a99afd2d97e1d741b22b Mon Sep 17 00:00:00 2001 From: Logan Adams <114770087+loadams@users.noreply.github.com> Date: Thu, 7 Mar 2024 16:55:43 -0800 Subject: [PATCH 09/13] FP6 quantization end-to-end. (#5234) The user interface: https://github.com/microsoft/DeepSpeed-MII/pull/433 nv-a6000 ci running against the MII branch linked above is [here](https://github.com/microsoft/DeepSpeed/actions/runs/8192124606) Co-authored-by: Zhen Zheng [zhengzhen@microsoft.com](mailto:zhengzhen@microsoft.com) Co-authored-by: Shiyang Chen [csycfl@gmail.com](mailto:csycfl@gmail.com) Co-authored-by: Arash Bakhtiari [abakhtiari@microsoft.com](mailto:abakhtiari@microsoft.com) Co-authored-by: Haojun Xia [xhjustc@mail.ustc.edu.cn](mailto:xhjustc@mail.ustc.edu.cn) --------- Co-authored-by: ZHENG, Zhen Co-authored-by: Shiyang Chen Co-authored-by: Haojun Xia Co-authored-by: Arash Bakhtiari Co-authored-by: Michael Wyatt Co-authored-by: Michael Wyatt --- .github/workflows/nv-a6000.yml | 3 +- deepspeed/inference/v2/config_v2.py | 14 +- .../inference/v2/kernels/core_ops/__init__.py | 1 + .../v2/kernels/core_ops/core_ops.cpp | 6 + .../kernels/core_ops/cuda_linear/__init__.py | 6 + .../core_ops/cuda_linear/cuda_linear.py | 207 ++++++++++++ .../cuda_linear/cuda_linear_kernels.cpp | 224 +++++++++++++ .../cuda_linear/cuda_linear_kernels.h | 25 ++ .../core_ops/cuda_linear/fp6_linear.cu | 315 ++++++++++++++++++ .../core_ops/cuda_linear/fp6_linear.cuh | 46 +++ .../core_ops/cuda_linear/include/configs.h | 96 ++++++ .../cuda_linear/include/kernel_matmul.cuh | 261 +++++++++++++++ .../cuda_linear/include/kernel_reduction.cuh | 38 +++ .../cuda_linear/include/ptx_cp.async.cuh | 52 +++ .../core_ops/cuda_linear/include/ptx_mma.cuh | 125 +++++++ .../cuda_linear/include/utils_core.cuh | 246 ++++++++++++++ .../cuda_linear/include/utils_gmem.cuh | 86 +++++ .../include/utils_paralleldequant.cuh | 127 +++++++ .../cuda_linear/include/weight_prepacking.h | 204 ++++++++++++ .../gated_activation_kernels_cuda.cu | 3 + .../flat_model_helpers.py | 2 +- deepspeed/inference/v2/modules/heuristics.py | 11 +- 
.../implementations/linear/__init__.py | 1 + .../linear/quantized_linear.py | 205 ++++++++++++ op_builder/inference_core_ops.py | 12 + requirements/requirements-inf.txt | 1 + .../modules/test_quantized_linear_module.py | 184 ++++++++++ .../v2/ragged/test_manager_configs.py | 2 +- 28 files changed, 2497 insertions(+), 6 deletions(-) create mode 100644 deepspeed/inference/v2/kernels/core_ops/cuda_linear/__init__.py create mode 100644 deepspeed/inference/v2/kernels/core_ops/cuda_linear/cuda_linear.py create mode 100644 deepspeed/inference/v2/kernels/core_ops/cuda_linear/cuda_linear_kernels.cpp create mode 100644 deepspeed/inference/v2/kernels/core_ops/cuda_linear/cuda_linear_kernels.h create mode 100644 deepspeed/inference/v2/kernels/core_ops/cuda_linear/fp6_linear.cu create mode 100644 deepspeed/inference/v2/kernels/core_ops/cuda_linear/fp6_linear.cuh create mode 100644 deepspeed/inference/v2/kernels/core_ops/cuda_linear/include/configs.h create mode 100644 deepspeed/inference/v2/kernels/core_ops/cuda_linear/include/kernel_matmul.cuh create mode 100644 deepspeed/inference/v2/kernels/core_ops/cuda_linear/include/kernel_reduction.cuh create mode 100644 deepspeed/inference/v2/kernels/core_ops/cuda_linear/include/ptx_cp.async.cuh create mode 100644 deepspeed/inference/v2/kernels/core_ops/cuda_linear/include/ptx_mma.cuh create mode 100644 deepspeed/inference/v2/kernels/core_ops/cuda_linear/include/utils_core.cuh create mode 100644 deepspeed/inference/v2/kernels/core_ops/cuda_linear/include/utils_gmem.cuh create mode 100644 deepspeed/inference/v2/kernels/core_ops/cuda_linear/include/utils_paralleldequant.cuh create mode 100644 deepspeed/inference/v2/kernels/core_ops/cuda_linear/include/weight_prepacking.h create mode 100644 deepspeed/inference/v2/modules/implementations/linear/quantized_linear.py create mode 100644 tests/unit/inference/v2/modules/test_quantized_linear_module.py diff --git a/.github/workflows/nv-a6000.yml b/.github/workflows/nv-a6000.yml index d7db447f5d26..960e0203919e 100644 --- a/.github/workflows/nv-a6000.yml +++ b/.github/workflows/nv-a6000.yml @@ -47,7 +47,8 @@ jobs: - name: Install deepspeed run: | python -m pip install docutils==0.18.1 jinja2==3.0 urllib3==1.26.11 ninja - python -m pip install .[dev,1bit,autotuning] + python -m pip install pydantic==1.10.11 + python -m pip install .[dev,1bit,autotuning,inf] ds_report - name: Python environment run: | diff --git a/deepspeed/inference/v2/config_v2.py b/deepspeed/inference/v2/config_v2.py index 64e7e29b1844..85e4b7a0e0a0 100644 --- a/deepspeed/inference/v2/config_v2.py +++ b/deepspeed/inference/v2/config_v2.py @@ -3,8 +3,8 @@ # DeepSpeed Team +from typing import Optional from deepspeed.pydantic_v1 import Field - from deepspeed.runtime.config_utils import DeepSpeedConfigModel from .ragged import DSStateManagerConfig @@ -16,6 +16,16 @@ class DeepSpeedTPConfig(DeepSpeedConfigModel): """ Number of devices to split the model across using tensor parallelism. """ +class QuantizationConfig(DeepSpeedConfigModel): + """ Configure tensor parallelism settings """ + + quantization_mode: Optional[str] = None + """ The quantization mode in string format. The supported modes are as follows: + - 'wf6af16', weight-only quantization with FP6 weight and FP16 activation. + """ + # TODO: may reuse the constants in deepspeed/compression/constants.py + + class RaggedInferenceEngineConfig(DeepSpeedConfigModel): """ Sets parameters for DeepSpeed Inference Engine. 
""" @@ -29,3 +39,5 @@ class RaggedInferenceEngineConfig(DeepSpeedConfigModel): """ Configuration for managing persistent state """ + + quantization: QuantizationConfig = {} diff --git a/deepspeed/inference/v2/kernels/core_ops/__init__.py b/deepspeed/inference/v2/kernels/core_ops/__init__.py index bbb53e5b58a2..1d16b484a560 100644 --- a/deepspeed/inference/v2/kernels/core_ops/__init__.py +++ b/deepspeed/inference/v2/kernels/core_ops/__init__.py @@ -8,3 +8,4 @@ from .cuda_layer_norm import * from .cuda_rms_norm import * from .gated_activations import * +from .cuda_linear import * diff --git a/deepspeed/inference/v2/kernels/core_ops/core_ops.cpp b/deepspeed/inference/v2/kernels/core_ops/core_ops.cpp index 58df88e56136..2397b0694696 100644 --- a/deepspeed/inference/v2/kernels/core_ops/core_ops.cpp +++ b/deepspeed/inference/v2/kernels/core_ops/core_ops.cpp @@ -8,6 +8,7 @@ #include "bias_activation.h" #include "blas.h" +#include "cuda_linear_kernels.h" #include "gated_activation_kernels.h" #include "layer_norm.h" #include "rms_norm.h" @@ -33,4 +34,9 @@ PYBIND11_MODULE(TORCH_EXTENSION_NAME, m) // rms_norm.h m.def("rms_norm", &rms_norm, "DeepSpeed rms norm in CUDA"); m.def("rms_pre_norm", &rms_pre_norm, "DeepSpeed rms pre norm in CUDA"); + + // cuda_linear_kernels.h + m.def("cuda_wf6af16_linear", &cuda_wf6af16_linear, "DeepSpeed Wf6Af16 linear in CUDA"); + m.def( + "preprocess_weight", &preprocess_weight, "preprocess the FP16 weight to be 2bit and 4 bit"); } diff --git a/deepspeed/inference/v2/kernels/core_ops/cuda_linear/__init__.py b/deepspeed/inference/v2/kernels/core_ops/cuda_linear/__init__.py new file mode 100644 index 000000000000..cd08409c0a7a --- /dev/null +++ b/deepspeed/inference/v2/kernels/core_ops/cuda_linear/__init__.py @@ -0,0 +1,6 @@ +# Copyright (c) Microsoft Corporation. +# SPDX-License-Identifier: Apache-2.0 + +# DeepSpeed Team + +from .cuda_linear import * diff --git a/deepspeed/inference/v2/kernels/core_ops/cuda_linear/cuda_linear.py b/deepspeed/inference/v2/kernels/core_ops/cuda_linear/cuda_linear.py new file mode 100644 index 000000000000..69aa9e8920e2 --- /dev/null +++ b/deepspeed/inference/v2/kernels/core_ops/cuda_linear/cuda_linear.py @@ -0,0 +1,207 @@ +# Copyright (c) Microsoft Corporation. +# SPDX-License-Identifier: Apache-2.0 + +# DeepSpeed Team + +import torch + +from ....inference_utils import DtypeEnum +from ....logging import inference_logger +from deepspeed.ops.op_builder import InferenceCoreBuilder +from ... import DSKernelBase + + +class CUDAWf6Af16Linear(DSKernelBase): + """ + Wrapper around the CUDA kernel of Wf6Af16 quantized linear. + + Performs z = x @ y + """ + supported_dtypes = [DtypeEnum.fp16] + + def __init__(self): + self.inf_module = InferenceCoreBuilder().load() + self.inf_module.create_handle() + self.kernel = self.inf_module.cuda_wf6af16_linear + # The split_k_map is profiled on A100-80G GPU for some common shapes. + # It is an array of dictionaries, where the array index is the tokens chunk id. + # The dictionary is the mapping from the output channel to the split-K size. 
+ self.split_k_map = [ + { # tokens: [1, 64] + 3072: 18, + 4096: 13, + 5120: 10, + 6144: 9, + 8192: 6, + 10240: 5, + 14336: 7, + 28672: 7, + 57344: 7 + }, + { # tokens: [65:128] + 3072: 9, + 4096: 6, + 5120: 5, + 6144: 9, + 8192: 3, + 10240: 5, + 14336: 7, + 28672: 7, + 57344: 6 + }, + { # tokens: [129:192] + 3072: 6, + 4096: 4, + 5120: 7, + 6144: 3, + 8192: 2, + 10240: 5, + 14336: 5, + 28672: 5, + 57344: 4 + }, + { # tokens: [193:256] + 3072: 9, + 4096: 3, + 5120: 5, + 6144: 2, + 8192: 5, + 10240: 4, + 14336: 8, + 28672: 6, + 57344: 4 + }, + { # tokens: [257:320] + 3072: 7, + 4096: 5, + 5120: 2, + 6144: 5, + 8192: 4, + 10240: 1, + 14336: 3, + 28672: 3, + 57344: 4 + }, + { # tokens: [321:384] + 3072: 3, + 4096: 2, + 5120: 5, + 6144: 3, + 8192: 1, + 10240: 8, + 14336: 3, + 28672: 4, + 57344: 3 + }, + { # tokens: [385:448] + 3072: 5, + 4096: 7, + 5120: 3, + 6144: 5, + 8192: 7, + 10240: 3, + 14336: 1, + 28672: 1, + 57344: 3 + }, + { # tokens: [449:512] + 3072: 2, + 4096: 5, + 5120: 4, + 6144: 1, + 8192: 5, + 10240: 2, + 14336: 6, + 28672: 4, + 57344: 1 + }, + { # tokens: [513:576] + 3072: 2, + 4096: 3, + 5120: 1, + 6144: 1, + 8192: 3, + 10240: 3, + 14336: 3, + 28672: 1, + 57344: 1 + }, + { # tokens: [577:640] + 3072: 5, + 4096: 4, + 5120: 1, + 6144: 4, + 8192: 2, + 10240: 1, + 14336: 1, + 28672: 1, + 57344: 1 + }, + { # tokens: [641:704] + 3072: 3, + 4096: 1, + 5120: 2, + 6144: 2, + 8192: 1, + 10240: 2, + 14336: 1, + 28672: 1, + 57344: 1 + }, + { # tokens: [705:768] + 3072: 3, + 4096: 1, + 5120: 3, + 6144: 2, + 8192: 1, + 10240: 1, + 14336: 1, + 28672: 1, + 57344: 1 + } + ] + + def __call__(self, output: torch.Tensor, hidden_states: torch.Tensor, weights_2bit: torch.Tensor, + weights_4bit: torch.Tensor, scale: torch.Tensor, out_channels, tokens, in_channels) -> torch.Tensor: + """ + Matmul kernel of FP6 weight-only quantized linear. All inputs should be contiguous. + It does not support batched-matmul. + + Parameters: + output (torch.Tensor): Output tensor. Shape is of [token_number, out_features] + hidden_states (torch.Tensor): Input tensor. Shape is of [token_number, in_features] + weights_2bit (torch.Tensor): Input tensor of the 2-bit slice. Shape is of [out_features*2/8, in_features] + weights_4bit (torch.Tensor): Input tensor of the 4-bit slice. Shape is of [out_features*4/8, in_features] + scale (torch.Tensor): Input tensor. Shape is of [out_features], since the scale is per output channel + out_channels (int): The number of output channels + tokens (int): The number of tokens + in_channels (int): The number of input channels + """ + + if out_channels % 256 != 0 or in_channels % 64 != 0: + raise ValueError("The out and in channel should be multiple of 256 and 64 respectively.") + + # TODO: add a more general heuristic to determine the split-K. + split_k = -1 # not initialized + if tokens <= 768: + # Try to find the split-K from the pre-profiled map. 
+ tokens_chunk_id = (tokens - 1) // 64 + split_k = self.split_k_map[tokens_chunk_id].get(out_channels, -1) + if split_k == -1: + split_k = 1 + inference_logger().warning( + f"The split-K setting may be suboptimal for shape {tokens}x{in_channels}x{out_channels}...") + + workspace = self.get_workspace(out_channels, tokens, in_channels, split_k, torch.float, hidden_states.device) + self.kernel(output, hidden_states, weights_2bit, weights_4bit, scale, workspace, out_channels, tokens, + in_channels, split_k) + + def get_workspace(self, out_channels: int, tokens: int, in_channels: int, split_k: int, dtype, + device) -> torch.Tensor: + """ + Allocate workspace for the kernel. The workspace is used to store the intermediate results of the matmul before + split-K. The split-K size is determined by the size of the matmul. + """ + workspace = torch.empty((split_k, out_channels, tokens), dtype=dtype, device=device) + + return workspace diff --git a/deepspeed/inference/v2/kernels/core_ops/cuda_linear/cuda_linear_kernels.cpp b/deepspeed/inference/v2/kernels/core_ops/cuda_linear/cuda_linear_kernels.cpp new file mode 100644 index 000000000000..677bec22ded8 --- /dev/null +++ b/deepspeed/inference/v2/kernels/core_ops/cuda_linear/cuda_linear_kernels.cpp @@ -0,0 +1,224 @@ +// Copyright (c) Microsoft Corporation. +// SPDX-License-Identifier: Apache-2.0 + +// DeepSpeed Team + +#include + +#include "cuda_linear_kernels.h" + +namespace { + +// For bit-level debugging. +template +void print_bits(T num) +{ + char bits[sizeof(T) * 8 + 1] = {'\0'}; + for (int bit = 0; bit < (sizeof(T) * 8); bit++) { + bits[sizeof(T) * 8 - 1 - bit] = '0' + (num & 0x01); + num = num >> 1; + } + printf("%s\n", bits); +} + +void print_bits(half num) +{ + char bits[sizeof(half) * 8 + 1] = {'\0'}; + auto int_num = *reinterpret_cast(&num); + for (int bit = 0; bit < (sizeof(half) * 8); bit++) { + bits[sizeof(half) * 8 - 1 - bit] = '0' + (int_num & 0x01); + int_num = int_num >> 1; + } + printf("%s\n", bits); +} + +/* + * Function to pack 4 fake quantized FP16 value into continuously stored 4 FP6 values. + */ +void cast_fp16_fp6(uint16_t* FP16x4, uint8_t* FP6x4) +{ + // Constants for FP6 + constexpr int exponent_nbits_fp6 = 3; + constexpr int mantissa_nbits_fp6 = 2; + constexpr int exp_bias_fp6 = (1 << (exponent_nbits_fp6 - 1)) - 1; + // Constants for FP16 + constexpr int exponent_nbits_fp16 = 5; + constexpr int mantissa_nbits_fp16 = 10; + constexpr int exp_bias_fp16 = (1 << (exponent_nbits_fp16 - 1)) - 1; + + int fp6_temp[4]; + + float absmin_nonzero_fp6 = 0.0625; + // Note that we regard the exponent of '111' as a regular value rather than NaN or inf. This is + // the same with that in qtorch. + float absmax_fp6 = 28; + + for (int i = 0; i < 4; ++i) { + uint16_t source = FP16x4[i]; + float fp6_value_abs = std::abs(__half2float(*((half*)(&source)))); + if ((fp6_value_abs != 0 && fp6_value_abs < absmin_nonzero_fp6) || + fp6_value_abs > absmax_fp6) { + // TODO(zhen): a better way may be rounding it to the nearest FP6 value. + throw std::invalid_argument("Input value out of range for FP6."); + } + + // It is not safe to do shift operation on uint16_t. So we promote it to int. + int source_promote = int(source); + + int sign_bit = (source_promote >> 15); + // Extracting exponent represented in FP16. 
The sign mask 0x7FFF is '0111 1111 1111 1111' + int exp_bit = (source_promote & 0x7FFF) >> mantissa_nbits_fp16; + // Extracting mantissa represented in FP16 + int mant_bit = source_promote & ((1 << mantissa_nbits_fp16) - 1); + + int new_exp_bit; + int new_mant_bit; + + if (exp_bit == 0) { + // Subnormal FP16 number. Too small for FP6. + new_exp_bit = 0; + new_mant_bit = 0; + } else { + new_mant_bit = mant_bit >> (mantissa_nbits_fp16 - mantissa_nbits_fp6); + new_exp_bit = exp_bit - exp_bias_fp16 + exp_bias_fp6; + + // Deal with subnormal FP6 values. + int target_exp_val = exp_bit - exp_bias_fp16; + int min_fp6_exp_val = -exp_bias_fp6 + 1; + bool subnormal_fp6 = target_exp_val < min_fp6_exp_val; + if (subnormal_fp6) { + // TODO(zhen): add the rounding logic. + new_exp_bit = 0; + // The implicit 1 in the mantissa of FP16 is not present in subnormal FP6. Thus we + // need to add it + new_mant_bit = (new_mant_bit | (1 << mantissa_nbits_fp6)) >> + (min_fp6_exp_val - target_exp_val); + } + } + + fp6_temp[i] = (sign_bit << (exponent_nbits_fp6 + mantissa_nbits_fp6)) | + (new_exp_bit << mantissa_nbits_fp6) | new_mant_bit; + } + // Pack the values + FP6x4[0] = fp6_temp[0] << 2 | (fp6_temp[1] >> 4); + FP6x4[1] = (fp6_temp[1] & 0x0F) << 4 | (fp6_temp[2] >> 2); + FP6x4[2] = (fp6_temp[2] & 0x03) << 6 | fp6_temp[3]; +} + +/* + * Function to prepack FP16 weights into continuous FP6 values. + * + * Parameters: + * weight_16bit: input weight in FP16, size M*K + * weight_6bit: output weight in packed FP6, continuously stored, size M*K*6/8 + * M, K: the shape of the weight + */ +void weight_prepacking_fp16_to_fp6(uint16_t* weight_16bit, + uint8_t* weight_6bit_packed, + size_t M, + size_t K) +{ + // Every four 16-bit elements are packed into three 6-bit values (4*6bit == 3*8bit). + if (K * 6 % 8 != 0) { throw std::invalid_argument("(K * 6 % 8) should be 0"); } + size_t K_fp6_packed = K * 6 / 8; + // #pragma omp parallel for + for (auto m = 0; m < M; m++) { + uint8_t* ptr_6bit = weight_6bit_packed + m * K_fp6_packed; + uint16_t* ptr_16bit = weight_16bit + m * K; + for (auto k = 0; k < K; k += 4) { + cast_fp16_fp6(ptr_16bit, ptr_6bit); + ptr_16bit += 4; + ptr_6bit += 3; + } + } +} + +} // namespace + +/* + * Function to execute the FP6 linear kernel. 
+ * + * Parameters: + * output: output tensor, size M*N + * hidden_states: input activation tensor, size N*K + * weights_2bit: packed 2bit weights, size M*K*2/8 + * weights_4bit: packed 4bit weights, size M*K*4/8 + * scales: scale tensor, size M + * workspace: workspace tensor, size M*N*split_k + * M: the output channel number of the weight + * N: the token number of the activation + * K: the input channel number of the weight + * split_k: the split size of the GEMM calculation + */ +void cuda_wf6af16_linear(torch::Tensor& output, + torch::Tensor& hidden_states, + torch::Tensor& weights_2bit, + torch::Tensor& weights_4bit, + torch::Tensor& scales, + torch::Tensor& workspace, + int M, + int N, + int K, + int split_k) +{ + TORCH_CHECK(weights_2bit.device().type() == torch::kCUDA, "weight_2bit must be on CUDA"); + TORCH_CHECK(weights_4bit.device().type() == torch::kCUDA, "weight_4bit must be on CUDA"); + TORCH_CHECK(hidden_states.device().type() == torch::kCUDA, "X must be on CUDA"); + TORCH_CHECK(scales.device().type() == torch::kCUDA, "scales must be on CUDA"); + + auto status = fp6_linear_kernel(at::cuda::getCurrentCUDAStream(), + (uint4*)(weights_2bit.data_ptr()), + (uint4*)(weights_4bit.data_ptr()), + (half*)(scales.data_ptr()), + (half*)(hidden_states.data_ptr()), + (half*)(output.data_ptr()), + M, + N, + K, + workspace.data_ptr(), + split_k); + if (status != cudaSuccess) { + AT_ERROR("fp6_linear_kernel failed with error: ", cudaGetErrorString(status)); + } +} + +/* + * Function to prepack the fake 6-bit-quantized FP16 weights into 2bit and 4bit. + * + * Parameters: + * weight: input weight in FP16 (containing the quantized FP6-ranged value), size M*K + * Returns: + * weight_2bit: output weight in 2bit, size M*K*2/8 + * weight_4bit: output weight in 4bit, size M*K*4/8 + */ +std::vector preprocess_weight(torch::Tensor& weight) +{ + TORCH_CHECK(weight.dim() == 2, "weight must be 2-dimensional"); + TORCH_CHECK(weight.scalar_type() == torch::kFloat16, "weight must be FP16"); + TORCH_CHECK(weight.is_contiguous(), "weight must be contiguous"); + TORCH_CHECK(weight.device().type() == torch::kCPU, "weight must be on CPU"); + auto M = weight.size(0); + auto K = weight.size(1); + TORCH_CHECK(K % 4 == 0, "K must be multiple of 4"); + + // Pack weight from FP16 to FP6. + uint16_t* weight_16bit_ptr = reinterpret_cast(weight.data_ptr()); + std::vector weight_6bit_packed(M * K * 6 / 8); + uint8_t* weight_6bit_ptr = weight_6bit_packed.data(); + weight_prepacking_fp16_to_fp6(weight_16bit_ptr, weight_6bit_ptr, M, K); + + // Split weight into 2bit and 4bit. + weight_matrix_prepacking(reinterpret_cast(weight_6bit_ptr), M, K); + uint8_t* weight_2bit_ptr = weight_6bit_ptr; + + // Make sure that the new split tensor does not share the underlying memory with the original + // one. Otherwise it will incur some problems when the original tensor is deleted. It also + // makes the memory flattern risky. 
+ auto weight_2bit = + torch::from_blob(weight_2bit_ptr, {M * K * 2 / 8}, torch::kUInt8).clone().detach(); + uint8_t* weight_4bit_ptr = weight_2bit_ptr + M * K * 2 / 8; + auto weight_4bit = + torch::from_blob(weight_4bit_ptr, {M * K * 4 / 8}, torch::kUInt8).clone().detach(); + + return {weight_2bit, weight_4bit}; +} diff --git a/deepspeed/inference/v2/kernels/core_ops/cuda_linear/cuda_linear_kernels.h b/deepspeed/inference/v2/kernels/core_ops/cuda_linear/cuda_linear_kernels.h new file mode 100644 index 000000000000..0f5882d519ca --- /dev/null +++ b/deepspeed/inference/v2/kernels/core_ops/cuda_linear/cuda_linear_kernels.h @@ -0,0 +1,25 @@ +// Copyright (c) Microsoft Corporation. +// SPDX-License-Identifier: Apache-2.0 + +// DeepSpeed Team + +#pragma once + +#include +#include +#include "ds_kernel_utils.h" + +#include "fp6_linear.cuh" + +void cuda_wf6af16_linear(torch::Tensor& output, + torch::Tensor& hidden_states, + torch::Tensor& weights_2bit, + torch::Tensor& weights_4bit, + torch::Tensor& scale, + torch::Tensor& workspace, + int M, + int N, + int K, + int split_k); + +std::vector preprocess_weight(torch::Tensor& Weight); diff --git a/deepspeed/inference/v2/kernels/core_ops/cuda_linear/fp6_linear.cu b/deepspeed/inference/v2/kernels/core_ops/cuda_linear/fp6_linear.cu new file mode 100644 index 000000000000..64e06a5435c6 --- /dev/null +++ b/deepspeed/inference/v2/kernels/core_ops/cuda_linear/fp6_linear.cu @@ -0,0 +1,315 @@ +// Copyright (c) Microsoft Corporation. +// SPDX-License-Identifier: Apache-2.0 + +// DeepSpeed Team + +// This is a copy of FP6-LLM kernel code: https://arxiv.org/abs/2401.14112 + +// clang-format off +// Put the torch headers at the front to avoid conflict with other headers on +// `at::nullopt` and `at::optional`. +#include +#include +// clang-format on + +#include "include/kernel_matmul.cuh" +#include "include/kernel_reduction.cuh" +#include "include/weight_prepacking.h" + +#include +#include + +template +static void Kernel_Ex(cudaStream_t stream, + const uint4* Weight1, + const uint4* Weight2, + const half* Scales, + const half* B, + OutputDataType* C, + const size_t M_Global, + const size_t N_Global, + const size_t K_Global, + int Split_K) +{ +#ifdef DEBUG_MODE + printf("\n"); + printf("Launcher.cu->Kernel_Ex():\n"); + printf("M: %d, N: %d, K: %d, SplitK: %d\n", M_Global, N_Global, K_Global, Split_K); + printf("TILE_M: %d, TILE_K: %d, TILE_N: %d\n", + TilingConfig::TILE_M, + TilingConfig::TILE_K, + TilingConfig::TILE_N); +#endif + static size_t SHMEM_SZ = + max(TilingConfig::SMEM_SIZE_B_TILE + SMEM_SIZE_A1_TILE + SMEM_SIZE_A2_TILE, + TilingConfig::SMEM_SIZE_C_TILE); + cudaFuncSetAttribute(QUANT_GEMM_Kernel, + cudaFuncAttributeMaxDynamicSharedMemorySize, + SHMEM_SZ); + size_t dimN = (N_Global - 1) / TilingConfig::TILE_N + 1; + size_t dimM = M_Global * Split_K / TilingConfig::TILE_M; + dim3 GridDim(dimN, dimM, 1); + dim3 BlockDim(WARP_SIZE * TilingConfig::BLOCK_WARPS, 1, 1); +// +#ifdef DEBUG_MODE + printf( + "GridDim.x: %d, GridDim.y: %d, GridDim.z: %d, BlockDim.x: %d, BlockDim.y: %d, BlockDim.z: " + "%d SHMEM_SZ: %d\n", + GridDim.x, + GridDim.y, + GridDim.z, + BlockDim.x, + BlockDim.y, + BlockDim.z, + SHMEM_SZ); + printf("\n"); +#endif + QUANT_GEMM_Kernel<<>>( + Weight1, Weight2, Scales, B, C, M_Global, N_Global, K_Global, Split_K); +} + +/* + * + */ +cudaError_t fp6_linear_kernel(cudaStream_t stream, + const uint4* Weight1, + const uint4* Weight2, + const half* Scales, + const half* B, + half* C, + const size_t M_Global, + const size_t N_Global, + const size_t 
K_Global, + float* Reduction_Workspace, // Reduction_Workspace_Size = Split_K * + // M_Global * N_Global * sizeof(fp32) + int Split_K) +{ + assert(M_Global % 256 == 0); + assert(K_Global % 64 == 0); + assert(N_Global > 0); + + // Work around to support more N shapes: + size_t N_PowerOf2; + if (N_Global > 0 && N_Global <= 8) N_PowerOf2 = 8; + if (N_Global > 8 && N_Global <= 16) N_PowerOf2 = 16; + if (N_Global > 16 && N_Global <= 32) N_PowerOf2 = 32; + if (N_Global > 32 && N_Global <= 64) N_PowerOf2 = 64; + if (N_Global > 64 && N_Global <= 128) N_PowerOf2 = 128; + if (N_Global > 128) N_PowerOf2 = ((N_Global - 1) / 128 + 1) * 128; + + if (Split_K == 1) { + switch (N_PowerOf2) { + case 8: + Kernel_Ex, half>( + stream, Weight1, Weight2, Scales, B, C, M_Global, N_Global, K_Global, Split_K); + break; + case 16: + Kernel_Ex, half>( + stream, Weight1, Weight2, Scales, B, C, M_Global, N_Global, K_Global, Split_K); + break; + case 32: + Kernel_Ex, half>( + stream, Weight1, Weight2, Scales, B, C, M_Global, N_Global, K_Global, Split_K); + break; + case 64: + Kernel_Ex, half>( + stream, Weight1, Weight2, Scales, B, C, M_Global, N_Global, K_Global, Split_K); + break; + case 128: + Kernel_Ex, half>( + stream, Weight1, Weight2, Scales, B, C, M_Global, N_Global, K_Global, Split_K); + break; + default: + if (N_PowerOf2 % 128 != 0) { + printf("QuantLLM_API Error: Unsupported N dimension %d!\n", N_PowerOf2); + return cudaErrorUnknown; + } + Kernel_Ex, half>( + stream, Weight1, Weight2, Scales, B, C, M_Global, N_Global, K_Global, Split_K); + break; + } + } else { + switch (N_PowerOf2) { + case 8: + Kernel_Ex, float>(stream, + Weight1, + Weight2, + Scales, + B, + Reduction_Workspace, + M_Global, + N_Global, + K_Global, + Split_K); + break; + case 16: + Kernel_Ex, float>(stream, + Weight1, + Weight2, + Scales, + B, + Reduction_Workspace, + M_Global, + N_Global, + K_Global, + Split_K); + break; + case 32: + Kernel_Ex, float>(stream, + Weight1, + Weight2, + Scales, + B, + Reduction_Workspace, + M_Global, + N_Global, + K_Global, + Split_K); + break; + case 64: + Kernel_Ex, float>(stream, + Weight1, + Weight2, + Scales, + B, + Reduction_Workspace, + M_Global, + N_Global, + K_Global, + Split_K); + break; + case 128: + Kernel_Ex, float>(stream, + Weight1, + Weight2, + Scales, + B, + Reduction_Workspace, + M_Global, + N_Global, + K_Global, + Split_K); + break; + default: + if (N_PowerOf2 % 128 != 0) { + printf("QuantLLM_API Error: Unsupported N dimension %d!\n", N_PowerOf2); + return cudaErrorUnknown; + } + Kernel_Ex, float>(stream, + Weight1, + Weight2, + Scales, + B, + Reduction_Workspace, + M_Global, + N_Global, + K_Global, + Split_K); + break; + } + // Reduction for SplitK + dim3 GridDim((M_Global * N_Global) / REDUCTION_ELEMENT_PER_THREADBLOCK, 1, 1); + dim3 BlockDim(WARP_SIZE, 1, 1); + SplitK_Reduction<<>>( + C, Reduction_Workspace, M_Global, N_Global, Split_K); + } + return cudaGetLastError(); +} + +/* +Computes FP6-FP16 GEMM (PyTorch interface). + +[Mathematical Formula] +Standard definition of linear layer: Out = In * trans(W), where In, Out, and W are stored in +row-major. After Equivalent transformation : trans(Out) = W * trans(In). Note that we do not +perform "transpose" during runtime, we instead interpret the In/Out as column-major matrices when +calling our CUDA kernel. + +[Inputs] + _in_feats: tensor of shape [B, IC]; // half + _weights: int tensor of shape [OC, IC // 16 * 3]; // 3 INT32 words contains 16 FP6 weights. 
+ _scales: tensor of shape [OC]; // half + splitK: splitting the MatMul problem along K dimension for higher GPU utilization, default 1. +[Outputs] + _out_feats: tensor of shape [B, OC]; // half +*/ +torch::Tensor fp6_linear_forward_cuda(torch::Tensor _in_feats, + torch::Tensor _weights, + torch::Tensor _scales, + int splitK = 1) +{ + int num_in_feats = _in_feats.size(0); + int num_in_channels = _in_feats.size(1); + int num_out_channels = _weights.size(0); + assert(num_in_channels % 64 == 0); + assert((num_in_channels / 16 * 3) == + _weights.size(1)); // Making sure the K dimension is matched. + // + int M = num_out_channels; + int K = num_in_channels; + int N = num_in_feats; + // Input Tensors + auto weight1 = reinterpret_cast( + _weights.data_ptr()); // weights is [OC, IC] but in FP6. + auto weight2 = weight1 + num_in_channels * num_out_channels * 2 / 128; + auto in_feats = reinterpret_cast(_in_feats.data_ptr()); + auto scales = reinterpret_cast(_scales.data_ptr()); + // Output Tensors + auto options = torch::TensorOptions().dtype(_in_feats.dtype()).device(_in_feats.device()); + at::Tensor _out_feats = torch::empty({num_in_feats, num_out_channels}, options); + auto out_feats = reinterpret_cast(_out_feats.data_ptr()); + + float* Reduction_Workspace = nullptr; + if (splitK != 1) { + auto options = torch::TensorOptions().dtype(torch::kFloat32).device(_in_feats.device()); + at::Tensor _workspace = torch::empty({splitK, num_in_feats, num_out_channels}, options); + auto Reduction_Workspace = reinterpret_cast( + _out_feats.data_ptr()); // Reduction_Workspace_Size = Split_K * M_Global * + // N_Global * sizeof(fp32) + } + + fp6_linear_kernel(0, // Using default stream here. + weight1, + weight2, + scales, + in_feats, + out_feats, + M, + N, + K, + Reduction_Workspace, + splitK); + + return _out_feats; +} + +/* + * Inputs: + * (1) unsigned char Weight_6bit [M*K*6/8] + * Outputs: + * (1) unsigned char Weight_2bit [M*K*2/8] + * (2) unsigned char Weight_4bit [M*K*4/8] + * + * Assumption: Weight_6bit, Weight_2bit, Weight_4bit all stored continuously in row-major. + * 8 FP6 = 6 Bytes + * 8 FP4 = 4 Bytes + * 8 FP2 = 2 Bytes + */ + +/* + * Weight prepacking (Pytorch interface). + * [Input & Output] + * fp6_tensor: int tensor of shape [OC, IC // 16 * 3]; // 3 INT32 words contains 16 FP6 weights. + * [Output] + * packed_tensor: int tensor of shape [OC, IC // 16 * 3]; + */ +torch::Tensor weight_matrix_prepacking_cpu(torch::Tensor fp6_tensor, size_t OC, size_t IC) +{ + assert((OC % 256 == 0) && (IC % 64 == 0)); + assert((fp6_tensor.size(0) == OC) && (fp6_tensor.size(1) == IC / 16 * 3)); + // auto packed_tensor = torch::empty_like(fp6_tensor); + // auto packed_tensor_ptr = reinterpret_cast(packed_tensor.data_ptr()); + auto fp6_tensor_ptr = reinterpret_cast(fp6_tensor.data_ptr()); + weight_matrix_prepacking(fp6_tensor_ptr, OC, IC); + return fp6_tensor; +} diff --git a/deepspeed/inference/v2/kernels/core_ops/cuda_linear/fp6_linear.cuh b/deepspeed/inference/v2/kernels/core_ops/cuda_linear/fp6_linear.cuh new file mode 100644 index 000000000000..95f7f6050c15 --- /dev/null +++ b/deepspeed/inference/v2/kernels/core_ops/cuda_linear/fp6_linear.cuh @@ -0,0 +1,46 @@ +// Copyright (c) Microsoft Corporation. +// SPDX-License-Identifier: Apache-2.0 + +// DeepSpeed Team + +// This is a copy of FP6-LLM kernel code: https://arxiv.org/abs/2401.14112 + +#include +#include +#include + +#include + +/* + * Computes FP6-FP16 GEMM (C++ interface). 
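+ *
+ * Selects a TilingConfig based on N_Global and launches Kernel_Ex; when
+ * Split_K > 1, per-split partial sums are written to Reduction_Workspace and
+ * combined by SplitK_Reduction.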
+ */ +cudaError_t fp6_linear_kernel(cudaStream_t stream, + const uint4* Weight1, + const uint4* Weight2, + const half* Scales, + const half* B, + half* C, + const size_t M_Global, + const size_t N_Global, + const size_t K_Global, + float* Reduction_Workspace, // Reduction_Workspace_Size = Split_K * + // M_Global * N_Global * sizeof(fp32) + int Split_K); + +/* + * Computes FP6-FP16 GEMM (PyTorch interface). + */ +torch::Tensor fp6_linear_forward_cuda(torch::Tensor _in_feats, + torch::Tensor _weights, + torch::Tensor _scales, + int splitK = 1); + +/* + * In-place weight prepacking (C++ interface). + */ +void weight_matrix_prepacking(int* FP6Weights, size_t M, size_t K); + +/* + * Weight prepacking (Pytorch interface). + */ +torch::Tensor weight_matrix_prepacking_cpu(torch::Tensor fp6_tensor, size_t M, size_t K); diff --git a/deepspeed/inference/v2/kernels/core_ops/cuda_linear/include/configs.h b/deepspeed/inference/v2/kernels/core_ops/cuda_linear/include/configs.h new file mode 100644 index 000000000000..76e8eda2d35e --- /dev/null +++ b/deepspeed/inference/v2/kernels/core_ops/cuda_linear/include/configs.h @@ -0,0 +1,96 @@ +// Copyright (c) Microsoft Corporation. +// SPDX-License-Identifier: Apache-2.0 + +// DeepSpeed Team + +// This is a copy of FP6-LLM kernel code: https://arxiv.org/abs/2401.14112 + +#ifndef CONFIGS_H +#define CONFIGS_H + +// #define DEBUG_MODE +#define PIPELINE_LEVEL_GMEM 2 +#define PIPELINE_LEVEL_SMEM 2 // only support 2 + +/************************ Hardware Parameters ************************/ +#define WARP_SIZE 32 +#define REG_BIT_WIDTH 32 +// mma: M=16 K=16 N=8 +#define MMA_8 8 +#define MMA_16 16 +// for memory access +#define THREAD_OPT_ACCESS_BIT_WIDTH_128 128 // LDS.128, cp_async.128, ... +#define BIT_WIDTH_PER_HALF 16 // Half precision: FP16 + +/******************** Register Allocation For GEMM ********************/ +#define REG_PER_THREAD_C_TENSOR_16_16 8 // 8 for FP32 Accumulation +/********************** Memory Padding Parameters **********************/ +// Eliminating bank-conflict +#define PADDING_BYTES_16 16 // Padding 16 bytes each column +#define PADDING_SHARED_MEM_FOR_B_8 \ + 8 // Padding 8 half each column, during CopyFromGlobalToShared() for B +#define PADDING_SHARED_MEM_FOR_C_4 \ + 4 // Padding 4 float each column, during StoreToSharedMemoryFromRegister() for C +/************************* WARP Tiling part-1 *************************/ +#define WARP_ROW_MMA_TENSORS 4 +#define WARP_M (WARP_ROW_MMA_TENSORS * MMA_16) // 64 +#define WARP_K_MMA_TENSORS 4 +#define WARP_K (WARP_K_MMA_TENSORS * MMA_16) // 64 +template +struct TilingConfig { + // Depending on "n" dimension of the GEMM + static constexpr int BLOCK_ROW_WARPS = BLOCK_ROW_WARPS_; + static constexpr int BLOCK_COL_WARPS = BLOCK_COL_WARPS_; + static constexpr int WARP_COL_MMA_TENSORS = WARP_COL_MMA_TENSORS_; + /************************* WARP Tiling part-2 *************************/ + static constexpr int WARP_N = WARP_COL_MMA_TENSORS * MMA_8; + /*************************Thread Block Tiling *************************/ + static constexpr int TILE_M = WARP_M * BLOCK_ROW_WARPS; + static constexpr int TILE_N = MMA_8 * WARP_COL_MMA_TENSORS * BLOCK_COL_WARPS; + static constexpr int TILE_K = WARP_K; + /********************** #Thread per Thread Block **********************/ + static constexpr int BLOCK_WARPS = BLOCK_ROW_WARPS * BLOCK_COL_WARPS; + static constexpr int BLOCK_THREADS = BLOCK_WARPS * WARP_SIZE; + /******************************* Others *******************************/ + static constexpr int 
SMEM_SIZE_B_TILE = TILE_N * (TILE_K + PADDING_BYTES_16) * 2 * + PIPELINE_LEVEL_GMEM; // sizeof(half)=2, doubleBuffer=2 + static constexpr int SMEM_SIZE_C_TILE = + TILE_N * (TILE_M + PADDING_BYTES_16) * 4; // sizeof(float)=4 +}; + +/************************ General Config for Quant-LLM **********************/ +#define WEIGHT_FRAG1_BIT_WIDTH 2 +#define WEIGHT_FRAG2_BIT_WIDTH 4 +#define WEIGHT_BIT_WIDTH (WEIGHT_FRAG1_BIT_WIDTH + WEIGHT_FRAG2_BIT_WIDTH) // 6 +// #define QUANT_GROUP_SIZE_DIVIDED_BY_64 4 // +// QuantGroupSize: 4*64 = 256 +/*************************** 64*64 Weghts of A WARP *************************/ +#define WEIGHT_PER_UNIT (WARP_M * WARP_K) // 64*64 +#define SMEM_SIZE_IN_BYTES_PER_WARP_A1 \ + (WEIGHT_PER_UNIT * WEIGHT_FRAG1_BIT_WIDTH / \ + 8) // 1024 Bytes #doubleBuffer not takedn into consideration +#define SMEM_SIZE_IN_BYTES_PER_WARP_A2 \ + (WEIGHT_PER_UNIT * WEIGHT_FRAG2_BIT_WIDTH / \ + 8) // 2048 Bytes #doubleBuffer not takedn into consideration +#define SMEM_SIZE_A1_TILE \ + (SMEM_SIZE_IN_BYTES_PER_WARP_A1 * 4 * \ + PIPELINE_LEVEL_GMEM) // #WARP=4, #Trible-Buffer for 3-level pipeline for A = 12 KB; double + // buffer for 2-level pipeline A= 8 KB. +#define SMEM_SIZE_A2_TILE \ + (SMEM_SIZE_IN_BYTES_PER_WARP_A2 * 4 * \ + PIPELINE_LEVEL_GMEM) // #WARP=4, #Trible-Buffer for 3-level pipeline for A = 24 KB; double + // buffer for 2-level pipeline A= 16 KB. +/******************** Global Memory Layout For QUANTIZED DATA ******************/ +#define NUM_INT4_PER_UNIT_2BIT_FRAG (WEIGHT_PER_UNIT * WEIGHT_FRAG1_BIT_WIDTH / 128) // 64 +#define NUM_INT4_PER_UNIT_4BIT_FRAG (WEIGHT_PER_UNIT * WEIGHT_FRAG2_BIT_WIDTH / 128) // 128 +/******************** Register Allocation For QUANTIZED DATA ******************/ +#define WEIGHT_PER_THREAD (WEIGHT_PER_UNIT / WARP_SIZE) // 128 +#define REG_PER_THREAD_2BIT_FRAG (WEIGHT_PER_THREAD / REG_BIT_WIDTH * 2) // 8 +#define REG_PER_THREAD_4BIT_FRAG (WEIGHT_PER_THREAD / REG_BIT_WIDTH * 4) // 16 +/******************** Register Allocation For QUANT Scales ******************/ +#define WARP_REG_QUANT_SCALE 4 // 8 rows per thread -> 8 FP16 scales -> 4 registers +#define WARP_REG_QUANT_SCALE_DISTRIBUTED \ + 1 // T0-T3, T4-T7, ..., T28-T31 share the same scales, using shfl to get all the scales for + // each thread + +#endif // CONFIGS_H diff --git a/deepspeed/inference/v2/kernels/core_ops/cuda_linear/include/kernel_matmul.cuh b/deepspeed/inference/v2/kernels/core_ops/cuda_linear/include/kernel_matmul.cuh new file mode 100644 index 000000000000..aa6ea6c4b1c2 --- /dev/null +++ b/deepspeed/inference/v2/kernels/core_ops/cuda_linear/include/kernel_matmul.cuh @@ -0,0 +1,261 @@ +// Copyright (c) Microsoft Corporation. 
+// SPDX-License-Identifier: Apache-2.0 + +// DeepSpeed Team + +// This is a copy of FP6-LLM kernel code: https://arxiv.org/abs/2401.14112 + +#include "configs.h" +#include "utils_core.cuh" +#include "utils_gmem.cuh" + +/* + * C = A*B + * A: row major with ahead-of-time layout transformation, FP6 + * B: col major, FP16 + * C: col major, FP16 + */ +template +__global__ void QUANT_GEMM_Kernel(const uint4* Weight1, + const uint4* Weight2, + const half* Scales, + const half* B, + OutputDataType* C, + const size_t M_Global, + const size_t N_Global, + const size_t K_Global, + int Split_K) +{ +#ifdef DEBUG_MODE + assert(K_Global % TilingConfig::TILE_K == 0); + assert(M_Global % TilingConfig::TILE_M == 0); + assert(gridDim.y == Split_K * (M_Global / TilingConfig::TILE_M)); +#endif + extern __shared__ __align__(128) + half smem[]; // Dynamic shared memory for FP16 A tiles, 128 Bytes aligned + half(*smem_array)[WARP_K + PADDING_SHARED_MEM_FOR_B_8] = + reinterpret_cast( + smem + + (SMEM_SIZE_A1_TILE + SMEM_SIZE_A2_TILE) / 2); // Dynamic shared memory for FP16 B tiles + __shared__ half QuantScales[64 * TilingConfig::BLOCK_WARPS]; // static shared memory for + // quantization scales, 64 row per + // warp * 4 warps = 512 Bytes + // Thread Block Mapping, considering SplitK + const size_t BatchID = blockIdx.y / (M_Global / TilingConfig::TILE_M); + const size_t x = blockIdx.x; // Output Block ID: (BlockID_Row = y; BlockID_Col = x ) + const size_t y = + blockIdx.y % + (M_Global / TilingConfig::TILE_M); // Output Block ID: (BlockID_Row = y; BlockID_Col = x ) + const size_t Tile_Start_M = y * TilingConfig::TILE_M; + const size_t Tile_Start_N = x * TilingConfig::TILE_N; + const size_t NumColumnToCopy = (N_Global - Tile_Start_N) < TilingConfig::TILE_N + ? (N_Global - Tile_Start_N) + : TilingConfig::TILE_N; + const size_t NumBlock_K = K_Global / TilingConfig::TILE_K; + const size_t AverageNumBlock_K = NumBlock_K / Split_K; + const size_t ExtraNumBlock_K = NumBlock_K - AverageNumBlock_K * Split_K; + size_t NumIter = AverageNumBlock_K; + if (BatchID < ExtraNumBlock_K) NumIter++; + size_t StartBlockID_K = AverageNumBlock_K * BatchID; + if (BatchID < ExtraNumBlock_K) + StartBlockID_K += BatchID; + else + StartBlockID_K += ExtraNumBlock_K; + // Warp ID. + const int warpId = threadIdx.x / WARP_SIZE; + int WARP_i = + warpId / TilingConfig::BLOCK_COL_WARPS; // WARP_i: row number; WARP_j: column number + // int WARP_j = warpId % TilingConfig::BLOCK_COL_WARPS; + // Global Memory Address for Matrix A (Weight) + // ///////////////////////////////////////////////////////////////////////// StartPTR for each + // ThreadBlock(TB) + const uint4* TB_StartGPTR_A1 = + Weight1 + (y * TilingConfig::BLOCK_ROW_WARPS) * NumBlock_K * NUM_INT4_PER_UNIT_2BIT_FRAG; + const uint4* TB_StartGPTR_A2 = + Weight2 + (y * TilingConfig::BLOCK_ROW_WARPS) * NumBlock_K * NUM_INT4_PER_UNIT_4BIT_FRAG; + // StartPTR for each WARP. 
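+ // (WARP_i is this warp's row index within the thread block; each warp owns a
+ // 64-row slice of the A tile.)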
+ const uint4* WARP_StartGPTR_A1 = + TB_StartGPTR_A1 + WARP_i * NumBlock_K * NUM_INT4_PER_UNIT_2BIT_FRAG; + const uint4* WARP_StartGPTR_A2 = + TB_StartGPTR_A2 + WARP_i * NumBlock_K * NUM_INT4_PER_UNIT_4BIT_FRAG; + // StartPTR for each WARP, considering SplitK + const size_t WARP_Start_UnitID_K = StartBlockID_K; + WARP_StartGPTR_A1 += WARP_Start_UnitID_K * NUM_INT4_PER_UNIT_2BIT_FRAG; + WARP_StartGPTR_A2 += WARP_Start_UnitID_K * NUM_INT4_PER_UNIT_4BIT_FRAG; + // Copying A tile from Global to Shared, using double-buffer + // ////////////////////////////////////////////////////////// StartSPTR for each ThreadBlock + uint32_t* AFrag_2BIT_SPTR = reinterpret_cast(smem); + uint32_t* AFrag_4BIT_SPTR = + AFrag_2BIT_SPTR + + SMEM_SIZE_IN_BYTES_PER_WARP_A1 / 4 * TilingConfig::BLOCK_WARPS * + PIPELINE_LEVEL_GMEM; // 8 buffers including double buffers, 12 for trible buffers + // StartSPTR for each WARP + AFrag_2BIT_SPTR += warpId * SMEM_SIZE_IN_BYTES_PER_WARP_A1 / 4; + AFrag_4BIT_SPTR += warpId * SMEM_SIZE_IN_BYTES_PER_WARP_A2 / 4; + // Pre-fetch of A tile + for (int i = 0; i < PIPELINE_LEVEL_GMEM - 1; i++) { + CopyFromGlobalToShared_A( + AFrag_2BIT_SPTR + i * SMEM_SIZE_IN_BYTES_PER_WARP_A1 / 4 * 4, WARP_StartGPTR_A1); + CopyFromGlobalToShared_A( + AFrag_4BIT_SPTR + i * SMEM_SIZE_IN_BYTES_PER_WARP_A2 / 4 * 4, WARP_StartGPTR_A2); + WARP_StartGPTR_A1 += SMEM_SIZE_IN_BYTES_PER_WARP_A1 / 16; + WARP_StartGPTR_A2 += SMEM_SIZE_IN_BYTES_PER_WARP_A2 / 16; + } + // Global Memory Address for Matrix A (QuantScale) + // ///////////////////////////////////////////////////////////////////// + const half* TB_StartGPTR_A_Scale = Scales + (y * TilingConfig::BLOCK_ROW_WARPS) * 64; + const half* WARP_StartGPTR_A_Scales = TB_StartGPTR_A_Scale + WARP_i * 64; + CopyFromGlobalToShared_Scales(QuantScales + WARP_i * 64, WARP_StartGPTR_A_Scales); + // Copying B tile from Global to Shared, considering SplitK + // ///////////////////////////////////////////////////////////// + const half* BTile_GPTR = B + Tile_Start_N * K_Global + StartBlockID_K * TilingConfig::TILE_K; + for (int i = 0; i < PIPELINE_LEVEL_GMEM - 1; i++) { + CopyFromGlobalToShared( + smem_array + i * TilingConfig::TILE_N, BTile_GPTR, K_Global, NumColumnToCopy); + BTile_GPTR += TilingConfig::TILE_K; + } + // Register Allocation for A,B, and C, Initilazed to Zeros + // ///////////////////////////////////////////////////////////////////// + constexpr int NumRegSets_a = + WARP_ROW_MMA_TENSORS; // 1 set = 4 registers, containing a 16*16 MMA block + constexpr int NumRegSets_b = (TilingConfig::WARP_COL_MMA_TENSORS == 1) + ? 
1 + : TilingConfig::WARP_COL_MMA_TENSORS / + 2; // 1 set = 4 registers, containing a 16*16 MMA block +#ifdef PIPELINE_LEVEL_SMEM + uint32_t a[NumRegSets_a * PIPELINE_LEVEL_SMEM] + [4]; // double/Trible buffer is used // Registers to store decompressed FP6 + uint32_t b[NumRegSets_b * PIPELINE_LEVEL_SMEM] + [4]; // double/Triple buffer is used // Register to store FP16 B matrix (a slice) +#endif + float c[NumRegSets_a * NumRegSets_b][REG_PER_THREAD_C_TENSOR_16_16]; + for (int i = 0; i < NumRegSets_a * NumRegSets_b; i++) + for (int j = 0; j < REG_PER_THREAD_C_TENSOR_16_16; j++) c[i][j] = 0.0f; + // + cp_async_wait_all(); + __syncthreads(); + + ///////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////// + uint32_t Scales_RPTR[4]; // 4 Registers per thread for Quantization Scales + ExtractFromSharedToReg_Scales(Scales_RPTR, QuantScales + WARP_i * 64); +#ifdef PIPELINE_LEVEL_SMEM + // Initializing the Software Pipeline: writing registers. + // //////////////////////////////////////////////////////////////////////////////////////////////// + initialize_mma_slice( + a, b, AFrag_2BIT_SPTR, AFrag_4BIT_SPTR, smem_array, Scales_RPTR); +#endif +// The outer loop. +// ///////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////// +#pragma unroll(1) + for (size_t tile_id_k = 0; tile_id_k < NumIter; tile_id_k++) { + // Trible-Buffer for A Tile + uint32_t* __restrict__ read_SPTR_Frag1 = + AFrag_2BIT_SPTR + ((tile_id_k + 0) % PIPELINE_LEVEL_GMEM) * + SMEM_SIZE_IN_BYTES_PER_WARP_A1 / 4 * + 4; // 1024 (1)*4: 4 WARPs; (2)/4: int*+1 = char*+16 + uint32_t* __restrict__ read_SPTR_Frag2 = + AFrag_4BIT_SPTR + ((tile_id_k + 0) % PIPELINE_LEVEL_GMEM) * + SMEM_SIZE_IN_BYTES_PER_WARP_A2 / 4 * + 4; // 2048 (1)*4: 4 WARPs; (2)/4: int*+1 = char*+16 +#ifdef PIPELINE_LEVEL_SMEM + uint32_t* __restrict__ read2_SPTR_Frag1 = + AFrag_2BIT_SPTR + + ((tile_id_k + 1) % PIPELINE_LEVEL_GMEM) * SMEM_SIZE_IN_BYTES_PER_WARP_A1 / 4 * 4; + uint32_t* __restrict__ read2_SPTR_Frag2 = + AFrag_4BIT_SPTR + + ((tile_id_k + 1) % PIPELINE_LEVEL_GMEM) * SMEM_SIZE_IN_BYTES_PER_WARP_A2 / 4 * 4; +#endif + uint32_t* __restrict__ write_SPTR_Frag1 = + AFrag_2BIT_SPTR + ((tile_id_k + (PIPELINE_LEVEL_GMEM - 1)) % PIPELINE_LEVEL_GMEM) * + SMEM_SIZE_IN_BYTES_PER_WARP_A1 / 4 * + 4; // 1024 (1)*4: 4 WARPs; (2)/4: int*+1 = char*+16 + uint32_t* __restrict__ write_SPTR_Frag2 = + AFrag_4BIT_SPTR + ((tile_id_k + (PIPELINE_LEVEL_GMEM - 1)) % PIPELINE_LEVEL_GMEM) * + SMEM_SIZE_IN_BYTES_PER_WARP_A2 / 4 * + 4; // 2048 (1)*4: 4 WARPs; (2)/4: int*+1 = char*+16 + // Trible-Buffer for B Tile + half __restrict__(*read_SPTR)[WARP_K + PADDING_SHARED_MEM_FOR_B_8] = + smem_array + ((tile_id_k + 0) % PIPELINE_LEVEL_GMEM) * TilingConfig::TILE_N; +#ifdef PIPELINE_LEVEL_SMEM + half __restrict__(*read2_SPTR)[WARP_K + PADDING_SHARED_MEM_FOR_B_8] = + smem_array + ((tile_id_k + 1) % PIPELINE_LEVEL_GMEM) * TilingConfig::TILE_N; +#endif + half __restrict__(*write_SPTR)[WARP_K + PADDING_SHARED_MEM_FOR_B_8] = + smem_array + + ((tile_id_k + (PIPELINE_LEVEL_GMEM - 1)) % PIPELINE_LEVEL_GMEM) * TilingConfig::TILE_N; + // + bool GlobalCopy = (tile_id_k + PIPELINE_LEVEL_GMEM - 1) < NumIter; + // Copying A tile from Global to Register, Bypassing L1, using double-buffer + CopyFromGlobalToShared_A( + write_SPTR_Frag1, WARP_StartGPTR_A1, GlobalCopy); + CopyFromGlobalToShared_A( + write_SPTR_Frag2, WARP_StartGPTR_A2, GlobalCopy); + // copying B tile from 
GlobalMemory to SharedMemory + CopyFromGlobalToShared( + write_SPTR, BTile_GPTR, K_Global, NumColumnToCopy, GlobalCopy); + cp_async_group_commit(); +#ifdef PIPELINE_LEVEL_SMEM + core_mma_slice(c, + a, + b, + read_SPTR_Frag1, + read_SPTR_Frag2, + read_SPTR, + Scales_RPTR, + 1); // read_SPTR_Frag1, read_SPTR_Frag2 are different for each + // WARP; read_SPTR is shared among WARPs + core_mma_slice( + c, a, b, read_SPTR_Frag1, read_SPTR_Frag2, read_SPTR, Scales_RPTR, 2); + core_mma_slice( + c, a, b, read_SPTR_Frag1, read_SPTR_Frag2, read_SPTR, Scales_RPTR, 3); + // Barriers and Synchronizations + cp_async_wait_group(); + __syncthreads(); + core_mma_slice( + c, a, b, read2_SPTR_Frag1, read2_SPTR_Frag2, read2_SPTR, Scales_RPTR, 0); + // Updating global PTRs + WARP_StartGPTR_A1 += + SMEM_SIZE_IN_BYTES_PER_WARP_A1 / 16; // 4KB/16=256 (1)/16: int4*+1 = char*+16 + WARP_StartGPTR_A2 += + SMEM_SIZE_IN_BYTES_PER_WARP_A2 / 16; // 8KB/16=512 (1)/16: int4*+1 = char*+16 + BTile_GPTR += TilingConfig::TILE_K; +#else + PipelinedCoreLoop( + c, + read_SPTR, + read_SPTR_Frag1, + read_SPTR_Frag2, + Scales_RPTR); // read_SPTR_Frag1, read_SPTR_Frag2 are different for each WARP; + // read_SPTR is shared among WARPs + // Updating global PTRs + WARP_StartGPTR_A1 += + SMEM_SIZE_IN_BYTES_PER_WARP_A1 / 16; // 4KB/16=256 (1)/16: int4*+1 = char*+16 + WARP_StartGPTR_A2 += + SMEM_SIZE_IN_BYTES_PER_WARP_A2 / 16; // 8KB/16=512 (1)/16: int4*+1 = char*+16 + BTile_GPTR += TilingConfig::TILE_K; + // Barriers and Synchronizations + cp_async_wait_group(); + __syncthreads(); +#endif + } + ///////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////// + ///////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////// + // Store the C fragments to shared memory. + float(*smem_CFrag)[TilingConfig::TILE_M + PADDING_SHARED_MEM_FOR_C_4] = + reinterpret_cast(smem); + StoreToSharedMemoryFromRegister(smem_CFrag, c); + __syncthreads(); + // Now that shared memory contains all the D tiles, stream them to global memory. + OutputDataType* BlockGlobalPTR = + C + BatchID * (M_Global * N_Global) + Tile_Start_M + Tile_Start_N * M_Global; + for (size_t i = warpId; i < NumColumnToCopy; i += TilingConfig::BLOCK_WARPS) // i-th column +#pragma unroll + for (size_t j = threadIdx.x % WARP_SIZE; j < TilingConfig::TILE_M; + j += WARP_SIZE) // j-th row + { + if constexpr (std::is_same::value) + BlockGlobalPTR[j + i * M_Global] = __float2half_rn(smem_CFrag[i][j]); + else + BlockGlobalPTR[j + i * M_Global] = smem_CFrag[i][j]; + } +} diff --git a/deepspeed/inference/v2/kernels/core_ops/cuda_linear/include/kernel_reduction.cuh b/deepspeed/inference/v2/kernels/core_ops/cuda_linear/include/kernel_reduction.cuh new file mode 100644 index 000000000000..8c49f8b0b3a5 --- /dev/null +++ b/deepspeed/inference/v2/kernels/core_ops/cuda_linear/include/kernel_reduction.cuh @@ -0,0 +1,38 @@ +// Copyright (c) Microsoft Corporation. 
+// SPDX-License-Identifier: Apache-2.0 + +// DeepSpeed Team + +// This is a copy of FP6-LLM kernel code: https://arxiv.org/abs/2401.14112 + +#include +#include +#include + +#define REDUCTION_ELEMENT_PER_THREADBLOCK 256 +#define HALF_PER_128BIT 8 + +__global__ void SplitK_Reduction(half* C, + float* Reduction_Workspace, + size_t M_Global, + size_t N_Global, + int Split_K) +{ + half* WARP_GPTR_C = C + REDUCTION_ELEMENT_PER_THREADBLOCK * blockIdx.x; + float* WARP_GPTR_R = Reduction_Workspace + REDUCTION_ELEMENT_PER_THREADBLOCK * blockIdx.x; + half* THREAD_GPTR_C = WARP_GPTR_C + threadIdx.x * HALF_PER_128BIT; + float* THREAD_GPTR_R = WARP_GPTR_R + threadIdx.x * HALF_PER_128BIT; + // Initializing Thread-Local Results + float Results[HALF_PER_128BIT]; +#pragma unroll + for (int i = 0; i < HALF_PER_128BIT; i++) Results[i] = 0.0f; + // Reduction + for (int i = 0; i < Split_K; i++) { +#pragma unroll + for (int j = 0; j < HALF_PER_128BIT; j++) Results[j] += THREAD_GPTR_R[j]; + THREAD_GPTR_R += M_Global * N_Global; + } +// Writing to global memory +#pragma unroll + for (int i = 0; i < HALF_PER_128BIT; i++) THREAD_GPTR_C[i] = __float2half_rn(Results[i]); +} diff --git a/deepspeed/inference/v2/kernels/core_ops/cuda_linear/include/ptx_cp.async.cuh b/deepspeed/inference/v2/kernels/core_ops/cuda_linear/include/ptx_cp.async.cuh new file mode 100644 index 000000000000..7f36cfd5d961 --- /dev/null +++ b/deepspeed/inference/v2/kernels/core_ops/cuda_linear/include/ptx_cp.async.cuh @@ -0,0 +1,52 @@ +// Copyright (c) Microsoft Corporation. +// SPDX-License-Identifier: Apache-2.0 + +// DeepSpeed Team + +// This is a copy of FP6-LLM kernel code: https://arxiv.org/abs/2401.14112 + +#ifndef PTX_CP_ASYNC_CUH +#define PTX_CP_ASYNC_CUH + +#include +#include +#include + +template +__device__ __forceinline__ void cp_async(half* smem_ptr, + const half* global_ptr, + bool pred_guard = true) +{ + static_assert(SizeInBytes == 16, "Size is not supported"); + unsigned smem_int_ptr = __cvta_generic_to_shared(smem_ptr); + asm volatile( + "{ \n" + " .reg .pred p;\n" + " setp.ne.b32 p, %0, 0;\n" + " @p cp.async.cg.shared.global [%1], [%2], %3;\n" + "}\n" ::"r"((int)pred_guard), + "r"(smem_int_ptr), + "l"(global_ptr), + "n"(SizeInBytes)); +} + +/// Establishes an ordering w.r.t previously issued cp.async instructions. Does not block. +__device__ __forceinline__ void cp_async_group_commit() +{ + asm volatile("cp.async.commit_group;\n" ::); +} + +/// Blocks until all but previous cp.async.commit_group operations have committed. +template +__device__ __forceinline__ void cp_async_wait_group() +{ + asm volatile("cp.async.wait_group %0;\n" ::"n"(N)); +} + +/// Blocks until all previous cp.async.commit_group operations have committed. +// cp.async.wait_all is equivalent to : +// cp.async.commit_group; +// cp.async.wait_group 0; +__device__ __forceinline__ void cp_async_wait_all() { asm volatile("cp.async.wait_all;\n" ::); } + +#endif diff --git a/deepspeed/inference/v2/kernels/core_ops/cuda_linear/include/ptx_mma.cuh b/deepspeed/inference/v2/kernels/core_ops/cuda_linear/include/ptx_mma.cuh new file mode 100644 index 000000000000..f13abe036279 --- /dev/null +++ b/deepspeed/inference/v2/kernels/core_ops/cuda_linear/include/ptx_mma.cuh @@ -0,0 +1,125 @@ +// Copyright (c) Microsoft Corporation. 
+// SPDX-License-Identifier: Apache-2.0 + +// DeepSpeed Team + +// This is a copy of FP6-LLM kernel code: https://arxiv.org/abs/2401.14112 + +#ifndef PTX_MMA_CUH +#define PTX_MMA_CUH + +#include +#include +#include + +#include +#include "configs.h" + +#ifdef PIPELINE_LEVEL_SMEM +template +__device__ __forceinline__ void B_FromSharedToReg( + uint32_t __restrict__ Reg[][4], + half __restrict__ (*read_SPTR)[WARP_K + PADDING_SHARED_MEM_FOR_B_8], + int slice_id) +{ +#ifdef DEBUG_MODE + static_assert((TilingConfig::WARP_COL_MMA_TENSORS == 1) || + (TilingConfig::WARP_COL_MMA_TENSORS % 2 == 0)); +#endif + + const int warpId = threadIdx.x / WARP_SIZE; + int lane_id = threadIdx.x % WARP_SIZE; + int WARP_j = warpId % TilingConfig::BLOCK_COL_WARPS; + int warp_start_col = TilingConfig::WARP_COL_MMA_TENSORS * MMA_8 * + WARP_j; // each warp may start from reading warp_start_col'th column of + // the B tile in shared memory +#ifdef DEBUG_MODE + assert(warp_start_col == 0); +#endif + + int col = (lane_id % 8) + (lane_id / 16) * 8; + int row = (lane_id % 16) / 8 * 8; + uint32_t smem_local_ptr = static_cast( + __cvta_generic_to_shared(&read_SPTR[warp_start_col + col][slice_id * MMA_16 + row])); + if (TilingConfig::WARP_COL_MMA_TENSORS == 1) { + asm volatile("ldmatrix.sync.aligned.x2.m8n8.shared.b16 {%0, %1}, [%2];\n" + : "=r"(Reg[0][0]), "=r"(Reg[0][1]) + : "r"(smem_local_ptr)); + } else { +#pragma unroll + for (int i = 0; i < TilingConfig::WARP_COL_MMA_TENSORS / 2; i++) { + asm volatile("ldmatrix.sync.aligned.x4.m8n8.shared.b16 {%0, %1, %2, %3}, [%4];\n" + : "=r"(Reg[i][0]), "=r"(Reg[i][1]), "=r"(Reg[i][2]), "=r"(Reg[i][3]) + : "r"(smem_local_ptr)); + smem_local_ptr += 16 * (WARP_K + PADDING_SHARED_MEM_FOR_B_8) * sizeof(half); + } + } +} +#else +// Debug: Whether ldmatrix.trans is required??? 
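+// Note: this fallback path is only compiled when PIPELINE_LEVEL_SMEM is not
+// defined; configs.h defines PIPELINE_LEVEL_SMEM, so the branch above is the
+// one normally used.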
+// B is in column-major +template +__device__ __forceinline__ void B_FromSharedToReg( + uint32_t __restrict__ Reg[][4], + half __restrict__ (*read_SPTR)[WARP_K + PADDING_SHARED_MEM_FOR_B_8], + int k_offset) +{ +#ifdef DEBUG_MODE + static_assert((TilingConfig::WARP_COL_MMA_TENSORS == 1) || + (TilingConfig::WARP_COL_MMA_TENSORS % 2 == 0)); +#endif + + const int warpId = threadIdx.x / WARP_SIZE; + int lane_id = threadIdx.x % WARP_SIZE; + int WARP_j = warpId % TilingConfig::BLOCK_COL_WARPS; + int warp_start_col = TilingConfig::WARP_COL_MMA_TENSORS * MMA_8 * + WARP_j; // each warp may start from reading warp_start_col'th column of + // the B tile in shared memory +#ifdef DEBUG_MODE + assert(warp_start_col == 0); +#endif + + int col = (lane_id % 8) + (lane_id / 16) * 8; + int row = (lane_id % 16) / 8 * 8; + uint32_t smem_local_ptr = static_cast( + __cvta_generic_to_shared(&read_SPTR[warp_start_col + col][k_offset + row])); + if (TilingConfig::WARP_COL_MMA_TENSORS == 1) { + asm volatile("ldmatrix.sync.aligned.x2.m8n8.shared.b16 {%0, %1}, [%2];\n" + : "=r"(Reg[0][0]), "=r"(Reg[0][1]) + : "r"(smem_local_ptr)); + } else { +#pragma unroll + for (int i = 0; i < TilingConfig::WARP_COL_MMA_TENSORS / 2; i++) { + asm volatile("ldmatrix.sync.aligned.x4.m8n8.shared.b16 {%0, %1, %2, %3}, [%4];\n" + : "=r"(Reg[i][0]), "=r"(Reg[i][1]), "=r"(Reg[i][2]), "=r"(Reg[i][3]) + : "r"(smem_local_ptr)); + smem_local_ptr += 16 * (WARP_K + PADDING_SHARED_MEM_FOR_B_8) * sizeof(half); + } + } +} +#endif + +__device__ __forceinline__ void MMA_FP16_M16N8K16(uint32_t __restrict__ c[], + uint32_t __restrict__* a, + uint32_t __restrict__* b) +{ + asm volatile( + "mma.sync.aligned.m16n8k16.row.col.f32.f16.f16.f32" + "{ %0, %1, %2, %3}," + "{ %4, %5, %6, %7 }," + "{ %8, %9 }," + "{ %10, %11, %12, %13 };" + : "=r"(c[0]), "=r"(c[1]), "=r"(c[2]), "=r"(c[3]) + : "r"(a[0]), + "r"(a[1]), + "r"(a[2]), + "r"(a[3]), + "r"(b[0]), + "r"(b[1]), + "r"(c[0]), + "r"(c[1]), + "r"(c[2]), + "r"(c[3])); +} + +#endif diff --git a/deepspeed/inference/v2/kernels/core_ops/cuda_linear/include/utils_core.cuh b/deepspeed/inference/v2/kernels/core_ops/cuda_linear/include/utils_core.cuh new file mode 100644 index 000000000000..713cebc57e33 --- /dev/null +++ b/deepspeed/inference/v2/kernels/core_ops/cuda_linear/include/utils_core.cuh @@ -0,0 +1,246 @@ +// Copyright (c) Microsoft Corporation. 
+// SPDX-License-Identifier: Apache-2.0 + +// DeepSpeed Team + +// This is a copy of FP6-LLM kernel code: https://arxiv.org/abs/2401.14112 + +#ifndef UTILS_CORE_CUH +#define UTILS_CORE_CUH + +#include + +#include "configs.h" +#include "ptx_mma.cuh" +#include "utils_paralleldequant.cuh" + +#ifdef PIPELINE_LEVEL_SMEM +template +__device__ __forceinline__ void CopyFromSharedToRegister_AFrag(uint32_t Reg[], + uint32_t* SPTR, + int slice_id) +{ + SPTR += slice_id * (NUM_INT_PER_THREAD * WARP_SIZE); + int lane_id = threadIdx.x % WARP_SIZE; +#pragma unroll + for (int i = 0; i < NUM_INT_PER_THREAD; i++) { Reg[i] = SPTR[lane_id + i * WARP_SIZE]; } +} + +template +__device__ __forceinline__ void initialize_mma_slice( + uint32_t (*a)[4], + uint32_t (*b)[4], + uint32_t* __restrict__ A1_SPTR_read, + uint32_t* __restrict__ A2_SPTR_read, + half __restrict__ (*B_SPTR_read)[WARP_K + PADDING_SHARED_MEM_FOR_B_8], + uint32_t* RPTR_Scales) +{ + // Writing registers + // Registers to store FP6 fragments for a slice (64*16) of A matrix => 32 FP6 per thread => 6 + // register per thread; + uint32_t a_1[2]; // NO double buffer + uint32_t a_2[4]; // NO double buffer + CopyFromSharedToRegister_AFrag<2>(a_1, A1_SPTR_read, 0); + CopyFromSharedToRegister_AFrag<4>(a_2, A2_SPTR_read, 0); + Dequant_32FP6_4Way(a, a_1, a_2, RPTR_Scales); // SIMT Dequant: dequantizing FP6 to FP16 at + // register level, dequantizing a slice each time + B_FromSharedToReg(b, B_SPTR_read, 0); // Loading B from shared to registers +} + +template +__device__ __forceinline__ void core_mma_slice( + float c[][REG_PER_THREAD_C_TENSOR_16_16], + uint32_t (*a)[4], + uint32_t (*b)[4], + uint32_t* __restrict__ A1_SPTR_read, + uint32_t* __restrict__ A2_SPTR_read, + half __restrict__ (*B_SPTR_read)[WARP_K + PADDING_SHARED_MEM_FOR_B_8], + uint32_t* RPTR_Scales, + int slice_id) // writing slice[slice_id] to registers, k=0 -> slice_id=1 for prefetching +{ +#ifdef DEBUG_MODE + assert( + (TilingConfig::WARP_COL_MMA_TENSORS == 1) || + (TilingConfig::WARP_COL_MMA_TENSORS % 2 == + 0)); // if WARP_COL_MMA_TENSORS == 1, B tile in registers is padded to a 16*16 MMA block +#endif + const int NumRegSets_a = + WARP_ROW_MMA_TENSORS; // 1 set = 4 registers, containing a 16*16 MMA block + const int NumRegSets_b = (TilingConfig::WARP_COL_MMA_TENSORS == 1) + ? 
1 + : TilingConfig::WARP_COL_MMA_TENSORS / + 2; // 1 set = 4 registers, containing a 16*16 MMA block + uint32_t(*c_uint_ptr)[REG_PER_THREAD_C_TENSOR_16_16] = + reinterpret_cast( + c); // Registers for accumulated FP32 results + + // Setting RPTRs for double buffers + uint32_t(*a_read)[4] = a; + uint32_t(*a_write)[4] = a; + uint32_t(*b_read)[4] = b; + uint32_t(*b_write)[4] = b; + if (slice_id % 2 == 1) { + b_write += NumRegSets_b; + a_write += NumRegSets_a; + } else { + b_read += NumRegSets_b; + a_read += NumRegSets_a; + } + +// Reading registers and issuing core tensor core computations (a slice of A and B tile in shared +// memory) +#pragma unroll + for (int i = 0; i < WARP_ROW_MMA_TENSORS; i++) { + if (TilingConfig::WARP_COL_MMA_TENSORS == 1) { + MMA_FP16_M16N8K16(c_uint_ptr[i], a_read[i], b_read[0]); + } else { +#pragma unroll + for (int j = 0; j < TilingConfig::WARP_COL_MMA_TENSORS / 2; j++) { + MMA_FP16_M16N8K16(c_uint_ptr[i + j * WARP_ROW_MMA_TENSORS], a_read[i], b_read[j]); + MMA_FP16_M16N8K16(c_uint_ptr[i + j * WARP_ROW_MMA_TENSORS] + 4, + a_read[i], + b_read[j] + 2); // c+4; b+2 + } + } + } + + // Writing registers + // Registers to store FP6 fragments for a slice (64*16) of A matrix => 32 FP6 per thread => 6 + // register per thread; + uint32_t a_1[2]; // NO double buffer + uint32_t a_2[4]; // NO double buffer + CopyFromSharedToRegister_AFrag<2>(a_1, A1_SPTR_read, slice_id); + CopyFromSharedToRegister_AFrag<4>(a_2, A2_SPTR_read, slice_id); + Dequant_32FP6_4Way( + a_write, a_1, a_2, RPTR_Scales); // SIMT Dequant: dequantizing FP6 to FP16 at register + // level, dequantizing a slice each time + B_FromSharedToReg( + b_write, B_SPTR_read, slice_id); // Loading B from shared to registers +} + +#else +// Old version with naive pipeline design +template +__device__ __forceinline__ void CopyFromSharedToRegister_AFrag(uint32_t Reg[], uint32_t* SPTR) +{ + int lane_id = threadIdx.x % WARP_SIZE; +#pragma unroll + for (int i = 0; i < NUM_INT_PER_THREAD; i++) { Reg[i] = SPTR[lane_id + i * WARP_SIZE]; } +} +template +__device__ __forceinline__ void PipelinedCoreLoop( + float c[][REG_PER_THREAD_C_TENSOR_16_16], + half __restrict__ (*read_SPTR)[WARP_K + PADDING_SHARED_MEM_FOR_B_8], + uint32_t* __restrict__ read_SPTR_Frag1, + uint32_t* __restrict__ read_SPTR_Frag2, + uint32_t* RPTR_Scales) +{ +#ifdef DEBUG_MODE + assert( + (TilingConfig::WARP_COL_MMA_TENSORS == 1) || + (TilingConfig::WARP_COL_MMA_TENSORS % 2 == + 0)); // if WARP_COL_MMA_TENSORS == 1, B tile in registers is padded to a 16*16 MMA block +#endif + const int NumRegSets_a = + WARP_ROW_MMA_TENSORS; // 1 set = 4 registers, containing a 16*16 MMA block + const int NumRegSets_b = (TilingConfig::WARP_COL_MMA_TENSORS == 1) + ? 
1 + : TilingConfig::WARP_COL_MMA_TENSORS / + 2; // 1 set = 4 registers, containing a 16*16 MMA block + + // Registers to store FP32 results + uint32_t(*c_uint_ptr)[REG_PER_THREAD_C_TENSOR_16_16] = + reinterpret_cast(c); + // Registers to store FP6 fragments for a slice (64*16) of A matrix => 32 FP6 per thread => 6 + // register per thread; + uint32_t a_1[2 * 2]; // double buffer is used + uint32_t a_2[4 * 2]; // double buffer is used + // Registers to store decompressed FP6 + uint32_t a[NumRegSets_a * 1][4]; // No double buffer + // Register to store FP16 B matrix (a slice) + uint32_t b[NumRegSets_b * 2][4]; // double buffer is used + + // Overlapped Smem and TC pipeline: pre-loading from shared to registers + CopyFromSharedToRegister_AFrag<2>(a_1, read_SPTR_Frag1); + CopyFromSharedToRegister_AFrag<4>(a_2, read_SPTR_Frag2); + B_FromSharedToReg(b, read_SPTR, 0); + +#pragma unroll + for (int k = 0; k < WARP_K_MMA_TENSORS; k++) { + uint32_t(*b_read)[4] = b; + uint32_t(*b_write)[4] = b; + uint32_t* a_1_read = a_1; + uint32_t* a_1_write = a_1; + uint32_t* a_2_read = a_2; + uint32_t* a_2_write = a_2; + if (k % 2 == 0) { + b_write += NumRegSets_b; + a_1_write += 2; + a_2_write += 4; + } else { + b_read += NumRegSets_b; + a_1_read += 2; + a_2_read += 4; + } + // data loading + if (k + 1 < WARP_K_MMA_TENSORS) { + // updating SPTR for fragment1 and fragment2 + read_SPTR_Frag1 += 2 * WARP_SIZE; + read_SPTR_Frag2 += 4 * WARP_SIZE; + CopyFromSharedToRegister_AFrag<2>(a_1_write, read_SPTR_Frag1); + CopyFromSharedToRegister_AFrag<4>(a_2_write, read_SPTR_Frag2); + B_FromSharedToReg(b_write, read_SPTR, (k + 1) * MMA_16); + } + // SIMT Dequant + Tensor Core computations + Dequant_32FP6_4Way( + a, a_1_read, a_2_read, RPTR_Scales); // Dequantizing FP6 to FP16 at register level, + // dequantizing a slice each time +#pragma unroll + for (int i = 0; i < WARP_ROW_MMA_TENSORS; i++) { + if (TilingConfig::WARP_COL_MMA_TENSORS == 1) + MMA_FP16_M16N8K16(c_uint_ptr[i], a[i], b_read[0]); + else { +#pragma unroll + for (int j = 0; j < TilingConfig::WARP_COL_MMA_TENSORS / 2; j++) { + MMA_FP16_M16N8K16(c_uint_ptr[i + j * WARP_ROW_MMA_TENSORS], a[i], b_read[j]); + MMA_FP16_M16N8K16(c_uint_ptr[i + j * WARP_ROW_MMA_TENSORS] + 4, + a[i], + b_read[j] + 2); // c+4; b+2 + } + } + } + } +} +#endif // #ifdef PIPELINE_LEVEL_SMEM + +template +__device__ __forceinline__ void StoreToSharedMemoryFromRegister( + float (*smem_CFrag)[TilingConfig::TILE_M + PADDING_SHARED_MEM_FOR_C_4], + float c[][REG_PER_THREAD_C_TENSOR_16_16]) +{ + const int lane_id = threadIdx.x % WARP_SIZE; + const int warpId = threadIdx.x / WARP_SIZE; + int warp_row_offset = warpId * (MMA_16 * WARP_ROW_MMA_TENSORS); +#pragma unroll + for (int i = 0; i < WARP_ROW_MMA_TENSORS; i++) { +#pragma unroll + for (int j = 0; j < TilingConfig::WARP_COL_MMA_TENSORS; + j++) { // Dealing with one 16*8 Tensor + int RegSetID = i + (j / 2) * WARP_ROW_MMA_TENSORS; + int RegOffset = (j % 2) * (REG_PER_THREAD_C_TENSOR_16_16 / 2); + int Tensor_row_offset = warp_row_offset + i * MMA_16; + int Tensor_col_offset = j * MMA_8; +#pragma unroll + for (int r = 0; r < REG_PER_THREAD_C_TENSOR_16_16 / 2; r++) { + int row_offset = lane_id / 4; + if (r >= 2) row_offset += 8; + int col_offset = (lane_id % 4) * 2; + if (r % 2 == 1) col_offset += 1; + smem_CFrag[Tensor_col_offset + col_offset][Tensor_row_offset + row_offset] = + c[RegSetID][r + RegOffset]; + } + } + } +} + +#endif diff --git a/deepspeed/inference/v2/kernels/core_ops/cuda_linear/include/utils_gmem.cuh 
b/deepspeed/inference/v2/kernels/core_ops/cuda_linear/include/utils_gmem.cuh new file mode 100644 index 000000000000..62b77edaa37a --- /dev/null +++ b/deepspeed/inference/v2/kernels/core_ops/cuda_linear/include/utils_gmem.cuh @@ -0,0 +1,86 @@ +// Copyright (c) Microsoft Corporation. +// SPDX-License-Identifier: Apache-2.0 + +// DeepSpeed Team + +// This is a copy of FP6-LLM kernel code: https://arxiv.org/abs/2401.14112 + +#ifndef UTILS_GMEM_CUH +#define UTILS_GMEM_CUH + +#include +#include "configs.h" +#include "ptx_cp.async.cuh" + +/* + * Copying A1/A2 from global memory to shared memory. + * Usually 1024 or 2048 Bytes + */ +template +__device__ __forceinline__ void CopyFromGlobalToShared_A(uint32_t* SPTR, + const uint4* GPTR, + bool pred_guard = true) +{ +#ifdef DEBUG_MODE + static_assert(SMEM_SIZE_IN_BYTES_PER_WARP / WARP_SIZE % 16 == 0); +#endif + int lane_id = threadIdx.x % WARP_SIZE; + half* SPTR_HALF = reinterpret_cast(SPTR); + const half* GPTR_HALF = reinterpret_cast(GPTR); + SPTR_HALF += lane_id * 8; + GPTR_HALF += lane_id * 8; +#pragma unroll + for (int i = 0; i < SMEM_SIZE_IN_BYTES_PER_WARP / WARP_SIZE / 16; i++) { + cp_async<16>(SPTR_HALF, GPTR_HALF, pred_guard); + SPTR_HALF += 256; // Forward 512 Bytes + GPTR_HALF += 256; // Forward 512 Bytes + } +} + +/* + * Copying 64 Quant Scales (FP16) from global memory to shared memory. + */ +__device__ __forceinline__ void CopyFromGlobalToShared_Scales(half* SPTR_QuantScales, + const half* GPTR_A_Scales) +{ + int lane_id = threadIdx.x % WARP_SIZE; + int Offset_Shared = lane_id * 2; + int Offset_Global = lane_id / 4 + (lane_id % 4) * 16; + for (int i = 0; i < 2; i++) + SPTR_QuantScales[Offset_Shared + i] = GPTR_A_Scales[Offset_Global + i * 8]; +} + +/* + * (1) Copying X rows * 64 columns of FP16 values, originally in row major + * (2) Copying 64 rows * X columns of FP16 values, originally in column major + * 16 Bytes per thread -> 512 Bytes per WARP = 4 line per WARP = 1 line per 8 Threads + */ +template +__device__ __forceinline__ void CopyFromGlobalToShared( + half __restrict__ (*SharedPTR)[WARP_K + PADDING_SHARED_MEM_FOR_B_8], + const half* GlobalPTR, + const int GlobalStride, + const int NumOfLinesLeft, // To support arbitrary N dimensions. + bool Pred = true) +{ + // static parameters: 1 Group (8 Threads) can copy 1 line (64 FP16) each time + const int NumOfThreads = BLOCK_WARPS * WARP_SIZE; + const int NumOfGroups = NumOfThreads / 8; + const int MaxIteration = (MaxNumOfLinesToCopy - 1) / NumOfGroups + 1; + // runtime variables + const int line_id = threadIdx.x / 8; + const int line_offset = (threadIdx.x % 8) * 8; + // PTR for source global memory and target shared memory + GlobalPTR += line_id * GlobalStride + line_offset; + SharedPTR += line_id; +#pragma unroll + for (int i = 0; i < MaxIteration; i++) { + bool AsyncCopyPred = (line_id + i * NumOfGroups) < NumOfLinesLeft && Pred; + cp_async<16>(&(*SharedPTR)[line_offset], GlobalPTR, AsyncCopyPred); + // + GlobalPTR += NumOfGroups * GlobalStride; + SharedPTR += NumOfGroups; + } +} + +#endif diff --git a/deepspeed/inference/v2/kernels/core_ops/cuda_linear/include/utils_paralleldequant.cuh b/deepspeed/inference/v2/kernels/core_ops/cuda_linear/include/utils_paralleldequant.cuh new file mode 100644 index 000000000000..ff13868c1347 --- /dev/null +++ b/deepspeed/inference/v2/kernels/core_ops/cuda_linear/include/utils_paralleldequant.cuh @@ -0,0 +1,127 @@ +// Copyright (c) Microsoft Corporation. 
+// SPDX-License-Identifier: Apache-2.0 + +// DeepSpeed Team + +// This is a copy of FP6-LLM kernel code: https://arxiv.org/abs/2401.14112 + +#ifndef UTILS_PARALLELDEQUANT_CUH +#define UTILS_PARALLELDEQUANT_CUH + +#include +#include +#include + +/* + * Input: R1 + * Outputs: R1, R2 + * Note: Simplified Exponent calculation is applied. + */ +__device__ __forceinline__ void FP6_FP16_Cast_4Way(u_int32_t* R1, u_int32_t* R2) +{ + *R2 = *R1 & 0x80808080; + *R1 = *R1 >> 2; + *R1 = *R1 & 0x1f1f1f1f; + *R2 = *R2 | *R1; + *R1 = *R2 & 0x9f009f00; + *R2 = *R2 & 0x009f009f; + *R2 = *R2 << 8; +} + +/* + * Input: R1 + * Outputs: R1, R2 + * Note: Simplified Exponent calculation is NOT applied. + */ +__device__ __forceinline__ void FP6_FP16_Cast_4Way_Naive(u_int32_t* R1, u_int32_t* R2) +{ + //*R2 = *R1 & 0x80808080; + *R2 = *R1 & 0xc0c0c0c0; + *R1 = *R1 >> 2; + //*R1 = *R1 & 0x1f1f1f1f; + *R1 = *R1 & 0x0f0f0f0f; + *R2 = *R2 | *R1; + // + //*R1 = *R2 & 0x9f009f00; + //*R2 = *R2 & 0x009f009f; + *R1 = *R2 & 0xcf00cf00; + if (!(*R1 & 0x40000000) && (*R1 & 0x0c000000)) *R1 = *R1 | 0x30000000; + if (!(*R1 & 0x00004000) && (*R1 & 0x00000c00)) *R1 = *R1 | 0x00003000; + *R2 = *R2 & 0x00cf00cf; + if (!(*R2 & 0x00400000) && (*R2 & 0x000c0000)) *R2 = *R2 | 0x00300000; + if (!(*R2 & 0x00000040) && (*R2 & 0x0000000c)) *R2 = *R2 | 0x00000030; + // + *R2 = *R2 << 8; + //*R1 = 0x3c003c00; + //*R2 = 0x3c003c00; +} + +__device__ __forceinline__ u_int32_t MultScale(u_int32_t PackedFP16Pair, half Scale) +{ + half* FP16_1 = reinterpret_cast(&PackedFP16Pair); + half* FP16_2 = FP16_1 + 1; + uint32_t output; + half* output_half_ptr = reinterpret_cast(&output); + output_half_ptr[0] = __hmul(__hmul(*FP16_1, __float2half(4096.0f)), Scale); + output_half_ptr[1] = __hmul(__hmul(*FP16_2, __float2half(4096.0f)), Scale); + return output; +} + +__device__ __forceinline__ void Dequant_32FP6_4Way(u_int32_t __restrict__ Reg[][4], + u_int32_t __restrict__* read_RPTR_Frag1, + u_int32_t __restrict__* read_RPTR_Frag2, + u_int32_t* Scales) +{ + u_int32_t* OutputRegs = reinterpret_cast(Reg); + u_int32_t* Frag1_PTR = read_RPTR_Frag1; + u_int32_t* Frag2_PTR = read_RPTR_Frag2; + half* Scale_RPTR = reinterpret_cast(Scales); + u_int32_t Packed_FP6 = 0; + u_int32_t tmp = 0; +// Dequantizing 32 FP6, each Loop dequantizing 4 FP6 +#pragma unroll(8) + for (int i = 0; i < 8; i++) { + // Frag1 + Packed_FP6 = (*Frag1_PTR) & 0xc0c0c0c0; + if (i % 4 == 3) + Frag1_PTR++; + else + (*Frag1_PTR) = (*Frag1_PTR) << 2; + // Frag2 + tmp = (*Frag2_PTR) & 0xf0f0f0f0; + tmp = tmp >> 2; + if (i % 2 == 1) + Frag2_PTR++; + else + (*Frag2_PTR) = (*Frag2_PTR) << 4; + // Packed_FP6 + Packed_FP6 = Packed_FP6 | tmp; + // + FP6_FP16_Cast_4Way(&Packed_FP6, &tmp); + // + *OutputRegs = MultScale(Packed_FP6, Scale_RPTR[0]); // Muliply FP16 scales + OutputRegs += 1; + *OutputRegs = MultScale(tmp, Scale_RPTR[1]); // Muliply FP16 scales + OutputRegs += 1; + // Updating offset for FP16 scales for every two iterations + if (i % 2 == 1) Scale_RPTR += 2; + } +} + +/* + * + */ +__device__ __forceinline__ void ExtractFromSharedToReg_Scales(uint32_t* Scales, + half* WARP_SPTR_Scales) +{ + int lane_id = threadIdx.x % WARP_SIZE; + uint32_t* SPTR_uint = reinterpret_cast(WARP_SPTR_Scales); + uint32_t tmpReg = SPTR_uint[lane_id]; +#pragma unroll + for (int i = 0; i < 4; i++) { + // T __shfl_sync(unsigned mask, T var, int srcLane, int width=warpSize); + Scales[i] = __shfl_sync(0xffffffff, tmpReg, i, 4); + } +} + +#endif diff --git 
a/deepspeed/inference/v2/kernels/core_ops/cuda_linear/include/weight_prepacking.h b/deepspeed/inference/v2/kernels/core_ops/cuda_linear/include/weight_prepacking.h new file mode 100644 index 000000000000..c8cc7243f341 --- /dev/null +++ b/deepspeed/inference/v2/kernels/core_ops/cuda_linear/include/weight_prepacking.h @@ -0,0 +1,204 @@ +// Copyright (c) Microsoft Corporation. +// SPDX-License-Identifier: Apache-2.0 + +// DeepSpeed Team + +// This is a copy of FP6-LLM kernel code: https://arxiv.org/abs/2401.14112 + +#include +#include +#include + +using namespace std; + +void Padding_8_FP6_To_8_Bytes(unsigned char Padded_FP6[], + unsigned char* FP6_Array) // padding 0 to the lowerest bit location +{ + Padded_FP6[0] = FP6_Array[0] & 0xfc; + Padded_FP6[1] = (FP6_Array[0] << 6) | ((FP6_Array[1] >> 2) & 0xfc); + Padded_FP6[2] = (FP6_Array[1] << 4) | ((FP6_Array[2] >> 4) & 0xfc); + Padded_FP6[3] = FP6_Array[2] << 2; + Padded_FP6[4] = FP6_Array[3] & 0xfc; + Padded_FP6[5] = (FP6_Array[3] << 6) | ((FP6_Array[4] >> 2) & 0xfc); + Padded_FP6[6] = (FP6_Array[4] << 4) | ((FP6_Array[5] >> 4) & 0xfc); + Padded_FP6[7] = FP6_Array[5] << 2; +} + +unsigned char Extract_2_Bits_From_4_PaddedFP6(unsigned char B1, + unsigned char B2, + unsigned char B3, + unsigned char B4) +{ + unsigned char out; + out = (B1 & 0xc0) | ((B2 & 0xc0) >> 2) | ((B3 & 0xc0) >> 4) | ((B4 & 0xc0) >> 6); + return out; +} + +unsigned char Extract_4_Bits_From_2_PaddedFP6( + unsigned char B1, + unsigned char + B2) // The highest two bits are already extracted by Extract_2_Bits_From_4_PaddedFP6(); +{ + unsigned char out; + out = ((B1 << 2) & 0xf0) | ((B2 >> 2) & 0x0f); + return out; +} + +// dealing with 4 1*8 blocks of FP6 +void Assign_32_FP6_To_4_Thread(vector Seg_2bit[], + vector Seg_4bit[], + unsigned char* PTR_1, + unsigned char* PTR_2, + unsigned char* PTR_3, + unsigned char* PTR_4) +{ + unsigned char Padded_8_FP8[4][8]; + Padding_8_FP6_To_8_Bytes(Padded_8_FP8[0], PTR_1); + Padding_8_FP6_To_8_Bytes(Padded_8_FP8[1], PTR_2); + Padding_8_FP6_To_8_Bytes(Padded_8_FP8[2], PTR_3); + Padding_8_FP6_To_8_Bytes(Padded_8_FP8[3], PTR_4); + // + unsigned char Seg1_Byte1_T[4]; + unsigned char Seg1_Byte2_T[4]; + unsigned char Seg2_Byte1_T[4]; + unsigned char Seg2_Byte2_T[4]; + unsigned char Seg2_Byte3_T[4]; + unsigned char Seg2_Byte4_T[4]; + for (int t = 0; t < 4; t++) { + Seg1_Byte1_T[t] = Extract_2_Bits_From_4_PaddedFP6(Padded_8_FP8[0][0 + t * 2], + Padded_8_FP8[0][1 + t * 2], + Padded_8_FP8[1][0 + t * 2], + Padded_8_FP8[1][1 + t * 2]); + Seg1_Byte2_T[t] = Extract_2_Bits_From_4_PaddedFP6(Padded_8_FP8[2][0 + t * 2], + Padded_8_FP8[2][1 + t * 2], + Padded_8_FP8[3][0 + t * 2], + Padded_8_FP8[3][1 + t * 2]); + Seg2_Byte1_T[t] = + Extract_4_Bits_From_2_PaddedFP6(Padded_8_FP8[0][0 + t * 2], Padded_8_FP8[0][1 + t * 2]); + Seg2_Byte2_T[t] = + Extract_4_Bits_From_2_PaddedFP6(Padded_8_FP8[1][0 + t * 2], Padded_8_FP8[1][1 + t * 2]); + Seg2_Byte3_T[t] = + Extract_4_Bits_From_2_PaddedFP6(Padded_8_FP8[2][0 + t * 2], Padded_8_FP8[2][1 + t * 2]); + Seg2_Byte4_T[t] = + Extract_4_Bits_From_2_PaddedFP6(Padded_8_FP8[3][0 + t * 2], Padded_8_FP8[3][1 + t * 2]); + } + // + for (int t = 0; t < 4; t++) { + Seg_2bit[t].push_back(Seg1_Byte1_T[t]); + Seg_2bit[t].push_back(Seg1_Byte2_T[t]); + Seg_4bit[t].push_back(Seg2_Byte1_T[t]); + Seg_4bit[t].push_back(Seg2_Byte2_T[t]); + Seg_4bit[t].push_back(Seg2_Byte3_T[t]); + Seg_4bit[t].push_back(Seg2_Byte4_T[t]); + } + return; +} + +void BitInterleaving_2bit(unsigned char* PTR_4Bytes) +{ + unsigned int* PTR_UINT = 
reinterpret_cast(PTR_4Bytes); + unsigned int input = *PTR_UINT; + // + // int order_2bit[16] = {1,5,9,13,3,7,11,15,2,6,10,14,4,8,12,16}; // pre-defined order for + // bit-interleaving in QuantLLM + int order_2bit[16] = { + 2, 6, 10, 14, 4, 8, 12, 16, 1, 5, 9, 13, 3, 7, 11, 15}; // pre-defined order for + // bit-interleaving in QuantLLM + unsigned int Frags_2bit[16]; // The highest 2 bits are used to store the extracted fragments. + for (int i = 0; i < 16; i++) Frags_2bit[i] = (input << 2 * (order_2bit[i] - 1)) & 0xc0000000; + // + unsigned int output = 0x00000000; + for (int i = 0; i < 16; i++) output |= (Frags_2bit[i] >> (i * 2)); + // + *PTR_UINT = output; +} + +void BitInterleaving_4bit(unsigned char* PTR_4Bytes) +{ + unsigned int* PTR_UINT = reinterpret_cast(PTR_4Bytes); + unsigned int input = *PTR_UINT; + // + // int order_4bit[8] = {1,5,3,7,2,6,4,8}; // pre-defined order for bit-interleaving in QuantLLM + int order_4bit[8] = { + 2, 6, 4, 8, 1, 5, 3, 7}; // pre-defined order for bit-interleaving in QuantLLM + unsigned int Frags_4bit[8]; // The highest4 bits are used to store the extracted fragments. + for (int i = 0; i < 8; i++) Frags_4bit[i] = (input << 4 * (order_4bit[i] - 1)) & 0xf0000000; + // + unsigned int output = 0x00000000; + for (int i = 0; i < 8; i++) output |= (Frags_4bit[i] >> (i * 4)); + // + *PTR_UINT = output; +} + +/* + * Inputs: + * (1) unsigned char Weight_6bit [M*K*6/8] + * Outputs: + * (1) unsigned char Weight_2bit [M*K*2/8] + * (2) unsigned char Weight_4bit [M*K*4/8] + * + * Assumption: Weight_6bit, Weight_2bit, Weight_4bit all stored continuously in row-major. + * 8 FP6 = 6 Bytes + * 8 FP4 = 4 Bytes + * 8 FP2 = 2 Bytes + */ +void weight_matrix_prepacking(int* FP6Weights, size_t M, size_t K) +{ + assert(M % 64 == 0); + assert(K % 64 == 0); + // + unsigned char* Weight_6bit = reinterpret_cast(FP6Weights); + unsigned char* Weight_2bit = Weight_6bit; + unsigned char* Weight_4bit = Weight_6bit + M * K * 2 / 8; + // + vector A_Segment_2bit[32]; + vector A_Segment_4bit[32]; + // + size_t BytesPerRow = K * 6 / 8; + // Pass-1: (1) 2+4 split; (2) assign weights to 32 threads. + for (size_t i = 0; i < M / 64; i++) // + { + for (size_t j = 0; j < K / 16; j++) { + for (size_t k = 0; k < 64 / 16; k++) { + size_t row = i * 64 + k * 16; + size_t col = j * 16; + unsigned char* StartPTR_1 = Weight_6bit + row * BytesPerRow + col * 6 / 8; + unsigned char* StartPTR_2 = StartPTR_1 + 8 * BytesPerRow; + unsigned char* StartPTR_3 = StartPTR_1 + 8 * 6 / 8; + unsigned char* StartPTR_4 = StartPTR_2 + 8 * 6 / 8; + // Dealing with each 16*16 blocks then... 
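+ // Each iteration hands four 1x8 blocks of FP6 values to one group of
+ // 4 threads; the 8 iterations together cover all 32 threads of a warp.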
+ for (int l = 0; l < 8; l++) + Assign_32_FP6_To_4_Thread(&A_Segment_2bit[l * 4], + &A_Segment_4bit[l * 4], + StartPTR_1 + l * BytesPerRow, + StartPTR_2 + l * BytesPerRow, + StartPTR_3 + l * BytesPerRow, + StartPTR_4 + l * BytesPerRow); + } + } + } + // Verifying the length of 2_bit segments and 4_bit segments + size_t BytesPerThread_2bit = M * K * 2 / 8 / 32; + size_t BytesPerThread_4bit = M * K * 4 / 8 / 32; + for (int i = 0; i < 32; i++) { + assert(A_Segment_2bit[i].size() == BytesPerThread_2bit); + assert(A_Segment_4bit[i].size() == BytesPerThread_4bit); + } + // Pass-2: Optimizing coleasced global memory access + for (size_t i = 0; i < BytesPerThread_2bit / 4; i++) + for (int t = 0; t < 32; t++) + for (int b = 0; b < 4; b++) + Weight_2bit[i * 128 + t * 4 + (3 - b)] = + A_Segment_2bit[t] + [i * 4 + b]; // why (3-b): special byte order within a register + for (size_t i = 0; i < BytesPerThread_4bit / 4; i++) + for (int t = 0; t < 32; t++) + for (int b = 0; b < 4; b++) + Weight_4bit[i * 128 + t * 4 + (3 - b)] = + A_Segment_4bit[t][i * 4 + b]; // why (3-b):special byte order within a register + // Pass-3: Bit-level interleaving + for (size_t i = 0; i < BytesPerThread_2bit * 32 / 4; i++) + BitInterleaving_2bit(Weight_2bit + 4 * i); + for (size_t i = 0; i < BytesPerThread_4bit * 32 / 4; i++) + BitInterleaving_4bit(Weight_4bit + 4 * i); +} diff --git a/deepspeed/inference/v2/kernels/core_ops/gated_activations/gated_activation_kernels_cuda.cu b/deepspeed/inference/v2/kernels/core_ops/gated_activations/gated_activation_kernels_cuda.cu index 84a9906cf037..cfa62f94596a 100644 --- a/deepspeed/inference/v2/kernels/core_ops/gated_activations/gated_activation_kernels_cuda.cu +++ b/deepspeed/inference/v2/kernels/core_ops/gated_activations/gated_activation_kernels_cuda.cu @@ -119,7 +119,10 @@ void launch_gated_activation_impl(T* output, DISPATCH_UNROLL(5); } else if (unroll == 6) { DISPATCH_UNROLL(6); + } else if (unroll == 7) { + DISPATCH_UNROLL(7); } else { + // TODO: provide a kernel with an outer loop to handle larger columns. throw std::runtime_error( "Called with more columns than supported, please report this bug and this limit will " "be increased."); diff --git a/deepspeed/inference/v2/model_implementations/flat_model_helpers.py b/deepspeed/inference/v2/model_implementations/flat_model_helpers.py index f9da7ac5d23e..ebdb59bca920 100644 --- a/deepspeed/inference/v2/model_implementations/flat_model_helpers.py +++ b/deepspeed/inference/v2/model_implementations/flat_model_helpers.py @@ -164,7 +164,7 @@ def process_layer(layer_container: LayerContainer, l_name: str, cur_offset: int) strides=tensor.stride(), offset=cur_offset) - cur_offset += pad_to_aligned_offset(elem_size(param.dtype) * param.numel()) + cur_offset += pad_to_aligned_offset(elem_size(tensor.dtype) * tensor.numel()) layer_metadata.params[p_name] = param_metadata diff --git a/deepspeed/inference/v2/modules/heuristics.py b/deepspeed/inference/v2/modules/heuristics.py index b89e95c0d834..d176206f3c60 100644 --- a/deepspeed/inference/v2/modules/heuristics.py +++ b/deepspeed/inference/v2/modules/heuristics.py @@ -86,8 +86,15 @@ def instantiate_linear(linear_config: DSLinearConfig, engine_config: RaggedInfer A linear module implementing the given configuration. """ - # Currently, we only have one implementation, so we just return it. 
- config = ConfigBundle(name="blas_fp_linear", config=linear_config) + quantization_mode = engine_config.quantization.quantization_mode + if quantization_mode is None: + config = ConfigBundle(name="blas_fp_linear", config=linear_config) + else: + # Currently, we only support ``quantized_wf6af16_linear``. + if quantization_mode == "wf6af16": + config = ConfigBundle(name="quantized_wf6af16_linear", config=linear_config) + else: + raise ValueError(f"Unsupported quantization mode: {quantization_mode}") return DSLinearRegistry.instantiate_config(config) diff --git a/deepspeed/inference/v2/modules/implementations/linear/__init__.py b/deepspeed/inference/v2/modules/implementations/linear/__init__.py index e76aab71c4cf..0501af54c4e6 100644 --- a/deepspeed/inference/v2/modules/implementations/linear/__init__.py +++ b/deepspeed/inference/v2/modules/implementations/linear/__init__.py @@ -4,3 +4,4 @@ # DeepSpeed Team from .blas_fp_linear import BlasFPLinear +from .quantized_linear import QuantizedWf6Af16Linear, fp_quantize diff --git a/deepspeed/inference/v2/modules/implementations/linear/quantized_linear.py b/deepspeed/inference/v2/modules/implementations/linear/quantized_linear.py new file mode 100644 index 000000000000..933cf55b2391 --- /dev/null +++ b/deepspeed/inference/v2/modules/implementations/linear/quantized_linear.py @@ -0,0 +1,205 @@ +# Copyright (c) Microsoft Corporation. +# SPDX-License-Identifier: Apache-2.0 + +# DeepSpeed Team + +from typing import Any, Dict, Optional + +import torch + +from deepspeed.accelerator import get_accelerator +from deepspeed.ops.op_builder import InferenceCoreBuilder +from ....allocator import empty_from +from ....inference_utils import is_gated +from ....kernels.core_ops import ( + CUDAWf6Af16Linear, + CUDABiasActivation, + CUDAGatedActivation, +) + +from ...interfaces import DSLinearBase, DSLinearRegistry +from ...configs import DSLinearConfig +from ....inference_parameter import InferenceParameter + + +def fp_quantize(input: torch.FloatTensor, + num_bits: int = 6, + exp_bits: int = 3, + min_value: torch.FloatTensor = None, + max_value: torch.FloatTensor = None, + group_size: int = -1): + """ + Args: + inputs (`torch.FloatTensor`) + The input which needs to be quantized + num_bits (int, >=4) + Number of bits to use for quantization + exp_bits: + fp exp_bits + min_value/max_vlue (torch.FloatTensor) + Used for static activation quantization + group_size (int) N + The quantization block size, each N numbers has its own scaling + factor and off-site. -1 means use the last dim as the group_size + Returns: + quantized_fake_fp6 + The quantized weights, in fp16 format and contains fp6 value. 
+ scales + Quantization scales + """ + + try: + from qtorch.quant import float_quantize + except ImportError: + raise ImportError("Please install qtorch to use this function") + + assert (min_value is None and max_value is None) or (min_value is not None and max_value is not None) + + assert input.dtype == torch.float16 + + orig_device = input.device + input = input.to(torch.float32).to(get_accelerator().current_device()) + if num_bits == 6 and exp_bits == 3: # this is default + q_range = 28 + else: + raise NotImplementedError + + man_bits = num_bits - exp_bits - 1 + input_shape = input.shape + + if group_size == -1: + group_size = input_shape[-1] + else: + # Only support per-channel quantization + raise NotImplementedError + num_groups = input.numel() // group_size + input = input.reshape(num_groups, -1) + + if min_value is None: + max_input = torch.amax(torch.abs(input), dim=-1).view(num_groups, -1) + else: + max_input = torch.max(min_value.abs(), max_value) # .view(-1) + scales = max_input / q_range # q_range + 1 + scales[scales == 0] = 1 # avoid zero scales + scaled_input = input / scales + + quantized_fake_fp6 = float_quantize(scaled_input, exp_bits, man_bits, rounding="nearest") + + quantized_fake_fp6 = quantized_fake_fp6.reshape(input_shape).contiguous().to(torch.float16).to(orig_device) + scales = scales.to(torch.float16).to(orig_device) + # Now the dequantized value is quantized_fake_fp6 * scales + + return quantized_fake_fp6, scales + + +@DSLinearRegistry.register_module +class QuantizedWf6Af16Linear(DSLinearBase): + """ + Linear DSModule for FP6 weight-only quantization kernel, where weight is FP6 + and activation is FP16. + """ + + @staticmethod + def name(): + return 'quantized_wf6af16_linear' + + @staticmethod + def supports_config(config: DSLinearConfig) -> bool: + if config.input_dtype != config.output_dtype: + return False + + # As for fp6 data items, they are packed and stored in a set of fp16 + # tensors. E.g., 8 fp6 data items are stored in 3 fp16 tensor. + if config.input_dtype != torch.float16: + return False + + if is_gated(config.activation): + try: + _ = CUDAGatedActivation(config.out_channels, config.output_dtype, config.activation) + except ValueError: + return False + else: + try: + _ = CUDABiasActivation(config.out_channels, config.output_dtype, config.activation) + except ValueError: + return False + + return True + + def __init__(self, config: DSLinearConfig, implementation_config: Dict[str, Any]) -> None: + super().__init__(config, implementation_config) + + self._linear_impl = CUDAWf6Af16Linear() + + if is_gated(config.activation): + # In the FP6 kernel implementation, the MatMul is W * A, where W is + # the weight and A is activation. M is the output channel size. 
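+            # For gated activations the fused weight holds both the gate and the linear (up)
+            # projections, so the GEMM output is 2 * out_channels wide; the gated activation
+            # then reduces it back down to out_channels.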
+ self.out_channels = self._config.out_channels * 2 + self.in_channels = self._config.in_channels + self._is_gated = True + self._act_fn = CUDAGatedActivation(config.out_channels, config.output_dtype, config.activation) + self._double_buffer = torch.empty((config.max_tokens, config.out_channels * 2), + dtype=config.output_dtype, + device=get_accelerator().current_device()) + else: + self.out_channels = self._config.out_channels + self.in_channels = self._config.in_channels + self._is_gated = False + self._act_fn = CUDABiasActivation(config.out_channels, config.output_dtype, config.activation) + + self._output = torch.empty((config.max_tokens, config.out_channels), + dtype=config.output_dtype, + device=get_accelerator().current_device()) + + self.inf_module = InferenceCoreBuilder().load() + self.inf_module.create_handle() + self.preprocess_weight = self.inf_module.preprocess_weight + + self.quantizer = fp_quantize + + def transform_param(self, param: torch.Tensor) -> InferenceParameter: + """ + Converts param to same data type as input and output. + + Parameters: + param (torch.Tensor): Weight or bias tensor. + """ + # It expects that the quantization scales are store in the attribute `scales`. + + if param.ndim == 1: # bias, do nothing + return InferenceParameter.initialize(param) + + quantized_fake_fp6, scales = self.quantizer(param, num_bits=6, exp_bits=3) + + # This is for debugging, will delete before release. + assert (quantized_fake_fp6.dtype == torch.float16) + assert quantized_fake_fp6.shape[0] == self.out_channels + assert scales.numel() == self.out_channels + + weights_2bit, weights_4bit = self.preprocess_weight(quantized_fake_fp6) + + return InferenceParameter.initialize(weights_2bit, weights_4bit=weights_4bit, scales=scales) + + def forward(self, hidden_states: torch.Tensor, w: torch.Tensor, b: Optional[torch.Tensor] = None) -> torch.Tensor: + weights_2bit = w + weights_4bit = w.weights_4bit + scales = w.scales + output = empty_from(self._output, (hidden_states.shape[0], self._config.out_channels)) + if self._is_gated: + staging_output = empty_from(self._double_buffer, (hidden_states.shape[0], self.out_channels)) + self._linear_impl(staging_output, hidden_states, weights_2bit, weights_4bit, scales, self.out_channels, + hidden_states.shape[0], self.in_channels) + self._act_fn(output, staging_output, b) + else: + self._linear_impl(output, hidden_states, weights_2bit, weights_4bit, scales, self.out_channels, + hidden_states.shape[0], self.in_channels) + self._act_fn(output, b) + + return output + + @property + def output(self) -> torch.Tensor: + """ + Return the padded, pre-allocated output Tensor. + """ + return self._output diff --git a/op_builder/inference_core_ops.py b/op_builder/inference_core_ops.py index 8073b63ad16b..3c53774d0a50 100755 --- a/op_builder/inference_core_ops.py +++ b/op_builder/inference_core_ops.py @@ -57,6 +57,8 @@ def get_prefix(self): return "deepspeed" if os.path.isdir(ds_path) else ".." def sources(self): + import torch + sources = [ "inference/v2/kernels/core_ops/core_ops.cpp", "inference/v2/kernels/core_ops/bias_activations/bias_activation.cpp", @@ -69,6 +71,15 @@ def sources(self): "inference/v2/kernels/core_ops/gated_activations/gated_activation_kernels_cuda.cu", ] + # The source files with specific GPU architecture requirements. 
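+        # The FP6 kernels are only compiled for compute capability 8.x (Ampere); on other
+        # architectures they are skipped so that the remaining core ops still build.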
+ if not self.is_rocm_pytorch() and torch.cuda.is_available(): #ignore-cuda + cuda_capability = torch.cuda.get_device_properties(0).major #ignore-cuda + if cuda_capability != 8: + self.warning("FP6 quantization kernel is only supported on Ampere architectures") + else: + sources.append("inference/v2/kernels/core_ops/cuda_linear/fp6_linear.cu") + sources.append("inference/v2/kernels/core_ops/cuda_linear/cuda_linear_kernels.cpp") + prefix = self.get_prefix() sources = [os.path.join(prefix, src) for src in sources] return sources @@ -83,6 +94,7 @@ def include_paths(self): 'inference/v2/kernels/core_ops/cuda_layer_norm', 'inference/v2/kernels/core_ops/cuda_rms_norm', 'inference/v2/kernels/core_ops/gated_activations', + 'inference/v2/kernels/core_ops/cuda_linear', 'inference/v2/kernels/includes', ] diff --git a/requirements/requirements-inf.txt b/requirements/requirements-inf.txt index 7a40ae814cbe..b7fd13787e8b 100644 --- a/requirements/requirements-inf.txt +++ b/requirements/requirements-inf.txt @@ -1,6 +1,7 @@ google lm-eval==0.3.0 protobuf +qtorch safetensors sentencepiece transformers>=4.32.1 diff --git a/tests/unit/inference/v2/modules/test_quantized_linear_module.py b/tests/unit/inference/v2/modules/test_quantized_linear_module.py new file mode 100644 index 000000000000..a7bd965072ac --- /dev/null +++ b/tests/unit/inference/v2/modules/test_quantized_linear_module.py @@ -0,0 +1,184 @@ +# Copyright (c) Microsoft Corporation. +# SPDX-License-Identifier: Apache-2.0 + +# DeepSpeed Team + +from typing import Optional + +import pytest +import torch + +from deepspeed.accelerator import get_accelerator +from deepspeed.inference.v2.inference_utils import ActivationType, DtypeEnum, is_gated +from deepspeed.inference.v2.modules import ConfigBundle +from deepspeed.inference.v2.modules.configs import DSLinearConfig +from deepspeed.inference.v2.modules.interfaces import DSLinearRegistry +from ...v2.inference_test_utils import allclose + + +def reference_implementation(hidden_states: torch.Tensor, weight: torch.Tensor, bias: Optional[torch.Tensor], + act_type: ActivationType) -> torch.Tensor: + dtype = hidden_states.dtype + out_states = torch.nn.functional.linear(hidden_states, weight, bias) + out_states.float() + + if is_gated(act_type): + act_func_map = { + ActivationType.ReGLU: torch.nn.functional.relu, + ActivationType.GEGLU: lambda x: torch.nn.functional.gelu(x, approximate="tanh"), + ActivationType.SiGLU: torch.nn.functional.silu, + } + + act_act = out_states[..., ::2] + act_linear = out_states[..., 1::2] + + act_act = act_func_map[act_type](act_act) + out_states = act_act * act_linear + else: + act_func_map = { + ActivationType.RELU: torch.nn.functional.relu, + ActivationType.GELU: torch.nn.functional.gelu, + ActivationType.SILU: torch.nn.functional.silu, + ActivationType.IDENTITY: lambda x: x, + } + + out_states = act_func_map[act_type](out_states) + return out_states.to(dtype) + + +def _fp6_quant_dequant_weights(weight: torch.Tensor) -> torch.Tensor: + from deepspeed.inference.v2.modules.implementations.linear.quantized_linear import fp_quantize + weight_quantized_fake_fp6, scales = fp_quantize(weight, num_bits=6, exp_bits=3) + return weight_quantized_fake_fp6 * scales + + +def quant_dequant_implementation(hidden_states: torch.Tensor, weight: torch.Tensor, bias: Optional[torch.Tensor], + act_type: ActivationType) -> torch.Tensor: + dtype = hidden_states.dtype + weight_dequantized = _fp6_quant_dequant_weights(weight) + out_states = torch.nn.functional.linear(hidden_states, weight_dequantized, 
bias) + out_states.float() + + if is_gated(act_type): + act_func_map = { + ActivationType.ReGLU: torch.nn.functional.relu, + ActivationType.GEGLU: lambda x: torch.nn.functional.gelu(x, approximate="tanh"), + ActivationType.SiGLU: torch.nn.functional.silu, + } + + act_act = out_states[..., ::2] + act_linear = out_states[..., 1::2] + + act_act = act_func_map[act_type](act_act) + out_states = act_act * act_linear + else: + act_func_map = { + ActivationType.RELU: torch.nn.functional.relu, + ActivationType.GELU: torch.nn.functional.gelu, + ActivationType.SILU: torch.nn.functional.silu, + ActivationType.IDENTITY: lambda x: x, + } + + out_states = act_func_map[act_type](out_states) + return out_states.to(dtype) + + +def _fp6_quantized_linear_helper(tokens: int, + in_channels: int, + out_channels: int, + dtype: DtypeEnum, + act_fn: ActivationType, + use_bias: bool = True, + expect_failure: bool = False) -> None: + # The current FP6 kernel only supports NVIDIA Ampere GPUs. + if not 'cuda' in get_accelerator().current_device_name(): + return + major, _ = torch.cuda.get_device_capability() #ignore-cuda + if major != 8: + return + + # Input vals + hidden_states = torch.randn( + (tokens, in_channels), dtype=dtype.value, device=get_accelerator().current_device_name()) * .01 + + weight_out_channels = 2 * \ + out_channels if is_gated(act_fn) else out_channels + weight = torch.randn( + (weight_out_channels, in_channels), dtype=dtype.value, device=get_accelerator().current_device_name()) * .01 + if use_bias: + bias = torch.randn( + (weight_out_channels), dtype=dtype.value, device=get_accelerator().current_device_name()) * .01 + else: + bias = None + + # quantize and dequantize output + ref_quant_dequant_output = quant_dequant_implementation(hidden_states, weight, bias, act_fn) + + linear_config = DSLinearConfig(max_tokens=2048, + in_channels=in_channels, + out_channels=out_channels, + activation=act_fn, + input_dtype=dtype, + output_dtype=dtype) + bundle = ConfigBundle(name='quantized_wf6af16_linear', config=linear_config) + fp6_linear_module = DSLinearRegistry.instantiate_config(bundle) + weight_fp6 = fp6_linear_module.transform_param(weight.clone().cpu()).to(get_accelerator().current_device_name()) + + if expect_failure: + with pytest.raises(ValueError) as excinfo: + ds_output = fp6_linear_module(hidden_states, weight_fp6, bias) + assert "The out and in channel should be multiple of 256 and 64 respectively." in str(excinfo.value) + else: + ds_output = fp6_linear_module(hidden_states, weight_fp6, bias) + # The current FP6 kernel uses FP16 Tensor Core. 
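+        # Results are therefore compared against the quantize-dequantize FP16 reference
+        # computed above, using fp16-level tolerances rather than expecting bit-exact outputs.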
+ tolerances = (3e-2, 2e-3) # tolerances for fp16 + + # Check DeepSpeed implementation + assert allclose(ds_output, ref_quant_dequant_output, tolerances=tolerances) + + +all_acts = [ + ActivationType.RELU, + ActivationType.GELU, + ActivationType.SILU, + ActivationType.GEGLU, + ActivationType.ReGLU, + ActivationType.SiGLU, +] +all_tokens = [1, 37] +all_in_out_channels = [ + (4096, 4096), + (8192, 28672), +] + + +@pytest.mark.inference_v2_ops +@pytest.mark.parametrize("tokens", all_tokens) +@pytest.mark.parametrize("in_channels, out_channels", all_in_out_channels) +@pytest.mark.parametrize("act_fn", all_acts) +@pytest.mark.parametrize("use_bias", [True, False]) +def test_fp6_quantized_linear_act_fn(tokens: int, in_channels: int, out_channels: int, act_fn: ActivationType, + use_bias: bool) -> None: + _fp6_quantized_linear_helper(tokens=tokens, + in_channels=in_channels, + out_channels=out_channels, + dtype=DtypeEnum.fp16, + act_fn=act_fn, + use_bias=use_bias) + + +# Other shapes, not supported by FP6 kernels. Will raise ValueError. +@pytest.mark.inference_v2_ops +@pytest.mark.parametrize("tokens", all_tokens) +@pytest.mark.parametrize("in_channels, out_channels", [(4608, 1728)]) +@pytest.mark.parametrize("act_fn", all_acts) +@pytest.mark.parametrize("use_bias", [True, False]) +def test_fp6_quantized_linear_act_fn_fail(tokens: int, in_channels: int, out_channels: int, act_fn: ActivationType, + use_bias: bool) -> None: + _fp6_quantized_linear_helper(tokens=tokens, + in_channels=in_channels, + out_channels=out_channels, + dtype=DtypeEnum.fp16, + act_fn=act_fn, + use_bias=use_bias, + expect_failure=True) diff --git a/tests/unit/inference/v2/ragged/test_manager_configs.py b/tests/unit/inference/v2/ragged/test_manager_configs.py index bdd513445ddb..a5f270cced8c 100644 --- a/tests/unit/inference/v2/ragged/test_manager_configs.py +++ b/tests/unit/inference/v2/ragged/test_manager_configs.py @@ -5,7 +5,7 @@ import pytest -from pydantic import ValidationError +from deepspeed.pydantic_v1 import ValidationError from deepspeed.inference.v2.ragged import DSStateManagerConfig From 0a979f8bc1eebb7720f3c7834b17e8c91d28b857 Mon Sep 17 00:00:00 2001 From: Logan Adams <114770087+loadams@users.noreply.github.com> Date: Thu, 7 Mar 2024 17:00:15 -0800 Subject: [PATCH 10/13] FP6 blog (#5235) Co-authored-by: Zhen Zheng [zhengzhen@microsoft.com](mailto:zhengzhen@microsoft.com) Co-authored-by: Xiaoxia Wu [xiaoxiawu@microsoft.com](mailto:xiaoxiawu@microsoft.com) Co-authored-by: Haojun Xia [xhjustc@mail.ustc.edu.cn](mailto:xhjustc@mail.ustc.edu.cn) Co-authored-by: Olatunji Ruwase [olruwase@microsoft.com](mailto:olruwase@microsoft.com) Co-authored-by: Leon Song [leonsong@microsoft.com](mailto:leonsong@microsoft.com) --------- Co-authored-by: xiaoxiawu-microsoft Co-authored-by: Arash Bakhtiari Co-authored-by: Xiaoxia (Shirley) Wu <94406484+xiaoxiawu-microsoft@users.noreply.github.com> Co-authored-by: ZHENG, Zhen Co-authored-by: Michael Wyatt --- README.md | 6 + blogs/deepspeed-fp6/03-05-2024/README.md | 147 ++++++++++++++++++ .../03-05-2024/assets/fp6-design.png | Bin 0 -> 445557 bytes .../03-05-2024/assets/hero-figure.png | Bin 0 -> 800588 bytes .../03-05-2024/assets/servingllm/100-1000.png | Bin 0 -> 866799 bytes .../03-05-2024/assets/servingllm/100-250.png | Bin 0 -> 868325 bytes .../03-05-2024/assets/servingllm/100-500.png | Bin 0 -> 827892 bytes 7 files changed, 153 insertions(+) create mode 100755 blogs/deepspeed-fp6/03-05-2024/README.md create mode 100644 blogs/deepspeed-fp6/03-05-2024/assets/fp6-design.png create 
mode 100644 blogs/deepspeed-fp6/03-05-2024/assets/hero-figure.png create mode 100644 blogs/deepspeed-fp6/03-05-2024/assets/servingllm/100-1000.png create mode 100644 blogs/deepspeed-fp6/03-05-2024/assets/servingllm/100-250.png create mode 100644 blogs/deepspeed-fp6/03-05-2024/assets/servingllm/100-500.png diff --git a/README.md b/README.md index 02dc3aec7f7f..a7fffac15f62 100755 --- a/README.md +++ b/README.md @@ -15,6 +15,7 @@ ## Latest News DeepSpeed empowers ChatGPT-like model training with a single click, offering 15x speedup over SOTA RLHF systems with unprecedented cost reduction at all scales; [learn how](https://github.com/microsoft/DeepSpeed/tree/master/blogs/deepspeed-chat). +* [2024/03] [DeepSpeed-FP6:The power of FP6-Centric Serving for Large Language Models](https://github.com/microsoft/DeepSpeed/tree/master/blogs/deepspeed-fp6/03-05-2024) * [2024/01] [DeepSpeed-FastGen: Introducting Mixtral, Phi-2, and Falcon support with major performance and feature enhancements.](https://github.com/microsoft/DeepSpeed/tree/master/blogs/deepspeed-fastgen/2024-01-19) * [2023/11] [Llama 2 Inference on 4th Gen Intel® Xeon® Scalable Processor with DeepSpeed](https://github.com/microsoft/DeepSpeed/tree/master/blogs/intel-inference) [[Intel version]](https://www.intel.com/content/www/us/en/developer/articles/technical/xllama-2-on-xeon-scalable-processor-with-deepspeed.html) * [2023/11] [DeepSpeed ZeRO-Offload++: 6x Higher Training Throughput via Collaborative CPU/GPU Twin-Flow](https://github.com/microsoft/DeepSpeed/tree/master/blogs/deepspeed-offloadpp) @@ -254,6 +255,11 @@ Conduct](https://opensource.microsoft.com/codeofconduct/). For more information 28. Shuaiwen Leon Song, Bonnie Kruft, Minjia Zhang, Conglong Li, Shiyang Chen, Chengming Zhang, Masahiro Tanaka, Xiaoxia Wu, Jeff Rasley, Ammar Ahmad Awan, Connor Holmes, Martin Cai, Adam Ghanem, Zhongzhu Zhou, Yuxiong He, et al. (2023) DeepSpeed4Science Initiative: Enabling Large-Scale Scientific Discovery through Sophisticated AI System Technologies [arXiv:2310.04610](https://arxiv.org/abs/2310.04610) [[blog]](https://www.microsoft.com/en-us/research/blog/announcing-the-deepspeed4science-initiative-enabling-large-scale-scientific-discovery-through-sophisticated-ai-system-technologies/) 29. Zhewei Yao, Reza Yazdani Aminabadi, Stephen Youn, Xiaoxia Wu, Elton Zheng, Yuxiong He. (2023) ZeroQuant-HERO: Hardware-Enhanced Robust Optimized Post-Training Quantization Framework for W8A8 Transformers [arXiv:2310.17723](https://arxiv.org/abs/2310.17723) +30. Xiaoxia Wu, Haojun Xia, Stephen Youn, Zhen Zheng, Shiyang Chen, Arash Bakhtiari, Michael Wyatt, Reza Yazdani Aminabadi, Yuxiong He, Olatunji Ruwase, Leon Song, Zhewei Yao (2023) ZeroQuant(4+2): Redefining LLMs Quantization with a New FP6-Centric Strategy for Diverse Generative Tasks [arXiv:2312.08583](https://arxiv.org/abs/2312.08583) + +31. Haojun Xia, Zhen Zheng, Xiaoxia Wu, Shiyang Chen, Zhewei Yao, Stephen Youn, Arash Bakhtiari, Michael Wyatt, Donglin Zhuang, Zhongzhu Zhou, Olatunji Ruwase, Yuxiong He, Shuaiwen Leon Song. (2024) FP6-LLM: Efficiently Serving Large Language Models Through FP6-Centric Algorithm-System Co-Design [arXiv:2401.14112](https://arxiv.org/abs/2401.14112) + + # Videos 1. DeepSpeed KDD 2020 Tutorial diff --git a/blogs/deepspeed-fp6/03-05-2024/README.md b/blogs/deepspeed-fp6/03-05-2024/README.md new file mode 100755 index 000000000000..dbd6b2d081aa --- /dev/null +++ b/blogs/deepspeed-fp6/03-05-2024/README.md @@ -0,0 +1,147 @@ +
+ +# DeepSpeed-FP6: The Power of FP6-Centric Serving for Large Language Models + +
+ +
+ ![DeepSpeed-FP6 hero figure](./assets/hero-figure.png)
+ + +To cite DeepSpeed-FP6, please cite the following two arxiv reports - ZeroQuant(4+2) and FP6-LLM: + +``` +@article{wu2023zeroquant, + title={Zeroquant(4+2): Redefining llms quantization with a new fp6-centric strategy for diverse generative tasks}, + author={Wu, Xiaoxia and Xia, Haojun and Youn, Stephen and Zheng, Zhen and Chen, Shiyang and Bakhtiari, Arash and Wyatt, Michael and Aminabadi, Reza Yazdani and He, Yuxiong and Ruwase, Olatunji and Song, Leon and others}, + journal={arXiv preprint arXiv:2312.08583}, + year={2023} +} + +@article{xia2024fp6, + title={FP6-LLM: Efficiently Serving Large Language Models Through FP6-Centric Algorithm-System Co-Design}, + author={Xia, Haojun and Zheng, Zhen and Wu, Xiaoxia and Chen, Shiyang and Yao, Zhewei and Youn, Stephen and Bakhtiari, Arash and Wyatt, Michael and Zhuang, Donglin and Zhou, Zhongzhu and others}, + journal={arXiv preprint arXiv:2401.14112}, + year={2024} +} +``` + + +# Table of Contents +1. [Why 6-bit Floating Point (FP6)](#introduction) +2. [System Support for FP6](#system-fp6) +3. [LLMs Serving with FP6](#serving-llm) +4. [How to Start](#how-to-start) +5. [Software Improvements](#software-improvements) +6. [Acknowledgments and Contributions](#ac) + +# 1. Why 6-bit Floating Point (FP6) + + +In the evolving landscape of Large Language Models (LLMs) like GPT, our research aims to boost computational efficiency and storage while preserving model quality. This focus brings us to tackle the complex challenges of 4-bit quantization, where optimizing performance, efficiency, and accuracy is crucial. + +**Exploring the Challenges of 4-bit Quantization** In our recent research findings -- ZeroQuant (4+2)[1], we explore the capabilities of INT4 quantization techniques (like the GPTQ algorithm) for serving Large Language Models (LLMs). While these techniques reduce memory and computational requirements, they often perform poorly on a broad array of tasks, including generative tasks such as code generation and summarization, due to overfitting issues. This highlights the urgent need for new quantization approaches that simultanenously improve both the efficiency and effectiveness of LLMs. + +**Breakthroughs with FP6 Precision** Our exploration of different quantization methods led us to the FP6 precision standard. Despite the challenges in integrating and accelerating FP6 with current AI hardware -- which we will address in the next section - this format excels in performance and flexibility across various tasks. Notably, we observe that for generative tasks, FP6 quantization can match the performance of the half-precision (FP16) format. For example, with FP6 quantization, StarCoder-15B achieves comparable code generation results to the FP16 variant, while a smaller model, such as BART-460M, achieves comparable summarization performance to the standard FP16 equivalent. In order to preserve these quality gains, while matching the system efficiency of INT4 quantization on AI hardware, we propose a novel 4+2 FP6 scheme. This innovation makes FP6 a promising direction for improving the efficiency of LLMs, marking a significant leap in AI technology advancement. For more details, please refer to our research paper - ZeroQuant (4+2)[1]. + + +# 2. System Support for FP6 + +**Pioneering Full-Stack GPU Kernel Design** A key challenge of FP6 quantization is the lack of efficient GPU kernel designs for this irregular, i.e., "non-power of 2", bit-width. 
In our recent research — FP6-LLM [2], we introduce TC-FPx, the first full-stack GPU system design scheme with unified Tensor Core support of floating point weights for FP6 and other irregular quantization bit-widths (6-bit, 5-bit, 3-bit, etc.). TC-FPx breaks the limitations of the underlying GPU hardware, allowing the GPU to support linear layer calculations on model weights of arbitrary bit width. By increasing the number of bit-width options for efficient quantization, TC-FPx significantly mitigates the "memory wall" challenges of LLM inference. In TC-FPx, Tensor Cores are utilized for intensive computation of matrix multiplications, while SIMT cores are effectively leveraged for weight dequantization, transforming the x-bit model weights to FP16 type during runtime before feeding them to Tensor Cores. It has the following key innovations: +
+ ![FP6-LLM kernel design](./assets/fp6-design.png)
+ +* *Ahead-of-time Bit-level Pre-packing*: resolve the challenge of unfriendly memory access for weights with irregular bit-width, and enable optimal GPU memory access. + +* *SIMT-Efficient GPU Runtime*: minimize the runtime overhead of weight de-quantization. + +* *The software pipeline of TC-FPx kernel*: efficiently utilize SIMT cores, Tensor Cores, and the GPU memory hierarchy for high performance. + + + +On average, the TC-FPx kernel demonstrates a 2.1-fold improvement in processing speed over the FP16 cuBLAS benchmark during memory-intensive General Matrix Multiply (GEMM) operations on NVIDIA A100 GPUs. Notably, the implementation of the FP6 kernel through FP6 quantization facilitates the operation of LLaMA-70b on a solitary A100 GPU. This remarkable feat results in a normalized inference throughput that is 1.69 to 2.65 times superior to the FP16 benchmark when conducting inference tasks with batch-size under 32. Currently, TC-FPx kernel only supports NVIDIA Ampere GPUs and is only tested and verified on A100 GPUs + + +# 3. LLMs serving with FP6 + +We have successfully integrated the FP6 quantization kernel [3] into DeepSpeed-FastGen, facilitating on-the-fly, weight-only quantization. This enhancement permits the efficient quantization and deployment of large language models (LLMs) through a unified configuration option within DeepSpeed-FastGen. Detailed information regarding this feature will be provided in due course. Through our interface, users have the flexibility to load a model checkpoint from either HuggingFace hub or a local directory. While loading the checkpoint, our system applies FP6 round-to-nearest quantization on each linear layer, and transforms the quantized weights into 6-bit prepacked tensors. These tensors will serve as the model weights for inference, while the original FP16 weights are discarded to release memory. Throughout the inference stage, the FP6 kernels leverage the 6-bit prepacked weights, ensuring a seamless experience for users engaging with our platform. + +We assessed the LLaMA-70b model's serving performance using FP6 quantization on two A100 GPUs-80G, and observed a *1.5x* reduction in inference latency and a *3.5x* increase in inference throughput compared to the FP16 baseline. FP6 quantization offers two key benefits for model inference: it enables the deployment of large language models (LLMs) on fewer GPUs — for instance, LLaMA-70b fits on a single A100-80G GPU with FP6, versus at least two GPUs required for the FP16 baseline. Additionally, it significantly accelerates linear layers in memory-bound scenarios, which are common in LLM inference. Moreover, FP6 quantization reduces the GPU memory requirements for model weights, allowing for more queries to be served simultaneously, and thus increasing serving throughput. + +Our system demonstrates exceptional efficiency in handling long generation sequences. As illustrated in Figure 1, for generation lengths surpassing the prompt length, our system exhibits a notable performance superiority. The disparity in performance between FP6 and the FP16 baseline widens with the extension of the generation sequence length. This trend is primarily attributed to the inference process becoming increasingly memory-constrained as the decoding length expands, favoring our weight-quantized GPU kernels by facilitating faster compute compared to the FP16 baseline. It is important to highlight two factors contributing to the increased memory constraints in longer decoding scenarios. 
+ - Firstly, the memory usage for the KV cache escalates with the sequence length, reducing the feasible batch sizes and leading to memory-bound GEMM operations. + - Secondly, within the context of DeepSpeed-FastGen's prefill-decoding-mixed-batch technique, scenarios involving extended token generation encounter a reduction in prefill-chunks available for mixing with decodings. This results in a higher frequency of batches dedicated solely to decodings, further intensifying the memory-bound conditions. + +
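+
+As a rough sanity check on the weight-memory savings mentioned above, the back-of-the-envelope estimate below (a sketch only: it assumes ~70 billion parameters and counts weights alone, ignoring the KV cache, activations, and any packing overhead) shows why the FP6 checkpoint fits on a single A100-80G while the FP16 one does not:
+
+```python
+# Approximate LLaMA-70b weight footprint (weights only).
+params = 70e9
+fp16_gb = params * 16 / 8 / 1e9   # ~140 GB  -> needs at least two A100-80G GPUs
+fp6_gb = params * 6 / 8 / 1e9     # ~52.5 GB -> fits on a single A100-80G GPU
+print(f"FP16: {fp16_gb:.0f} GB, FP6: {fp6_gb:.1f} GB")
+```
+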

+ ![Figure 1 panel](./assets/servingllm/100-250.png)
+ ![Figure 1 panel](./assets/servingllm/100-500.png)
+ ![Figure 1 panel](./assets/servingllm/100-1000.png)

+ + *Figure 1*: End-to-end serving performances in DeepSpeed-MII with 32 clients and total of 128 requests, for LLaMA-2-70B model on 2xA100-80g with two-way tensor parallelism. We experimented with different number of requests between 128, 256 and 512 and found that the speedup is simillar. + +Despite the significant benefits of FP6 quantization, the current implementation faces limitations. Notably, in scenarios where GEMM operations become compute-bound due to large batch sizes or sufficient GPU memory, our weight-only quantization kernel may not sustain its latency advantage, especially against optimized libraries like cuBlas. However, our system's memory efficiency remains a key benefit. Currently, support is limited to Non-Mixture of Experts (Non-MoE) structures, with efforts underway to extend support to MoE structures. Additionally, the system is compatible only with FP16 input models, as the FP6 kernel processes FP16 activations exclusively. + + + +# 4. How to begin with DeepSpeed-FP6 + +The quantization-and-inference experience of DeepSpeed-FP6 is straightforward and convenient. Here we give an example based on LLaMa-2-70B model: + +```python +import mii +pipe = mii.pipeline("NousResearch/Llama-2-70b-hf", quantization_mode='wf6af16') +response = pipe(["DeepSpeed is", "Seattle is"], max_new_tokens=128) +print(response) +``` + +You need to install the following: +``` +pip install deepspeed-mii +pip install qtorch +``` + +To benchmark with our DeepSpeed-FP6, please visit the following script: +```bash +https://github.com/microsoft/DeepSpeedExamples/blob/master/benchmarks/inference/mii/run_fp6.sh +``` + +Please also visit the [FP6-LLM github](https://github.com/usyd-fsalab/fp6_llm) for the standalone kernel of FP6. Don't forget to star the repo to show your support! + + +# 5. Software Improvements + + +Currently, DeepSpeed-FP6 supports only dense models with MoE models support upcoming. We will continue to improve DeepSpeed-FP6 with your feedback and support. DeepSpeed-FP6 is a component of the larger DeepSpeed ecosystem, which includes a range of Deep Learning systems and modeling technologies. To learn more, + +* Please visit our [website](https://www.deepspeed.ai/) for detailed blog posts, tutorials, and helpful documentation. +* Follow us on our [English X(Twitter)](https://twitter.com/MSFTDeepSpeed), [Japanese X(Twitter)](https://twitter.com/MSFTDeepSpeedJP), and [Chinese Zhihu](https://www.zhihu.com/people/deepspeed) for latest news on DeepSpeed. + +We welcome your contributions to DeepSpeed! We encourage you to report issues, contribute PRs, and join discussions on the [DeepSpeed GitHub](https://github.com/microsoft/DeepSpeed/) page. Please see our [contributing guide](https://github.com/microsoft/DeepSpeed/blob/master/CONTRIBUTING.md) for more details. We are open to collaborations with universities, research labs, companies, such as those working together on deep learning research, applying DeepSpeed to empower real-world AI models and applications, and so on. For such requests (and other requests unsuitable for GitHub), please directly email to deepspeed-info@microsoft.com. + +* "Star" our [DeepSpeed GitHub](https://github.com/microsoft/DeepSpeed/) and [DeepSpeed-MII GitHub](https://github.com/microsoft/DeepSpeed-MII/) and [DeepSpeedExamples GitHub](https://github.com/microsoft/DeepSpeedExamples/) repositories if you like our work! + + +# 6. Acknowledgments and Contributions +We thank the collaboration of the University of Sydney and Rutgers University. 
We also thank the open-source library [aspuru-guzik-group/qtorch](https://github.com/aspuru-guzik-group/qtorch). + +Contributions: +Xiaoxia Wu\* $^1$, Zhen Zheng\* $^1$, Haojun Xia\* $^2$, Arash Bakhtiari $^1$, Michael Wyatt $^1$, Shiyang Chen $^3$, Stephen Youn $^1$, Reza Yazdani Aminabadi, Yuxiong He, Olatunji Ruwase $^1$, Zhewei Yao, Leon Song $^1$ $^2$ (project lead) + +\* Equal Contribution +1: Microsoft +2: University of Sydney +3: Rutgers University + +Reference: + +[1] ZeroQuant(4+2): Redefining LLMs Quantization with a New FP6-Centric Strategy for Diverse Generative Tasks. arXiv. https://arxiv.org/abs/2312.08583 + +[2] FP6-LLM: Efficiently Serving Large Language Models Through FP6-Centric Algorithm-System Co-Design. arXiv. https://arxiv.org/abs/2401.14112 + +[3] FP6-LLM kernel release. GitHub. https://github.com/usyd-fsalab/fp6_llm diff --git a/blogs/deepspeed-fp6/03-05-2024/assets/fp6-design.png b/blogs/deepspeed-fp6/03-05-2024/assets/fp6-design.png new file mode 100644 index 0000000000000000000000000000000000000000..5024332a8f3324c677e181bd90b14ee2599cbb84 GIT binary patch literal 445557 zcmdSAWpf?D(k6Js%*>2O%*@QpEQ|TbVwOdgg%;xxlf}$vFguY@s>sSG^U3dODzYeugopqD07YI-N&^6Z6$1dE*x+IRz46Aa?EBY1yGzJx z!T)>w;lD%y0OSC9DRC|D?6Vv%Z{oq{C(#D4Qj2jMFCsjq71%ihlJ#Tm1Q~e=OvsR@ zDUOl|zT|`%iKeeMa^&db2zEXGP$6P~wV#e`urYnhLyPjDmh~U)U%g&GK9(K)qy zFj4>cMl7#(G1rHo+dU$l=$K9K|Ip_srU693|NNU@jh=}$UMl7a{O~dJUv(=l&LE=y z&kjuDK~%l{pM5&B|8L*_XOm3%8zH6sFPk>0i&_QvuPpyv$!*HsHc+e+1R0bV2v=04 zK@g%lM3T__Ux5zIG%3R;Og?yu;moG*i}u}37f!to8l7Df|1-^K@V8$8hD1gE>s49D zrR1!s&~qnmX%E$ip%Hjn&luYMR*5AB1?i1uV6jUg2^y>@^~ob8_;rqR)@u zU2fZvSe*FPOLydHX!xHcDo6048H}5>AqPE=?>|ONf=wn`8C)QWb1G@j*dSn_`#)7; z`#jE8ypgAf%k*!ARL2k2%ThhtJ_(g|+itGM`Oix%)fLJN)CRKcY9Rg4JX`)Fn<<4t zQB*9rQ;ePu85)E{$Zj{WJ)u64a*Ey!%NMK@7VbSiCuS^ow2-J@?sEd%B{(7*8>VjX-x&UUsDXC z7e*ff%zi9V1mY73NjtKvd(9uirOcP=Ub{)+Prz!5d0%=WosU%T6-wZ6#CumZGTYS{ zl$Q2-$W9b9`DEUYf>EDVZ3mgAry}&AHnIVNs9aONXBMH8rHwzQtWYA?+pMgyn@MIr zez@5LpiJYbv>o&V$<0L*I)YopDk)Wuz4CTU}W3MzBV#&h-phKQVpPzDz>^uSfR0 zRfqYhg&AslROGgSgg>I`)Z&ad0$7x>U{#-3VW5JBkeliojF*V1t%I6VO1hxb#-xP{b z!?5d}RwxZlVCzei^0m$k6MwYX^$o=E1!BDWnNBs?+i0MWF2MVLYrDZ?9IqkAJ`NA+ zGxd>Bu}L+^x(7mg%S6nDrd8CBU!_4Y2CQU_^~4gaPd?ctx&HbZov>LM^my8=``OCC zA8ZVUf}BpUw-xONjH*G8A!0Ob4{9eCiyztt%wq|&A>T6%NIyF0 zx){M}32LJdu+q8fYTy|Fd=W%sGrP0c9xn%E0Jkik9UAy=cICb2<0|L>rbsCHBxgp~ z?6D5;lu}cINE^n$C9Z#J;6b9Cb7w8czG~c`r}$HK=O-{UI|G%7Sc$NRbU1tJtVKe@ zec!cG12{d!`j$rnLLP9)QRC`zN|E8AQvr|7ekKAHd#^JPn2+)LI202J7VF7^Zogz# zLyIR)dXe=Nvdyx7eQOA9T|lGHyz>HNdVzaqZ_G+xH*x$q;Dc63$Wz6r zZz4m{H3f}L@BFvTe6Ri4T>;-8i*iyH#QW)3)s5MqYH=6r z&tyurZ!H34H5k1#jju6Orl$KB^r6N+goxjaRN-(PR2sLwO%=ML+m8<-i}Gf?nYS;0 zJP*!mRpx00q7;$RWwPR4PgkbH<~L`vEKp%MXuZc>0HjYI?>;a5^G~6iUZN}enO`(z8yP%>?5akHQ?41;ms)P>T2*k<6v*Qq7F(w z;yd^pD2W7GqHyD+Nxv?!6u9!C^_g~K*7!3U=j$2oh{~J)rVMkm+AVGVcezBUzLcB~ zu`UT`%**Ugw_md77w;gKIF}*?{gF4fU3ED7+A(@+23E?ecpGi)N|62%L=9<%(Ax!q z)joJEVRZ;CKuN%MD&-YhHAOD9`#c96oK&!{^;uSDOW0jmRp54o4RE6`SbY-`G-(U1 z;1K0TV&3`cHx@tgNU5gnhSxOia5Yvoe}GP0ZMJGd|Dhkf<^IDC&{baj1SRrpHP#OE z>OyE&L-lh?(~R%Y^2}rRx!$x8su;O~k@WC``SY*ut0^Eib@OT8evx@)=w)zD-bE?> zZT(kG@UCp(?3WxUR{YyBORn=q+-xdUe=JoxW~wd_bXIB?p$Yb&nv6B8eM>uT%soI$ z*C%tBs>e9lHW=4UavUQ{5!>vX|1+5O|%% zb)G8~|8cHYQuceO2l=roORgO+gMt8Z)J!tF(yMq%Fq@ z+@s*}%sOCM$XuRX0{r6OsF*es9d)m3%tKVOo>AOw+m2;L?fA2uPqyeV$h+DpO@;~S z$R_wG#3p2E_EE8L6jgOpt70q&0+Mp=FZ)%6=QyxLoj*w6S|uJM-(Y_WL#j8!Q0(As ztHrQX)>t|oVqCmZI8Al+Qc`!h^^X4r(RX!e`FnIi*mPn&QCjzq5kJLgfCPC99lY>d z8arLd1|IdE7(vmSqO=rx<_#;7->2uP>e}>|(H7TCWAbWp?BMlH#nJPou*C@Bbg9mm zsoE56LTinsj8A;j#I 
z%?!P^%kVXi-`nVQH~e7`97=&DQgW0BHk%2a0h{50g3pW^f?u3uZ5l%~%o~bEEG>UB zbuJZAxb3j0y>CimD+3{QsIj@>n+`1b`d^)3@|%q?=_=79)%h>Yk)5vI7RcRV!s4}4 zwxU&2cXfjJ@w@&In;{c7`z#hXY*L#B5C(5SM4I^oHkyT}9rJBA$rS#@Pg+ycli}Gb z5m`0IA8En~8g{kN41_j#;P_ZyV|#*b4dK46ZdhctAPDC(3ZO^m6stRhv$J=yKE{aG z;Qy3HraN2lhr0qTR?h+(${2x4t5^?YP3a$n&a&a*N3@0dE&I|Fiy_V2?uzebVeTOp z2w7{3{tVyE8Z_y;|D!hdd~;E>4TO793iS4q4Mg_Fp1;z zR?L7(d`CyEDe=Giz(g&j@`U{fUXd__`m$+4m0qh`coGv zQ{sI9VjRy^{VTn=h#Z-7P0TNBDqRO<4(YJJg%rWj_*pPg1WlKrsqwTEVRJ>-r|;&K5pEYdho5j}AGJ@w2mzInlvx z#e*s<4Q$&mqJ?Nc)bq`1#U!ZrrJSEn55a_$CKY`NVxG7YHGl%6^0^j4ac4Hh101Sn zLZR$0?#RrY$;xcBXQiFPpM86MmzsXh2Z zPOlC~>u5R`_@ol=ivT8;w^}(rD-(jPN&YHu=O%!Sx0-6M(1(@83W+(;T?q^8cDY)7 zw&v=8Cx8M~fol@4`|(}2YUT5zEZyrnyyM%kn+Ahe=Zyp7QS6vnEnE-3o5AYkg@LAg zfRp-NgI#7!)(acy{TG||Tz*6li0ojNlXT;TYVbB!>9^i!EY_$ z1m(IvJoZ3z%iw&^p*F0lZQPh`s6RS_h8^#JPO+U2wP@0OO@%yW*I?44U!VnB=R699 z9}z8rrVz4Iaa5~$w5{VslYLEkMXWl4-pp($t!UhH%&Q>25Q=v-wsl?hP)MW4@60lg zLkkDiH7RJ{^=tNV-mAyD8vEw0#Us?*;kJ%|t?~kBQ4qxgcps!`rEEC~E;icmiL~Lh zyvRS^*BFlYC!Ogy@Fh%l{euaAK->QtCJ?VN4u^!Q({+vCAv0MV$*B^*rHhTPXifD% z4SwrT=t8p}YhYO;dSr_zOPXxWJ!!aG#PR>(I ze}%7P75eGxW?Y#Il{-Mks;TW}r38S~(l&}ywcgVKlhr!gqv8sN1d*Mo_#?2@35kU2 zjm1YHxU#)H(nOOL%=$y9#|$U#_3zk9UfV|=bb;u&e*oruF$cr4b6sds+<*0e9#3S; z*90him5MCGtrB!cPVcDAHT_({+{A)YV}1wl4pY*R5Sa; zCRYgI3=$40(VGhW+9Km7r}dnVy??K->~4&ap5YfR#U_bx6eTfo`c*iGqQxckL=~Pl zW3DjVyI5ApVs**%8D{cpa&mjaL?V_O2$VhWfx3R8eJqi_8%Ho{#838p%%XWmCur-^ zApTG;kkZDyOE$JvqMX9%8AQlGAm@LS*{c2GE+FAXli!TMF8wW3EHCyt$r&D4rNOD_ zRIJ5f@&0oOzWFXbd;|ngqG;%He5rLOy$^6^{1hlNnLXO#pq@zG?)0l#mB^1f@Fev> zYhh_iG#k$gyJ-As*u!xk&mT*y0-x17zo#Bl{B8qWz(1Sn0$~(`Aj~MC>2-RPHLKBR zZj@uL^pLJl8wA>hZ%6gvWy9RZq18GEzG`=wb}iX*5xD(3Nzt5TYaF-`Am2b0TCDV> zz{(6!xX7fk;tq|SXMfQxc2OON^ttAY_Ns74JzrQ8exmr7+Yjx`-`<|uXwd;$GmDsJ zR~Rm}rH6lx`^`>I_V7i|j4Jh(b7(+R*-!fzc~P#kkiEZu4Gr#D*4l&EO(wM8S+l5vGlCCllcUA<+(xdXAcfkkI zKK5IfT*0W^u#$A~iba#Wlx}&e35q_^6w-$tI-hRvgpnK%ydLYy z6iqIH3SnZVqhJ{#?tLI)=6#oejf}nK_6sb@_vQBohu1g)bmXQSUzCZFDa6ojQe>^s zM3s&4@;BKHRL9A(4~;fBoE-OEH-LlNFTW9^UN<54gIrh2o8@XlbjY_ZwIu?9awWevtFs5NHdcclV<(dSruNUD!@~4IWDoK#$SM@@ zed(-i{kYkKqYwei-Ej|A%ZnxKQ5f4~F?6PEUQ8dK(Ld{6!o+|O0a_8`<^UGeDc zddkyWLom|*SgKA*r`8EJnMq)3)gcS@I;#+}BYTlt6 z+aaT2|A8RMZSmu~3xC5;=Q7P*-W2#rc!K@#fVEsba&@s_v=R6gxV^#SHWn zo>dYIp7_Yh9uQja>?K`7T<+@5tih8-{MLmBnD-}_D@qW!SO=F`H{_};%` zEFPY7-ojrau4fmtn%N9`dakjk4MsoZaPARCgzZt-d$?Tf6w_M7CO1V!3wsN!`BQt} zjwkB_eDRi&nll`Mfj0Nr1fHC-v#`q)V66X zs+HP$k($|$u+UsF*a^O%t)CEgazGBgI~oU2emqCYx??==3&P)Cm%SExcps1wdYS}( z<|Hu>IAXoT8GX}D-Bw?kOZ0cIenU1VJWVBbAb_S%AV(%_fdo4aRRtn2Ik%wxKiEMI zW`vEc_rM-O6+MG|aUU}=4FSK}Kn5(3grfK%mPZp{TdtIz!TBR=Ok`+>l4*W~MdTSb z#VVW~33mI+-KE!zzvCBcY^pn-cslf3;*?=Vy3nFGvteqQMq8dNNJ7X1!Am!AEMRXH zZ+$Aobg!m0_M|?$$UDcPfAWWO(P#S7)8Lg$qoyKj=Z#?nWB0cnp)Q=H^dtAv*GKl7 z{Zj_#VXWQ0^B?8qdpa4=7ww|A%ULiG4L+$S0*s8- zz*!(BOq+NlZ3#9f(xgQ3l&^PF5}(FJ3ri}!H2kcOh5T>m)=&HZ z>ssAEn~>9Cyd*dj?4ZGlYIb8~xK>r%>P!d$6eQn{N{W7`d?ijL`78fT6`Kyhc8} zzo*yo)p%Ihhr9U3^1+6?vI-xjB+-J{sJH6JrTIRCS6CNIw;nz8z;gI*!g9T^-MdJ3 z6V8%(=5>B!_q4F~=a6Y&TvZW)09h?i>=0T}uf`sc^v zNX%*#*2?3v2g2=^w)cU~n-$s~t7X#=}6Yt2AX4x}nL5=(R#pXj02+{B*tn$iDVC6{ud<^e;CP z0w5yv5VP@o{1{3iA71KwV~i*N53=jA@bvY#q%I}hx@D0$ZQ-li;}v~bes*H!u+SX@ zoe+XI3UtkXAx_4B7z5A78-bh_-xP3qHid4#0q$DbUN)fXL#JQX5c3#ZVVKs4=yJ>} zXZ^TwF9iRfmF-s+xJp|afW}c)aZ27v$#*n(2+~Re6Q8TjbHW81Ee7!+V6QiSK`K;j zM``ix(-}@ZkZ!!_>V5D%6Me}^J{R7+v0T&Sp^;LvG!oMChx*{sc{|1S(hm@ z*z+R2U^j4Ac(5hZ4SuEQBo|0 zgKbvPEyTUmnK2AJPi%>46%$sg+mDgIZ4gfZex=Lk)djX1q_9qJT2@QT?DH=Rs@==& zkCoUhOm1}>k@sXqQikJZ=nj?mad)SC=h{aNF76W@lM7VW8@m=ApGt)GrP4vK1*5lD 
zqpamjJ-~`D(J?-{xfbWIHjp;ENh0P}3ct}^gg$QO^Kl*>i3f6HEdlh|#6ik1Pvg_| zDa@H~=KU*l%O^Nk{u~fND@Th|1dnIG_Chph+L&lS__8!r>kG*s~mdYz$RrhVtP12}m242)1izM>bZ|(%V_jW?|zGJ7V zzcrp@kTc1pI61X-ANUC;Kat^NAZ}uittNjW^*;*=JYyo!VWWf-;)~w5)#6x^^0JP} zY^6!Bp$~n@@5V3wgT}YeY=+bvvg-s*C;aMC~vV?st zjLANhvY&hUA-<8iN00EHW-;u;@auZ)hn9~2mM?@%GB=}68b>d3`?9tNfBe};6zTia zU=?N+2nTJxTf$>dYfX{YK=n>kCm_Fhr}E`7C>)z zSZPOihyMhbC2DN>qb&wud-zw|gR zEJ(W0mwnfYOr3^zl}1MDg&HHWSHIs57_g(&gf;k{SvNy}$+d9;{XR z?z!wgGy*LgAB`9$)I>P(DvMml9@FzbaCHYTliM>#1f>YnI!>gwjWpC5MN3N{v+hZm zbu5)*8xS=}|i77u{GNkNGsP&w@C?Dez%XQvRp+swOb<5D=vD!fs_{B2&d|KS24%iGBx;_#;2cHzghwsAX)#KhwBz2PVW0)L)_YdS5hJ6zF5sHBT& z1cJj}3p9krQEj5~oJ2lgKW=O){m>U*<@^1(u<-{gcuk>T6P_&YdTW?IcD4s+2B!gI9pV4Ih#nb}e$&4s~m<0YCH)vifTt z&(1`l-f;IJv;Hi*{xg0H@VZnKS7XNXI+R@7>X&ml&#e+p!UXPzE0tAx*f|4|5FNq# zvt%|ZDqnLrUC%mN7Z{yQ=AW;0;b20#sp!-wZGP(iG>7`;;k~=rQT~AaeT#;B&0Jsp zDM+XsI#flc=boYZt(7>~4K^+Dg{JZntr_{tWGH{K9L=Cx1U#zjO(3JToCWwTo%GJzBMK_sO1%ev7g`tDG35$gr)?o2u4bC`KZ`<} z9B&iH9tLGX6$0!ES(#ts`ZC~t7KJ#Y!8ygc{)Q33cG-<)b zaQHE0>B{$OtBt;bLZ|r5qdDfTG2WASAU8 zU&bZD$l!q_NBdy-)IyEKH~gL~*Z!08Oo54_Ahg5VYo{W5?Rtzy=;Zneq0TK=K^!Zn zvWX&~^Us%bhwlLsh9l>*0CUEj9@j2akQ2Ro{sX~xyexIqljX6&bNOC3aO z^qZH;Jhw+a&<*J9`=N+PMgc?GPg|J%q$7*5cs9zs>{GQyGg#W?p}ZbPLTSw(A#fb6 z(Qyb|_rK}T66~o;(NDZ`#$KjX8k`yt>AIpRSNegNjkKh%rAm9WUN&#mbl}fNX^`J) z(6AZsj_?k{!!VwjPL#F%uV%y#X0XV2RS^`r45*gO@tVbr-_;&EyUOhOK9|Xj%+AJt z>te%mBOQmUEPBAI5i_;7WLy`-q{1c1+_~ApBDBjq*=2cLMyeiz0j-yQa9bOu#Jc9+ z;N0{h_;3+d38Cad6&tB1TC@*R3&=4MDLCd~V}pbYCV>BZ*_^bq{9&G$?3^${x?R_B zFc#M8iRv)$aaWz!<2_X4c<1e)`xusbMuE>rB9vUB*YpD*vJxRUrD;=5tayIZqK|9l z_DDa&gpF(@VYufyv7Mi)`(^!6xuXOEW_b?+&EkBwhlW9v3CZ$T(3DUIpwN6t?&T~X zmu_Ffy4;vx1F}X)%EQP6FZhG%7yp-uL7ws5_xy^tjN|ytMfy9!H-t&QxB*v;E>#N& z=jo5+u@nS~CP1Lnd93A(zPToEaNA zky=qibhz>(KDdN<98XEZ+JF9{WWnd!i;4cCeSl0vz14XGbfyKr5H_2)g)f@0U{+zY zuen)dF{gQ|@1TNX=hbVo4|9baZ(jSZuFgysEj>b4sCgyTZ?cSXkue3bU3N)W)WsoG z`sGdwcw^GeStGEjnY9MyWWy9%zRx_~@=Y z5xP=Yl1UFbbNp+S2Wf_5iR3}pr1NK)g}$A2Tm4OQvP2FSA$h2-MLZGPdj%XX!hGAcjuqlpI98Wgy#a)#CNF4u9y}l?JadDi65Lr=$c(|%NT>;!H)e^AFz(awm0%( zP#rDnlNS}~fXN+4^NGJp5Mc9z(_^Ld%z!%r5LG|~-<7UEJR|Os{nb;-9qEk|6NH9m zF5Vn=bjc=aXFmq(()F`;X}$I4rE5DZ?Z#_opdtU=@hE(3I=0yrRR!Zk)Wh@`4EWg2 zgZ|^@ay9PmtS5y+HUMrA7F%42w!ZL&{iKD|j=79jjsH{Ra4fqLp8T7W&(rv&V~bL! 
zYm4p88g$#wj0)iO>1CnqjZxTlt(3Jr5S}s%L?GP0xiv$iB;(;2*|qNeN??zqOn}U% z`WAJXuR%OR@4jHA&ubu1>YJG2hFbHI5Ui2J9h?V?41x5l4f*_Gfs}Kp-uuwv=H))6 z4r??+RSG1s-!W2B!%F*!tzkx;%?Hu^yZ5}W0u9-Y!+i4lWOFWw+dFaVS~b(Tj^m~T zYUP(7mOfv%V0s6&Qo)&D|6wknEcLT5#b(>BEyvtnP?=u?gM<3K*!r(CyEwuYKP*&wg@5+o$R&j=X(h^_4!0q`b2mkHIUwbr@zajl*gFzJ|%;W;|&DErm<=I2X zIErJ2jG~3@{^?yV)jH8Cf~2P{>>-pobQ8K+sW~#U^AKXbFmk_v6F@Hw?ZC)$!2$R` zE)&JAn(eJ#7%&lqaRVw~m`+)iS6eH33lW+70tOz%u}Sa*tw>R-u8ijUW{&y+o}NU{ zz?3&cq;LAinI)dE`dwkYzdYVn2BtQperU$~wc3yL^24I1DFTtrE(z$^!(U2Y+tIx@ z=Q4zK{qX0VnXDe$_Z)NmLUibLYQ9#`D!R_ID!xO;w|`#~&6dmi`;>fZxS#$DZl0+= z#>tly1QUN&wbR^PYEwmqi%21*C>h)diAOoJ$d+t01&zC#BCf@fo*zIy`aXrG8@TKvJG}6s){1dhNS7YVfx6x(i)Wp(ErH5DI*p zIesbGic+}XKa*ujGT_q`Us|G6|>ggD^Q4;OtIuA#9wf z!jrmV4ZCp8{mS3MDI$A@7N_H5fsUe&GfC6Z_yf(V3by5f9E9yl$Y<7XD|qJhz_(6o zXJW?uI{9gHKZgkL+c{;jJ3jPW8b&ca{!IU#rr?xKqs`ps-fS02vyrp{f}tph`0Zxj zLq8%cj*e#Kp{JgU_179Wx%cLsd6$kZcRB|AufN-fV`d5PuFr@mqqOsv&JvPTkj3q$Sw_63(09cJ>zXy`_cNMY)))n z+wAI`77QPb^sLm~fVs{Z3kLYy&C)*gw;NHbdvQ>`5GYkpzxYDM9x9GO1j=Fa&DY2L z=+Fk#C#F{mPi)W^mQJ4V5Ix~dG&^B}{9f%2-j5$S=fT&b-2YrX^YUzdHOj3*zQV9@ z-24RB{o>&rb?|E3>pnh^)D}9+1stxMTUa5)f*cVltJ+2!rRr#m^!5)7ZRWY@QiN3n zD6_y=AVu)c#T}%bD!2*2nMY*bkv>y$YDwSu#jiq49FC@zSNe&n$l2m9zfkMBxjC`V z(+Ni#4(g{f07Omre^f;K=o`CeF~3Rv>94Z=5WnbsZgOQh_0sx4SVzxntC)@M zQSn1L&j$d{myN1a0;st>()L7^RUx5eHBXQ{y1hr&A$0Cf(?D#)@QTjHU!h6sGK_1C zb(5VExgZSqD$xdaH3oRIYAE=Qead4Trt7YI2#+{z8 z_koSk`Z)m;Wj^7p=qk0piuiUw$1WQdKf6AtuGNfAr4>)<9GLkuo z%Y33yX4_t5z7Uk0DsqT#^AbHHZvUG|-X1MnYR=Oy%-?oC@L6xO!L{bAqO^169;X~+ z(D4ORfI-vqHVaF&m(EMAV@z^m5o3`!?U;t-@p8ym@?W?l%#ZQcU(>G~oZ2sMUvngo zh3tt;o|zYLNXgIGBoJv;8`(& zQL#Z^9KUfWrH!CrK3rm03mp~=tj1s5e)H;$ZD;Z;0Hi55t6Bixc z;j5CI){=GR)Ty3-SkV{m9^eq;YouT2e#!c>yYn*xO|im3v^NZl!A{Zg=q!`XmO;SC z{IBB;CCfo5yk44p5oS+8h05OAK2ubEX)yZ z_?~83X(T@|^}V_2{@ZX6O2^Wu$qnL|xW*APBs~V&f*ZeYN~m}^>)vt z8Do6-aLq4eo(e_k5kq<#_!J`QVXIy0*W6gbN!=Kz8XB!QRy~j+)$8gD=CgnU*6s$G z@0n~XRx)-iW}==P7JGi+xFd&8A5@;%x|ZqB8>n8Tzmq>(dz*urUtW1ewb&gkP(E(@ zZQD)|$*vL6pOXD#RP9R}`2VH1&#G0hH0c2kCl-t2Td- za%#|G$m9OQXS2QNFUcaV!Mm zvOVN%zs_6y@_D2%`a$K0RSgew_Si@GZS*4706$vBb?ash!{Zl0pPwch{pzIz7VdFEI=- zg{sQu?>BF?TW(VhTZP3UxE1~Ta=Y$ccLXmG_iHNAo8VY*T%nXo?2v3O&~mw}j;+~F zTaAt0t&-0%KM);Xt1&a`hALN3=Iz_=GlmPLZIvK2Tnoe!b&9ZBk(Pg0%|9=+sdd3r zu(EKeSK00;gYfTQ1F$*{;T7lZOgFa3bp>3`vz-)qrMqs&V2aIkE|`;eX9?v8xEfVY zM^@&a){{v`6|yJ&%M&IUU-mnx=&hDs;7DXqXPx98*R|D%51&BSf^2SqxtH1Qa`19r z57LdjqRMTjH!90Z;lQEN=k)k0Vzhm#ph08n$l@N>qzx?}jrK=I`6l#3u@jk@jWWg> zkgO@ZmW$Yr@9d6fr2Pg;EJMg+v*#zO-+WJ7slDK3z8Fh%JJ;6gbjt%fL4@Lo(n^ue`JSp0{jq6geWjsrEDnU3W!fsH(f;Fw zIZ~F8*~FR{$ilN_b>eJ#%x+bfpG8P*c&yIC(mvA#GdBRn7521n@ zz&t^VlZ(nagHQ0PY}@(}0X0H?vX*BXZuD1Os?gy_A-Fv{<46<`xB_4vA-h} z4>(E8r*O-34vx7#IO01tl+=yoDuCdlX+2$(Mi>%9vUgv*Ir)5E9@X!pXCtSNl>2rZ zx9o=!Ep@GlIohKwQv5Fg|E>vl15?Y5SKa2d9A@xzvGba1aw5}}W&xW~N83)K1K-V} zLm8v{DpTO?Yj@gL73h}LAG*0E6}wC7OT*c@2?C9>!}QM9lW*hf+2y=6)-$6|b_cyn z9a=$N6ul!gG)P=F6&r7$bD0Ncd=j98z;ic!t|maXuE95ILqQb~2j@#SzbLoSoZDPq z-FZf@Y5W`;BVZ>mLuC{|Eu<)V;uT|gTYu|%_|T%iFz+Hr_*tD6{D9-TVm5gCN!z09 z3#=4#SEbGMVrOG@%7B*$$_kw=K;TRIExrC{waX}BiI9_AyVmAh`!`SwHm)b<#mTBJ z(~~T^uRORIz|Nm@o-!=kC5X-Twci-woXmxHUTdb&g+5h`T(mlS)ML|V0)T(*!_!Wq zZh?n@WMFW*6NOX*A31QEo^8S`X1_B|gsrCQ7l9KuVITStS=C<{VIJG_)YhJ|%-8&} z*zLt4nv|%dn+hpKZ#~%w)SzX9W?KW*g>Cr#WK|8}qy zDwaFw%X9=ob61!K+f$V2p1S*S^q)Thg#k2Qm+S@#ong%4h$FvA7m zhL*ka!!CnZKVnFS%2O8CM*JuKyq%fWfT&N@8M`gIZp_)#6jmt|7!8D@A&Vg zkqKW7Ij;9HVweu~Mo@o()=N54SKl4SQ@9QvoVzi>F?4Pl%zP{LZ?)++l6|?%HVI{o zBc104Sxmhzro7{U^Mt_4jPuO78mPeu*Rx*}hnaFW5Bm}Qr@QDed>>9XFG9UvwNv%i z#H=5g+u|b%InArrN%_Ob0-&5(h}zeL`q?@{LF7i2Tum4coUR~uwnXm 
z)HZPYMX&@~$wRs=H&w5tIemoB{HPnoeJjJdQiF9r(lzQBpvxIs(wGy03_3l>_Ire} zv>8yv00c~1SX~jht1GB^Yis&K0L(w0w8ceaidM`{BR8@lsH58FlQ6J*mk@ri4MNb+ zcHJ&b2G+n2W7m-!3MNWhF>?>IsZ~XL5g1wj@h!x2rJ;B7$0h|sc@Cy$)#SuRi&VAQ zX*67-nu`I^?RDCHlz}O~@hDNe){-~l--mt}wwoOZk* z|I6Y|vvMw=;bu+sqHH;}f3VvD2ro^RJtfr_WTcyucf3~I-XSqmw+kcJ)`pOWdMUB7XLv7lv2O;PJ%X5Cq|wi!4cu^UYC%s=i2KEq4TS3kiicgR33 zb|Jt-ZYKq8>uf2wJ9~~?9`vrh718pNacH-fhxyU|qe%xuXW_~Jaj!v6ipX)x=Yr0B zSl#ePsSvM{W*@?o5X@@|A6btxi5X2Wi739Z5f-c4Vcc13!~yx7q?lj zcIQ~Ib;V5I4B*ChlvJPoy71j24YIxUbBaOEscaSJCAM;~@B{nf4%y^yyamfg^mw6; zYHY`l2KT3rKZwtHAuw=UO3BV9V0n}t3sJ4&fE*~sL9w!oB=H~|8QOftPp+;#JN3%B z!$b5Fzrdhy_Kfadvi@g{=O3GAKdwuDW_iG)v{*30QR?<6Ux}Ho{|){JsH4_+!f2wm zn5&fIN;!Fte);V$|I-%>nPScwDbHv*ejumwzc`wag(o=SvmE_;eThnJP zX~KUDd|H4>XR5VTkvJdp8u6iOc{VamuxsrMZz-_=QXD|)P0V`#*bScxb;yu#VeR3p zv#VPiSJJHN=DnLa?5$={Vy!jKzY-7nHTIU(FjdmR`UxT&M@Ius1`z_Jvwl-I*+vIS zHs>+Uoi|=~5F)J}BD$ev+?~66|32*o4lFZ_MDVc6wDlQ?sW0lpo3!DcYF~mQmcUtz z?ZUVwfrYj|Zf3<;%n>sk_?*5jdC(~c&n0g7WQ}YmSNp)$n5w zD2CdJzU5n^Y*w8;_4VG_6UZw_W+NCKa(>++q2Jg7`LtWr$3>|8pau;O)0+t86w3=h z^E~mdLH=#}ff132{xAj4_Anuvo^S0{Zu+>;Id&17SWAB%ZaVglry|Zr2*f^-(&>6O zj^q3M>5AzfI9GVNJbXQ>^L7vv9Y_b+1nfcbcjwnPeucv&&A3vbpytiThA56f<5 zcq+*TTh&cMMHTv?ptghONozO6D-Kfa5HS(0(A$@Lkq=iSf<0k}#nDAB4?lnQb7->) z!PFQ<$-dK-zfsYFlL}JB*Pln}xGvAL)4JgQZhJ~DV?XeSysehhm4 zhYRq!2UUlA-ijjs$`XxY0h4;FhPNiiL~4&@jooad1eeb;e(+Cwgs3r)`v#udg7%~z z(R1)HM!o7^{Xn zQ{6<@2;JoxPwF|D4&LL2xr~dH1~A@5B9o2?sS)d#4<@nmn9kb>kSI9&SDZy5lx};E z&i}F~d0Ossh3G+A6^qT|BGL(q54QFSip*!$eo}h-+{fIY=Uivux(ag`!I$Q9o#Q%i;$T!N3Pe@pr*> zC-&VQh!rC|&M|-CpGKA1e>b*aZ@VQrc6=*D4;i&jBzM(qzR*hXL2-osdqjFN%$XCS5dGfxb_Ft&} zFVfyRDvoCPAI9C?WpQ_RcP9aYyDkKW-~@Mf2?Pxe0Tw5?lK{aM*ASe=7kRn&x%Ym5 z|G($#nX|Lo(^LJanl9<;8khTf-4DkB3dz9xd)wwbY`HMrPAUNHGR90`>HDItZ`fb; zH7ylaH>;o-tz?M@TYTE@<{BZuwt8{o!Ic#O>yX>U7uaECaPRO(1X9ltt=wOkjjWa) zXtm6b9X*SeuPb8b_rh^4uu`6t2OF4YpnN;glGU6dZMZN3lrmYAEE^|>Flkv7oK6mr z41_coT60;N2%@|uoBoo|EF)PiF1s7~E=HYQ4=;YctpUCP@0eC{?jD|Ju1-&9X3l1& zX1aej)0}&V&y@5Y2!(d5O^ANBg_H5$dooO5j)^lC)Lv!et>CySLKiV>Oh^i7Me-MG z!!~b@I0njU+k`Ek-%OrK#fSxG&9v|eMD5){t5CUc_IcE!L<)XbtnB!4?P+$dkpssU zF|`*Hi5d|8ByHPl`Ok4K8~+J!eW-W-;6`8@GZc+0)aZ0p-M`68;h5fQgyzDH-!JgY zxwm6r@4p+5D64Jm39EM!q@OoCPq`sm>z_@~=hd(1Atc_+sOW+$u(oDKVlmWYZ+{d? 
zRLV?4T~W$hS1J)%?M#D|1^9OIPCqAbtrC6OceoB%GmP$4EcM$5&eKffJHtj$FA4ml zWKYugtv}=bPH8Uuz8$O(hw02WQKb}XTB6e#c~9pVTaDIAKHoC1W@kxokG9u9*ucbkIQADJqNe!~d+nlJghF3D(&KSp& ztC}7n9EZmDnH5TVBTxjl2U%*M)5klnm$Epq*Uw{0OX_>RAmh9o78j7tmP9Xzs4&M& zP~qrK#{KGup5awy`-SholreAf(3^8oU6z58nHB4Or+l6&jlzJgv*mmo{Co!0-Z)@) zWxb6lUaC8KN0*EKgXcVwB4q#~;lT@tPTeYp#X5Ebl}XlJmDZYOg0|l|to2)4`!CRu zHre~vM03mlB*TmYvIrd|jjwA3gNf*`s(OPRS7yyWL5?`TZ$C^-yYXeEax}fK#qPR> zVkeSo0#(E}zqevXNrcfLDq{=Qi34n9EfjQmp8})bsh{3pO$sMzupwy!8@{fs<@|1X zcdN9}^Oz{i(1H5nCta|7xj+9+7%6BeIK6YpcMz_Nl0VTy4`mP$d_~pxei6<P@%H z;Il_;TZ9QhzqIvncBo2(UruRrORS)G*6QPkiS-wdr$!tdI4_$-B$(nT5y{7uAr>>I zeN@@@xmUUi`fiS`5Wisk5Pe!dpw67p0axB+v-P=kGkMFogKwq!d|Xcxo7y|bSN}5^v^uiBhX}k;QVpjBt@TA{tkDr>~ zJJV=jRt6W0 zTJT@gd_j!T)t3klPnEge-#)&7ovl*mm_k2QQho@VtQn~Wg9xJcG)+`V+PdqAPlZ%DhTfx`dF~%DXf+NL_YTM|1s{h#06%R6j8NzMSF*=8#pER7 z@ispS(He>zYHoF@K;N&N<5;6uz913M#j+dlj?=SQ1H{8xnJm=affY@))s@mD&@vsN)gSr12s}YPqBY%-kNxfbpcD)L54L?Wrqifo2x>Y$9)JZL&dyMWGN!CS zoG_nHT7ymj;@+*kLu^tQ7-CmT`gwiuG96#n!bE$JXeZiZ73s}&5;`33rn5XCC`+>v z@tBjh4o&=AoS2Y&#Pgq?W2yQ%d#y|Q0XUu&o3hLlZlLdnD(!@2n!%j>4esleA|XC7vfUwpG$M`|-?`)i!bu|Z90pm0!Y+>rdeom3OXKNBqXG>=B&4lYwM*# zGd=Gt{rXgC`h=V7Dp3Zt9Gun=V=Vo17gt23TPbBtr}sCI;F}!io^QqTJkmwPHCuxc zI#xDV!FkBUmrY~3z;Z#l(2ck=qgH#45XJN4 zHHR-%UZ@(s)<2+`wmXoRK5FiALV=u@{w?aQ$QMUyudYpYNHK~QL$Jx(t_Os*ibpsK zp`)F{K@XoP2O|$!vLseVA8A8mw-FGFBUzzhtWyH+`IXC%s(c>?wzxN4i7GRnr4`!q z!8F>zljm+3H3F};PnLvzm_#*f_u(XS9);=qI@bCk-ic~)?3crxWRZAL*te?9SC2|)oRx0%+LNeT1eV-W91{bBQYh(5BxLv?+2nGjjvsi& zLXO8ZD_h>dY1>n4s=&NZY3puiE|%m3m7AD0-?^=6?z7)WfHXCk!=&+9?KQQ-na>HY z8vRmkH!ZRpKH)PSSt->;aVNrQfMMAo`0IS0!8K#J?z_Ox`WQrxqAHsycHB#%xnx>w z-7FESVkb9w>lDC$!-o}zoZ=BNVUE-!U%9+n1gm$W{)^^WOMA2LI;Gl+ju_W_g4-$mP zvZkn+prwyDoc8Ig-Da83j4kF*lEbz?1C|5yiKe>VlF*cThocw*zK_+_MS7!S_m6Cp ztZI%l6RPn=;bD|d1s_SqP3#QfDYb=dSHt~gDau)=5GG7kpnbB%1X`7-_0d%|6Yd?b z=pwBtD3B`%jzyyJWs(y=zvcMexvtn@tZCn|_9(5%M&=F=wy}|m#CPVo`G3NN*D(GP zQ`sVEP;R7_+4K34tn$@}ne$lsEp4O4xTC-O_@slp=5R9(c0j>GUFvV;YnYDG4V)|M zkZq*}Ic29nqra`%4NBxE?SV`6HV8@+vX0pnpoRAsk8KL?+iCLs({u>F<24Hm$5c&q ziSeseRr5RPuj%B1QMNEG=H0QSyykwQ)W3Yjp?|Kh)_sg7Sc~BhEMBHiuSqY-e&52f zYiO?1_5WB{dOvjPjB!mR`5C1KTs>pVa^{C^Ynoi@ZrzYoCx}FzT`+0`6=Ea}an0EG zI%$iYycWc5rwF`0vv)m#b%nEm4%A-Bx?VQX!`?%$i$|or2eQc-`QQ|C<8dWHK z{lUHa#|2nK!)JhJhI2&lDsQnNAIAi&v*qyUCaZLhuAXb~BtvWFvLR@{K+UdDuJu$Q zL8+R5KG%t<*>X5{IcL^$u)o*dY~@N|{(y75jZ#nUNm?Gpyc(-DI=)Dt z(fmu6hS$oCgdoRepThEY@8^%be&>?kPHIT3q!Vmgu*)oO&JeM8dH(dH=bA$UO$dp5 zAOb$55CXkZ_R*d0+}X_+q~Y?Q-I>k0SJ~3<#cD0B&fF!z9Cx0OZ!)#Um)yT^t-r*2 z=&1xSK2An^cp?%n1#;DEu;=*myz{}}d6}Z;i)Rg>eTx&Z(4q%I6GgWFhA)(Kl%RWa zN$slJNYrA6aR5vx7zr3vK007T#D}CViO9hgT1-BA7RIU*v%9q#F*hdlye9DYwthEt zw*1vK%R4OjHn*m*f{Vu%kE@E^{U!y-;f)1hx^tV?WRTS@gHLZ%g62s6h+OA>_a^gS z;L0(8+E>bk9u1@=2M^i=prub`syB7v?0^@Ril_~ON2!iuucSG%U(4wy_>oTU1mmT| zt8C#+g`NYy0f|ey2k0SqdM6L;ISlc_1`vO$L38NvVzmR;WFzu3?(NN-?(^ zl8QB`wb9)uuR92-G_^J6&aV;9W^is(SDz7jOW|c{wv-m<;mJX)0o7Rx82(_`?ts0> z(Me36erHiNmfMm^Z@!yjUCgLzna+rC;k!UgK#d~q) z_0!B~5=n@ZnBt#opclCpsk2lFk86FC<&jS9^A3*dhexUwsWZ*NZxTB_k)i8ap!itW zqMt{N;y=E?hx8MlQ_yd6EGz{_?<67X0V`pn-lW=MFs;em6H@f~IHvWX(?h@%%3)D{ zoMT%8!Z_&uB*jp;*y25iGeY6ZpV9+|@MW!Awza3A^XQJT=S+wl{I)q1A44YrKNjGg zWVLrei`-t0$DCUP&t-G-lzZt+hFukz@-z!elha!DKWM1=2?Xg2! z?dfRFu00;bnfIvGeJcK&`K3Jh9Pe^GVb8U$*(yP07+#M4CT&q;f~crxXTpzv)LNLI z^A~=vEteKhdhTF59FGyiT`^Sl;IlT^{joIQ?z?p*Te`FV@#++2d1`{3^Ev{Kh*Rw! 
z`0_c4OF6g2yfYE7xw@|SgS*(JPL6GAdQJOPVv+0iOo?ANP48r5&1kTZPRrWtbe`jL zU1C&qjYzKcyQ5W3=?IiVTrL6ScL*9@#vgB)PS+{uozTiM#14Du!i`-7fKgZg4m z@?XlhHx0L=^eBy=tN6)tt?c#U0?nnC6 zXaxuUh;dJUGjG>Cr06s#-xP46_9d#x!x_r{%Buc_$o=b5>uf=rxek z)xKzl41vHbLevNTb9*X>Y1ntjx5ySl_iIE39omj(b~})Ey;Yty4^yz<-Itf&lc1eK zYQ7LV)TVp>j$0^odO8F1*Dur>?T)# zOf`^B5xOPYZ@;QKY^(~f@Q;3hudggE6t8b%N}nRuUkSknO2(90eIP*f7TARML^9~< zDtbH$LW$e~|NS{=cBtv8_2bxDmixTby&pth9#x!3*>mR{X{4Ij6aToX_TljpXsFTR zVB7>cZY8_T4B7k>pP0dT)2>b85?+2p4^gUr{xiy7E5Xj0F}`z`oJ5|F&w)Bh0Rh)` z^FHh62gxa*rx6|o` zB{Pk|JX-eP5F&2=kDG~;ObjeYCUazDM%;|D-%uaih`NXVKKATU;szJS9@Lp|(xPW^ zdkh3GXoN^zNZ82YXyS+2W%E!sXL^FV6J;^p<%o5F4h}%T#R00(O+|mPsE-`{4`ju; zl(G@5>bzX;yQR^F(1+yR46Krmq;F3tzCe#yrN`>~z8>cs+u&?S2kW%%w-XLSs^J`7 zH(-~o&|jwl?&;WLRs65Vjc&TRDQYbj@~BOMXHSdasJ?V_%hR~Qhj(V2LU%mFu6Dtv zbQ?k65wMA6z!>svuGV+>GaS-YS8Jf3PsuE|_ER+~So3+xRw0c%BtcKs8TrwxRX)|# z+;<<}+LVx^r#u@rl1h>O3#e`ezdV#d2!EZzZBs_1i*;tXf9c!0zbMD2aT6q6 z*`niCFoVNxXGz~=xWsdF7>}vfpSj(QIpR*CEdumZ=NqFi!7be$)}U(?I)1-?3&Ru= zrIajF%oypWGHdqAcIlk>;Fx>g7m!?czW@=(S9iHh5aDa)3D22eTpT(6s+Lk4@QOvMAl7s9*R0(A>cHYW zQec4|*)ED-Atc*ttv|+nN!#@b@&Xgn6qh6+rNJVsd9BF8A5gUXS+j>2FaFbjGk$o* zv_!0ccha$2mYrhKt4_fjUrf9Pe2}7ep_fMCr|H3FR=LDH79j&(k0&~nK7N5~#^!0g zU=sL}QjTk~M{zTq>|Y@WbC`(VmS?s!Z8gifvAz$9Vf3my3y{&!= zr#J;D=6zj_o}Bu`dG1a&h?Tzqu;HHZ+uz8}S)Z!<>SnQU&o)n(3Af7<&M|~yFAk6( zTE-B^dP4R?^TYE)Z;IP+kFbfmvK{fjrwriw@nN~mQ7E`)iyA?LtX;rs8cLB0v< zY!Yg`g;+Ncsdm*{E?WZm-oMYmCJA2Xy(WheF`lsSK6$ohz0&O(svKrwNR(#P_^xM9 z!e1F7HY9%*U)X6MZeuuTat07NAZL@vawjA{j=W<7H`C7L;E;`xq2&*Crxf{Lp3=_| zg!GkhAWkv@P-#>0$koy(&_$zh0x_2`Ur+}}pCg*ND@sllRLyoypQt*SGJh5!>vEg9-dvAg6Mx5(m@X`f!boq zy;50~ET`g3^$qtuu`R%O8T~vZQV~~vIjlSxMFcdTvL z1a>^_DnW)&b1MWtWS!dL{vQ8mHKg8`3-0ea?`)jxxn(iW->Px_ycYEL*x3hlD-3`> zNX*%?;nSKFK{$3%U=l01($=^~=WWMMV!3(`84HBh>5^wi?1u}e0 z$Aoz%33?YJRwHX|aKdK$TCMHgyN6NaDHYOT;@{=Cb^UVdF3vPkl4nTVcv=@xUchS?l$6$~%G z>^=3%!rTV(ZQW3`=FPHXBs^%MP({H7a>fQ2xq=Hu!pULeGM@>(=dt@EEV6l8{;b{w zJm^csCdR*Hd^bCnw2(;2_ns|F^WdzmPG)mDt5MRno%Pz`LBc%y!6iq2;R1{cnw&n> z2WvGJNSjV8jO)?COCa^AotGqwU81G0i?B@(>#vB^LQB&GAU;}o=7|(iq<%Rb5G-P? 
zKH>Tk4(A5gqGWL2oW5tWsVeRvxBY7SPQ;c%xfr{|> zYcV_%&+9Z_tjjU=T|#15nr1Q2pU}Sfsm~@hy6ArSLZedcV{KRjJBaUG1@&x0#=do$ zw^$yzy7vtOU<5Xh$lZJi;}MNu?1{~P+ew7Q;KmQP4<3vp)hgfOei<;4$qYgt{I&2K zAwrqE@i^y0=k+u3{sHXOWEU->fZI`q54Q9E%P=5xMb`aDRP^?P;%m<)@_VbQa&g>7{opj{vAJX{Cfoo zbt7>{`F*S8vi^}ZUg}0K zMz0+}n5o7CTjw0qUG6KjRZykl9j6%@BonH$=1R9dQ)c~`D)hly^r<}D68gh;1>7QJ zotoDNlj|0-39ylfyik3?RU;XM`e$tKM45hs#JDIt+Alo3zF!E}&_R=nbo??wH8~-| z)#x~EvB>qv`0f@k>et_@eWX&nt#>5WE!GiWK`Nh1O8xTft>HjB*FeOVGKdq>nI{q< z@DHuTcpyq(V!#P0fjbP(r{+%^RJF$ZX%3!R1H;ZFpxJ1zzYjcFb@Kj^@#-3Rcm{k{ zYe2v4yf_+e^~J9V6tB1*&z9`ti;vbx51D*!K1@ao{avU5999bvn>5xX3LE4r{_R++ zZ#&m|EEdeLn7XBzZ;urs@%Z3R!m@# zCFzw$2#)3F^LRS-JB|R%MY`2-V_zaeVYBp_t}RbXC5!|@2q-)2Y%YSSX=$A#D+dq) z;4qf&ya(12+|QC}<$pdkI`v9_vT)^GP=G|S`BxBIX8hGE`wgh|BaG8<+fvQQ$LShp@L^|EQgjMEQTzu@lFSi$q(TmZOT&=QqmMtW03tYEU;HrU$1X*sjB*&9P*PcAP0IOZ2hpnBv(-*;Ow zLEguluFzh;-S;dA*FP4u9nD~{$|uZFykLDS$C07U{u&`JwHY6eJ}FY%QdLx8$-!%= zm#+~po{x@Qwr`E24~U3IlME({CUWn_oVQcPG9+APdw0IM!EjO5E8RX^S%HmQg#ldF z{mVjO3NzguVXRQ%PgvCo2li2Ww;hqDBjWN7Q{)^_L$a7MjKW(qy3y#MB?Bbb2i=M3 zw}7B$SkS{x&#uGjTq3NsREgZuCw6MOT}S-Jywu%iYagBqRar+ZZjUFtO$nBAmc|~# z0fi8|z5Co=XO$b*sWZ{N55ZckLFmG`e@3omZ>sHzvdm~#RX3e&rqSAAV$R}Oe#xiyqk`5c(T?3rW38{Ly*6>UvI zcF5@*t7BOK{p&+f6iOFiOG(J+&a;)Zl#63N*ws#J#a=lVc=9BPvs{?cAnuob1=rbP zu+qbQqWUzzEzgGz5{{?f(mVMmi!s=BP&NM^Lbx-k2ONcgH~wqS6X9S}>c8GT`^(*1 zsMgL(7FydJVEqAaU3DxMEuBKv`5Prh!bqEm7Vd@Uz6#td_irEUr*TgX@@nq$RLy<{ z9v*z+50<5YL-Z};`|c|`>X{NGa-`L@316)Ym&WL3& zDvx#QZBG;$K09|e80$;SFZU$PD3tvJ>#K3Fwk_0pdEQ_auoW~h&-{lGToOU5S-M3R zRX*3P$T`tD-o_O59AvFblB3|C6rS95!b!lhxYS_EzJK67H#}3{Hi+W}`znzm(=lGx z_xuB^pEoXNmU`5xn!DjZUz4}6x(#`U#a4F6P|h|^5KkTTolgixp|VG^HS9hj8v1Pw z_N?t-Uf-t;A=(`1ou{AA->=d+tN{^P+b*GshYvP(aS}z^CQ?Eql9r|Oqcx{67TDc~ zhH%|7{$E--rLikkpPNvhMWae$eL+*%jS{I&VpvA(9Xw=h+OBpL%S+^H97uzL6Vpol zNQ~g32oW5&Cp230v5sE6PHo_*6^rLU7zulRHTMs=Jf{F=JjRJSW4`?g!TnYg1^tzt z*k73{6<8Nn8Tw#z9RrB7n*luCXyTNpxsfHepQ`6wLk&q&6`Z^d+9Ux~?r)`(GBX9B zxi0tDD3wB_3PTx?QtuQ+GQ<-pwZb%3!D1mIpG9ct`*?>gKAca<#4V#dlNG=4L*C+d zPl*gJ>Mrh;+meiF6N{|*HW9T?Z+xMPvTzQ)MgLlm=c*45tK}o_$Qu4$sAMPf&BdJl zeEa>IS4eXs4je6sKz0eq`x8|jLt>n3LqumO(RXrf_3czELqk6=^2A{SbO$bRsYp

DOa?C5Bjo+)RJD`htKdE1?nK^f*GmerYu{iAlUG1Sy2Bx@$9NV(dpuDM;u z+_NK)rxD0^a+z47Z^xa)EKX|yQ?&CN29rKIN{65vXRTvjk!Y$r>(`qy?sy2wFLM6& z%-Lf8g8uAN{6hdfc``7x^j~+1EB(#^WyDz#^ltZOTVXT^^H z?xEyg%_g!bj$f=lg&zHgdz)^CPn@fp`^o^cM$VZ%1uL&T1y-XaLojk!!FdyKkr)LFzJyGic?HTi=S|2(H6mX} z;0NY26mJ3IkyGQfA8#9p5=f*l)dI+)Ko3&kc~; zP!ZYG8FC-~x1hfM3gDnG7`MrxJ?r8<30Gu!{^c}TjO%Bf(Vn;D13_T^mH;7=jt{-= zo(p!XoA-YPt)P=pNJ?n)Zyuygr99i??hDFW0K;@22}6cj(%U_UGw4~TZuD2kFe>2+ z4WA#*YHO*{n>a~m^qbSxv4Zo{G|?N`ognmMM?8M4x$(H2?HCaXaF2j`qLtF^f_Eul*`|U+LGC2N?M_In)1Ln^9O$dTRO~ zLigE*$+o1w2x-Id!_{ar{67q}_#fHdk%?kp!J$Yr2~!IBq0=*)%RV)Q|D`UjdA{T_ zi}2sW&bS32WWZ74o+sm!UOOPP0VZ&})}1Q3^eQVJ3t756+HS zabf9~9>Eczk^RWvI>rANwST_Xm2u%6IJH0=k*K5z=>G42UG4?=|H!c;7sk&Y4W8Rw;?I$V~IkRFkQh#LLxaVh|^_)iBcxwo2 zxV0)iv5M@6y397)Sysx$TaM*uztz>yLV- zJmeJzgq|O0k4SG>Z8>d`ZAI5r&CaHZlod{y9;64sFXd!o^EpgOrKrz%FKC`+ zR+PabQ}2L&v&}vw*#E;8E>(-}!t85qky0U9D0a~1x_z)em944Lqy3qO$k8a9RF@d^ zaTEE>&mJyWc^Xrit}%7Nx%ozOMMwT-Dj67t zu>&o;e|E9p8Rzd>L<2tLpk`TggOtwg#!x#r{#uxCO&wOQH;^oVfJ+P3sIBp2w zyO`y-&4S5Kd1!P#?d1aW7Zxi~hHMwyy|CZ)jjg2;epa!!CL(j@muDL#`5%hFVl14bvS6&^2K+P-9n-H`g@G>!_RIQ2PADChX}sZw4p4j- zqmQ_d<=Z?>iE9AMo3*EK74k2|8c5n0Ls&g&R3-#)Crw0A;jpUvb4b(OWC2t66~&Jf zm+HPLrb(Ys^!~30I>1E?C+E;5Kw$LoX<1~33&mS5$t1_xsGeIe^P(pC{4Tlxs08P! z+yKMs8T?x4md~c$K@LY-`@szEiFi3-$Wcp)dJDf&^YsvXCi#^FU^p3R8$+{zS;@D+a*SFgVKXV| zcjs)>e;#zPdK&GAR)4c?t8Bh7`&5>>GU3XE<|^WGMgGX{1#Ob*Q1vAz z_!o|y6ndg<1o%+_vNiT$JyP_}Afyxe1PqA5)0b~m341PUxnDG#vhPhVOx7i7AqcuU z(q=?UG-nA!hMG=@W5Yx38d7KB%_-?R|FSR%KdV>VN!N-Vr^epG4e3@(y(#?+%ETGv zn4(`XIx%d^RB=;Xrk_*rF9w(pn;Kd+UD6P1*F%Mv-*!_V$TJ@jXMtoNTn65Oil_-u z!AvxYT$AG*)m7wY1JqGF95!o{ipy@H>6hJjCM8BGTf>Ao46L6zMMi()y6~9(rSOe) z%dY{A-+sUzhR}??hn<6O`s;l+YXIvJ@7c{3$R-=n?CHsVC_U5(~>&_Z1#z)G}>ke5TsG8mEJ2OzNDEqJ(c5>~mYQ}1%% z^a(-dA`t#^$nYc(`$PM~(H`6DJNb$f)i3uE5U>_}FqqgcbxL;UvfnqwIe3`mTzE^J zeyvbJ9Hgl@GVcWnKbnqFAG%p)*~^>9%-_YX++&q){?W=kJ6bA9iF6sfOsQirEq4$P zp9($bRdZE(R1{^$n>i$h?RzeUK+8!dNaCi9eFq^nqQp9q7>eRci!|0(DHygl(zPe1 z4};ESL{YDp{@IA6A39G(^)wTBhYau4=@ar5Fc36l9y_7kAqtNv35BVLPxj%7>83^| z701}6HG1uQsnEYkj!Xd6wqly|jk`k8L$a)o1Vts@ZMz-Nsc4!wr#rJ`+0|{QGSedU z1K;=5@+&T>t;u3bDuD;Ke!SJ9@|T%T-G3FHmh+Dw*%;nlo9n*9H-1!(0W!|b&{%^J z?9v;Epu&UdC07~#v*eag`#jQc%4vd^rZ4roQ0A2#O<4{+^DCBcLK+6Y4Qd(mk<>By z4z2v(`fmpM;g1^_(ohtk85ttMO{IZ=>wj5n&0l$7QE9J8%^b=FVgS~HW-&<${Zexo z^XNKR6nGcLzTz_4lITbHDWKm5fH>@U_PGX+nFmxn&V8*IZraR9XnnuxUT3&0Fc3TU z&K9qDADyNpA70w2|nj_T|8&2qH=7+N3 z7v};G4#1xdye8KmB(1TW@rfH4@gLJ*KM_k_7HJc$9JX-{*g4{+v>+1Pfm)BG5RA(} z_a2T#QK(H^E0&eUFpYqMJ;^>Z8%{agHQ;^|xvj4he~ZQgEUD?QGuhzX2Q6CP!+H<^P6AQ1 za-`r=tq+HRY4H;KUo#&t*Ai5;3oqQvU8!U`>Hd4)$F z;1_K2v5dTip>#4IBV|N9)e7+WDq4FMlk?z&;h10s7}3vYTf>Y z+oSF*C+B|3bhapj#`-0_jH*GDW+Tpb+dn@jbWG1}RPT?ZqR+!@lFITSp)Q;B%XWqY zCMlmC0Mb4>w!x>R(>rHqGe=>A%Cq=@z#*IDom3|svCyJ}7?Va>NyAk_J9GDnQw!rh zNHf)M{P84FJxBC=kieUh;v*V|gATGBgL{S9ljtJ{=(7OsB7)w68I;j9Xvwpe5uE6S z$u&>Q;f8innTH~eyQo4)`|eLcHeBMlN`JX2>zCXFFMrN&C$AJArK{Zg(BlOUV&Z>K zTR_8!XRZlHNOK*-64Gfs?ns|L3?cn{A7tOBhuiQB9V zYNcFA;0lS4%Zty3EwV~`;EFRwMM7H0iw?D{UO#4<|5- zz?Gp}HJOuLfS2HAmZ+P$CGk+L?Nay7zv2w%s=N^Tm{>Ptt_T#iT)n+&N z+HVT9ED3x?`4yi>Gbku&=d~0@{`7^r`K1mR6THi**ht#2agu-|#T_{fe}zYBhIktTU} zGT{qTAG9T7ZA?3NQGFX`S`xNnEH3=$^%D^_W!WFJ`t=qm{siixc2RjH1>rQ2X%*ZR zX6}IGM`+;IiUEt00yB4VDfDeTL*N~4k2^niPq;GO&;=ZJ6Yn0B;O)TkSo+EhYFw}G zcoa3PQT&N#Diw!3mOeFrE5@BZ8jb;X2a!&Am-p;jTrR+L6xy~fZATldP3l@iB(xD! 
zB5P5o)tt6tX%8NMb7?4w+J`)qIWgKz&9ZGtM`@^(_)-LgXC6)tGx+dP#rQZUuS`rV zjZ@(0DeHC55<@q!yn?$%+D{ej!7L(0qYQ;`WvEB+ZnSan*tqe3HD-9I(|WdkpGpPK zejP3Fv{79xjFV-Neli@2TN|F9&C+l!eNJ)z4g8Fr0^L{CbWrTp_%L~{ayIFV;>srj zx-Qe6mXx^Q1PKS?Ef_GoyR(X07%=;L*8 zlt^yU`6gG>nCyp|(>`&@!=_=ol27}(UJBTVUmOOKo@e;$71>tkhSI00!0ucj?8N$} zy%mPm*&vx75o3y-U#%!HX79ZG|= zyMG1ESe)HOriB9;T(DCmD@5cZ+q*x(`wfW48?J6dMFcwdwC$(4tYVgX`Sgoi5#qd& znxJ zau{v@WryWF7gtPX#BK&Y$u|Mu8c6#4i!bZ*`FSJo6vE3Dej1<_-2!O_{pPGogz;bsZU zU)#15QG0N)hepIFph$DRhpCcMMxrLRk*^Puv1_j;-9F(H>=|%-dh~~x)N0M)0OC3S zP%oyNwf>u64Pq?sF5Q2Hu>$u+1|Fb6HnK0Uc-hzTLkbYwJ;eRwI$x@t86U|=tuh%syl z#+gd~gE+K{%JTPtcz9N&0-z$>WICcVS4uH>gFWpT9QiSKPCP&v$|X`S67Jt6n)hS5 zALWeLuYjJB{*#_nT7n?7J(7)9`@xSxTcccmKFT??M%HcxjD0wJC;uf|xgPEeJ@w(f za^_9~C+w-4J-H8|;{_iV8zrVMq+@MTpINU+3IA0?WrKZ z<7npc;^}6|hbzP)xC1V~y9dIZ9(+7hy7N`K)I*(kX1(~!?%7AtKEl*z(1X(E>sujq zJGdR5)H)B=RFn&iy~j14ZNthbm(1kg-pK1w!lAy;o^|*`vjLrS)_2DKSw<+LbgI9m z$3WHv=fi%d*ZSQ&$E5U1YP5#Q*)S5gzTzvE64Gz+M;A9mkq~6MGe~-B*n!B9LLd*X zOkTYDnI__yqFF=;eeR-RB(th0mCF5#$OXL3rH`mN5OItq;P_AOFA~$lTNwpMzC3F| z6S4yx&);k4ZN8=-RP5z*{#KGUNop_{Y7^K(FEHgn_zDp$*?@<;eq*)H#qn1fW+1h^bby%0EVcXqIm4AHJ^OF5 z7~M%AwQ0eo+{~*`YBAy!^Wl@L%d=_0)gxeO85NhxM+PCddi94yOz7N=>|G17)iad4 z6!amr6yvk_<~dzSER{zaAuU`mPW&^VawfmIqhB0Ez{3j9!`{WVeODcG`IRjt7?G8T<41XD+y4C~^^88FKacWpQewhb#EGL9d;}EVzZM;MH8_nEg9s zU%PCb`gp;atCGrcOKe6m8-SMQn|h38Map}Ie96)**-N3ry+>wMJ;l;TS7g-&>sG)l z=K%o9tUBRVkH#bIeemyz1pYS{AmlT%7!!>(IMpwz9iqJ1=Zs6+9gxH@vl*i|n&Mex zB~L0QSW`2F`3iMM^3dR-yNc~CD-yd?*mK~iEHWR3Ot~ZO6UG=!>e`5Fls9^e{vFyR zrkj1Fr9UyQM@x}Nz1>Ml-yC(0Kcm=H+#~Ho8)@guSfNH$Ks6i6F;NaYsJG}N^*^xO z%gqK%L=FjP80*OfE{~#={%rOOvLh*dVk@~BVc;D0!_^HJUq$1|cGUZ~)EHUG$7@p6dAI6zhI%>pR$GZ`0oYh z{G2Cm8$qf%kkeT$8bOh2X-{Z7_$HEm@4<>LixB<9EK5YF(;W!&HqV8;r+fc103aLr zN)!LWX!EO&0GB2ku;vlinc++L+P+yn!OVh;c-5^5py^V7{U`KT*uMEGx+gc3{Y?zts{<$dmD*+yiS7{lOHlXEtEP2 z8}rdi#OkP!%@ePe>C-bsG}FgT{MmR0+;(6e@da3QFZV2Jk4KrIfLgWN`!2aKxp-5Q znmJH;VkQRe?N!khqPUl4DtmnN5p%IIf&G}LbJXJoeE2$fZyP&XKM4}h0?XOo+}T1` z`ku=E*A%EA?B$$$WYX7EoJEIbS8B?4^rgLzJabw;Uab@rqk!U}QPxdhD>$0>F5`%dZt!+ame@-Aj;`@MV)l%4m5GGQpNI05DVqv@-n;%b6* zgS!lF!QCymI|PRWcL?sm-8Hxb5AN>n?(QywI}Fa9|DJPR=4tlowfE|-s!zH~dOUL* zY4kFjoNRV)ezJL&J}_InBHJ%I_)mIxMR8tu^Se&CizEJMg!A1XHgmCD*%m&V6Jhe6 zTPn89fxkWZ(&;og07?#Um03*@S=c(&l-4_VO|S>C->lsj&i)}M(F$@C2Pfk55j-(5$XHvGZ zerf%vj~8;KCTB%8X4gke>)t=Cc;=h?|9C7r!3~ixKyer70!}rtPG}CWjHKl5sByhxXOSCWH^(1;ZOzjXt zPMYI3`x-uKJ!z=VQj*b;4c!i7#>7v zjhBIEAp*;0{5Y??B;6wCEm*Hb#-!xY#}V+AnBv@^ZsJsThZ4SKP<{SWS20B$nGDCv<3b@!+po=ZId z*#^b{KTU==i|9rWy!EuyUUmD@E)9iTfX&raefYHiZIvg5{&Dl~z!`;b4C9u&cdm{a zVPT1*C$;(tr7C%`%U*2>g!IwC^@o~UrMAscE1gLIvltdBH>sFgGYSqxu3Hb1PZH|# z{Fs5#vRF4%2m;kI0nQG7KBM#y-L*i65XA}yO*}++os$6=D|?Zdd_AzI)C{Jna9alR z427=Dm~R|$L_1Yyk^FQ`>lRyQ&JFOJrX!G!b*um7lHV0Qdv;;;A(qY|)Ss_~8NP3Vwd-WhDwM6{KuKfcy1I;!gM7ohz}P}n$dN(&b=K6qseXiz?}<|(2!qMketHVB-RqiIke6uw zY(@m!%>Q^gVw7n;V)p%r9f1h$`d?}}=lQKoY^+aZWPyO-J}O@Q)GLoPRy+z2E6h;v zYTKn6-~EbkjctLThJ2Fl9J4l8kx&i-V6}|^lbJ$yj;?f;05jCzYhAVK51$L$)^p`x zyD+;wb4*16Z5yoHyFDY0_4l~ZlPN4_xnz>aIuj>!@}zqPLEi|*I?VlcLR0xRLV#*} zVERpX9Ja(iXPRE*To)sWH>mD%_$SK2uJ^d-DL9#|Wiq*74gk_+IbeLBIXuu1JRaL9P?hU!}ir6w_U{+dvOkj z7M%ASnBNgKebb8Gu;z{x!CpmBvzJ_JVbCQ0Q<0B@2_DXP_L~Z$H;-kHHzD%3%XIP` zddR&;mu)Qh`(-QEu%-m>y*2PvO(wjeK8_8DRlq_V1AI;>PuNh>D7e@N!qvXP8}K^c z^MBt23I;xvkb6;p9$3Ny&tk(9UL_;|ZVam)WS6^d57W)ICRF54!Vq(AP1J;FWY4U%%zYv$!PY{W+pEj~W$ z^1K;uddD;jtP_mI^c>10v=PQ}J!9p?3(r6MP?v6+>gkTpD%z?^n$ZmHSWdquj(v}l z^ntHUa7Uh&8#04n$G4mucai?bV2rLJ@&!`Qvo*wS?M+Yfu-0Ffp>DrSZ3MJaHZ(YK)sld?6=VG81@Q?3=;8!QbaI0hgQYCQzKr zLn3Nwm>{7?G+(Eqli!yMymsAjY@gFxX~Ux`OV6M03$sA^X+d7IWxjsi 
zXK7U@rP(LHxqlhH-gzpptLeQjG-tm)`*y*tyImSiZ&l@*dsIc8ZF4w^@!x1>qGWLG zwlRkqu{P!hS#Yuk?x-LNOz$UtKI(N{e>hv(TMxQ)8s(rB%-eXHafBKfYT;x9cX?1_ z_G@TO?L+0&4lY&mK>v)l!UU+kL992XzB9eWRQ*lO`1WBZhAPQD$Vk|;X4 z6Bl5xHe$Pi`j+&>Y+9L#*a_>%1OilNQbR$)-t59pGhP<9mc%#uK0fa@TW#3=tTjwX z%YH0{SoLUs1Awyaj}M^P?QJEU?`O(iW8GQS_s&^-5)J&!Ot+O0U*KTf&F*lJ`+GPQ zu;QGFbeGLu?iI=7;m^2jdK`(6`~OuIh7=V^5t&o=o20|~d7iV#+t`{6HCVPl;K#s9Uvm=iNpR~DWOMiF9WSV5^sPo`}I^hI=AKO!Pn|0zP{p>`e-X*xdZTq+_ zQuM;U0@VazI|NoGxuL56fEoW1*dr!nLJ-k>o<52jWLk`I{A~38<#X$#h}#{&MqomaID+o$6se z`l}PnEn;FRGAJB=z23?qWJ{RbYbl4+#eo!~jYeJtx*yaGM|RN5L^V=!b|WuAcZ(Lz zuB?-b)hvY|ojGSR!XWWs=rQzPwdVotF>c1>h1al0CuL?=hY}7PvLEq^&kRCG z-DYXgXa-rzjI>%qF6%#3Z~ME`VeeTd{c6mSIDL}3l?X)r@p~0-dr?k6&s=oegyVof zM#q=p8WgHBhL+-km*6MG;I<#0d+;OL=V<$AH43F#1T*$ zY8i38gU+FoB+VKYeeXEF2gF5wcPfZIQnDQJ?I3xQk6`4240Ps&IBboZH}b)ix8TRk zUSuaeQl&bC5{^5gcWOVLZz#O-hxWbyR$U+n52PPBQR}Jb3v;vd!aUf| z3tNjy_C0W5CMW&_H`;EOt?nbxsai7piG_3sYK?&+XyQG&=l*bA`TAVHuYbz&Dw_4K zopVl_*!*{&591pRyT-AyD&{B6OX} zekFt`YyjDJo6>b>ehEgT6S3Xp$6#8o=)50POjYyKJ1d2ttq@!M{dA3LB)RdyO zw)U65oR%0$LeG>L%a!{JYD#8@W62#YUNrVTXF8+5>O;_=ruTPV;1hMy|IRG&iRET1 z7PHlf*=@Ki>&d5cMhr&bCXOHeotYabQy@IvYzzv4`#SKQrBfod^lz&>+Zd~qdV%fw zgry97D+Kj7R2mT*Ogle#UjmGci#N=jaNg@PxZNTVtHvum#IVUwv1}2#E+& zxFn9jGi+3b+fXNUamMR38?CTgbz`W)NEDef%+?AJ1g?9a?m?h>(@~5jo7KKGVD@1! zrXt?#0keFz1jz5vWzCR@d7oz0M4U zRE99M1G?Y!d3Q-esSWZQ;^f+hoe!pYn6KF+Zb)!^zlFJ1zOn2r-uOyby1~??G9e!l z;OtR|pU9rLWLZ)fBC4F~hAq5O7#Yw9DpCBrE2MW9q~vw4L2)BToD#;~0LG&k?L$}L z_A>{UtjL+g!!(f@6@P?vBtOrG?>$8ZdIeovVCh1sP?LnYlev)k?0x$z#)_pJi5y>(`L^OTVOu1bUX*f#OF`v`ZqWkAl4_acqlOomzB z8k8|^s+rvEafy3AfRf1$CCx6rNB6q}3_2<6qorloi3~NPesb;@WQ3q|rEhSKgh>kn zEct1ae-V%&6H6Zj5Svb%lWo*1lD*4&^-{(09x|Ki3*(TflSm%z?&Eo>T@H`K6ewi} zMnBN*DSZ1%_5*Qzl$7!fb|8u(>>emg9tli77{10ZRbaO9Lc57c6WuCtL}A~CHRA@< zsd({b`Y+fz2+qj4O?Mki{}@))MFT-Td;AXl#LQe5Ft{Wkcb;eS)k2ZkT!1@J|8T~A z-qs6l@Njr@hp#;(Cl)a|fy1t=5)L@_tRhu9$OxxVL@oW~G8Mk*R~cJ$vWlJld^DS5 z=X!m(F*A)pI^##(n`|H)op;8W)8D=oz7lzvVCo_oe?Q|54zI!6v{sMJ!2j4riuX}+ z*ocNfBZJm>o#vnV{M^Fm?lw@S5V68TI~Lcxpc;X_K{m~8!;ob!qi-wcfI+2z^eeg3E|;^u4wwNA{WOTVOM=UW+9fKpkReTclLg) zaJCIslxn0d^YEtI_GFvOnT3VLzx@25llfxYA0@@}mIMR@$LHsi$#Q>`1C=tkPzteD zZBpc^`};zW()e8Hvw^eo^ZnpbuyrqzfmI=~AC*6vp`7Z>?7 z_K?v;nlMj0B8MJ`m8MC9&lioO>741CVUpN=p8LngI+wGbTem6AtX_{cMM^nS;6ONh zdpK@sWksNpV!O`yjhmwoyQm3`M&wJEQ|?>EDsu4~=Ee;+k40OmHE`A*hw^gW~sqUQ^#ipKClsoGywBe7$z$sA#l^(K#l_%N*&2QuleE(>wELuyKb zgFc{5X&oKHL>eXH#!j2{qOPuN@cYM7ra`(`XyKgvP6sE4oAFd|NUpKXShs8cQ$NKQ z#p@i5_4)>+>gxx`u}}aJDf9^V^M$@B?*gCslou8k0TB{BPdh=5-m!^)2)S<;tGz=W zA6@zQ_*y<*&ca@*fF3PX-bv zo?-qvtWKt|7fs3=`}C!QJM-4<5g>;TxHXXoxFe&Cj$@vF$Z;ky_Bekw69_xks-m>- zf$)GCjQ`kPegU-x+=6ZGd#(qJPG2x?i&vgk!ESm3aRC4v%kD~TJG$k~)I z_ise|;N3iWSJ=}Ioolwd^|ew4K2M2!tlJBqTAbLyZpQ!YV;|*b3s)GjKFc0R3kurJ zZtymtgL|mI$!Wz#Xm@49uwv3a;MeN5x%jsCvRr|_`mmiNInq9BdK)KL!_0raMZ3rJ z8*MM`T2zW2@xeJLH?(2zBq$gZAoWQsXcu^R80WUY4)cl1FfNuf{~4{>L9%1;Nj10K zxCaOpSi=?X`#wSght{<12I&M>jT;oA2@HyZTTY-6y4Re)s-h4AAI?%~^geG*s4vJ4BKQK#9wd6^g_@_^B>$W zRha#7RcMvSufCNaPz!!~m&xD{I?#I@Hk^G%|1{{Wtu~wYutXk+I#V%a)UZM(I)@6$ zJsA&(2&VN$%3E&tcfb!a&CqgSs<8Wwp9)D*8kos7NAnT%mQ5WNUHBaE>J+X>#Yp0a zClK628ytn_6$^tq+FewO>j5f{d+1QF%ftIE=!?$ASB(YBp88cgq)s1q0X~jk2&pcd zVJkXr`kwp_8DqY2#)r^UwK6wUE*H&`>v^ylqE;7i$c|{52fCK)l$_;`)cE37!1%_I ze^0v#Q`=%NK*+(uiK5@Z&DHij&Fr|Rur740x-%QDmX>o_oIQOSB~U4Ng}UJhphm-q zuo@7U>h0Bi7ZWzgzjGkIa|E|jG%XI_EjkdWDWIrh&5%|@c-_g7r;;#ASy|~}?+=bR zdxKHDpKo`qyP<^tMV-dR#u7LgzAWlA`MJ3{F>!J6cx5dy{{mII^o$HkTWppvaH-IB zn_RuagIZ)+=iBLi>E`MhngT|7wVKf>DL89uYmxK|6!Gw*rU~lwlGfG?;40<6XI>Vo z_u80snrw_fmz$ZsFDLRjLOv3jRO0xWO_H4P|5{PC38jv%mE}fa^tXRGiY20Jg(?|C 
z{BG*+&|+Yf*lFwX#%&EuRc1e-DZfFnORwp0f&rMUDSToP!O;H@}9W@x^Vm zHKj~UC=?VF8X6n1smzj8IXOAe|44;BG)6JE!7sb4c6e~CH`$uZ=1Dd*H={NFUzzA*kli*(Hl|qIHL=!LW+iLeqySKcusxh!wwA{ISRJWPU49sSH4 z&qSBPp_1f=M^m(=j?3MaaWEm}&U?V=u6jg>zGTAj-AS%gr;}#!p~Q?XnnEA>MPDz5 z8tiOLGB1zg^N!d>r2aZrp)RQ%w}GJDilg=tLtWj}@xdP)eI|z0!cZ!2pZjWht(E*Z zia+A}qAUT?mMyS~EukqGtT)dNI~ruH4`C$tSmR~0qZ?czOKxtvsV`dYtDqSJ!?Yg? ziT`kAbQ@jt&lx~|`5oT!NwQ$lAeUXzj@7kG`e|hgmv#WktnCvd8!oz{J(jAre5hY= zc}6E(oN?Q|9*I6sOhE<#B(v4G(rleQ*x32h`%^m~;hw7*t}$(2V$(TmHwSxL^^&P= zr&k0J@8vR8$lM>-siI1#5+|IcD|o9T8mk^BO8DHhDA=SSHGl0^_RYp)GU4_xRwrau zXZIBfLC1!BIs7C%WVre!nup#gN#U^{;$${KE|D>%cjH5eAq93Nl0mnUljw6OoXBqU zvxYZZ=mzsxibT^5-Ije-km|%a_MA{{%azmgz#mYP#N^7yufF2`@1$mB4dIMCpUDPJfl3G4hdJn6AC8|_jK-5Co7x;B zp*H>{H|Ce;=XO?&^3q$?ZHwSNvbi zATZs}ISDX3<|3&?J7Gt9O;<(iXKew4iIqlc+$bC-f`tCN4l?|&*vP%Q;556S>;g?w z{XK4m^sr9?^&McU6i=OFU}BJqy=O1ofqj3u(;VK{fB_RS=^!5a4^D>n@PXX1+5B}` zC@7BMkp8iOT}~(e*9gPp7#R-H!1%}dfgsnV)MGf;VY3pnN}ndO#(k2GK8vb79}&{X zb9L1O`^|VH=~b!8lC@H@k1a{%@iWbUxTF_>H_51*T9vQlcnP*S)Q9ufMIte~e&BA; z)}IBFR%DmM!rlS%)la#D9ULmzt`N`4I zRCIK833a_+*gKz=7T5hgnPVF-JV8fvw;_$^b& z9rw9k6{ZxT^{pi+dkLPOQ5KEYp<7Fjghk`Af6?fE%Fu3M0&cl ztx`Bb0%zvuak7+lNvN+(qB)639?dJ@{HLDPebVlp0k*K3I`ul}FMChxSas9*tS0L#2knOf|k2(%WGa3p#th?JOK zr9kPY*rk}u>Fo94Gcg7`^Wbr7&Lg1}voP zEoT{NI8LeVXL#vIUI%n7r;dNij&EkiVn26)A6a+A3^Bo)wg}0W`@f?wzJW%FeM4%0 z3H@AlNfsN#dx|dNoOhO_cUpqu*kF2nBt_$U z+y98^6oGx2XVpA;EMFMbSRpns0@nt*&XeCV-L^a!^>yXkmu^kRz&px$T98qvDH>b} z2@n^zwx(6;G_#uwMT_jd>i+&O0`_-_gmpiY32t!%L||6p(LVHVFa%kU8x$PKxnw1sNl6A;6V9XobsAy_PM!tBM1q; zHBLOdn$siC%}&C(W-)U)p+nEM>pE-*0X6_+p zdxq;b(o-RpxVu68?+#IjHD9DaffM!PzCxawgAo{PrZuqd__JDH zfh)soyogDlYo8Z&?SXt)@D1NVfi+`rbLhLX(I()H4tvn?Q{dG3{tUS>6{(mxAmUGr zCzSU)>=S5x=aEGPTbnO&G261_>I_~H_rwE~qA_Ld58$t;l(F7gH9r>1#>K7>^-?|kDi4X!U_G@JZ38M_O<9LtKdPihVr{WoaF7Re-2F-q zam8CNzNRebjPnw&5#DZL_jV=?kJJpkwlhDh%sSzOvaW3h<04j@9MPRB6IhzNetb~b8Z;%Sb=OJy;nF|KkHv^ zd*X+O2l3^$wl>YS-*hqi*L_GV$eY|Aw+E33asP2VOG`_+-TAY$z)WXUKc6>*%eH-H zI|q_~e({4!`cIH}zK^$LJo4-We=U~StdeNaPWP_|@BVeh=MrN_^lfVNKKS{weCUqE zHZ$kbmz7{M{A4v7OPZO+h!}9V3P)rE$2UOm7`}OCfNEr;H5FRLSsP{ID0uCdm>2-B zSPGNQ5I7hO+cs|wfytQvFdvA{W|{TeD(CUBV5Q!J#APHJfP_7@%5-#)dG& znJ&D3heeZ()cgPC+W-D6_Ou81nG>DZSwqXXII

B}ZG5%lGLQyM1!dA&clcQy%$ z>8RvQlBY~ktt?a5%flXj%2T^or#m0{MZxpfI`~b9paG`vLyc(O|GA&m8Uo-V4-P*o*a=$M^eb)oEbp()b=4UJ)NH8ISXp9K8Es99q;(3q$s)LL)_oAYDr z4osp6?P-Ih+3^%6lasl^T-gPMZX3{~WTZPh#2X6bQ2U>W=5>6MTw_F|FTgLrOM6i( z_xU}Yb74r+zk}CW7qaR3KL-Q4z3PJA#xla&?>QaHYRohcxw=NYTm?ip=U68)TXd26 zP-7o-Dg@7w!>R5rW0FY|kB+_khBiw7v}9}4u9lSU1YB%ZXf;5t6(3R%B900#gwvT)97k3NTQ9;Pa>1u()xt6k5MP0@~)pk|CE5 zYuc+0ccxA!L_<|8HX2eFX}ZzKYfmSDp$IHOqai*@#($Hifru-sn}Zk5N@0(5i}5ND z{9OGeZYKxSsAQYYiQsSa@pW&^c1caUN3W~J5yPHm}0%EzJF+xV;j02A8d|gM-5YXpR_t@KlTLn-!r1Il$A?(;b}9Mx$rB`?7ybKvA5h zrI_8EmRIK`k*CSWz!!g8sz=Pxc+#43TmiR$KW%?hHeZO171hVT&s+>UPkoF-<5-Du zZ~YU6;`D_t=EjbaMeXyrd0AO})Sc!!>ix;vHjeFgq)=^V%t?fWWIG+y%F;GCO(Q=t z3u|E2B=Dwg%{8KO#2W zuFS3Xrw@)BU1B(t|BJ4(S?P>&J8Vi+aWUBzhLwjY&tmp%zDKsmQ1uWDr1jvc-Yrn* zKJp)>%an6T^G_Xj^117+dD}Rp9e}8ScfNXqxJu6?Kl|YeU7JMSX2}xE?8QHTxu~-D zYtB%!!)#oQlbV2D-w?&nTm!dT)18Q@0eORdq0my?9!;88$??1ZtCily)#mVa?ox=0 zMHtuV2M-M%b;mulv`u@|K24!Rxx)b`cE!}?p<7tBzbpqm$+ny*SATA--gH``Dy^Bi z*|r5A@$w=|;Qka$F~MCc$_bJE`VHf#h0D>M?=V#}_-LhMLDkYnf9?0b$k6(DW6{(N z;e|K#-d=4^6(;$OOysTWazw_^x6r-S&rc1CuMD(WR=+-&BEGZ}{?e>{)@;;w7h9kA z3X-=e^F~#LM|d5nDE!SNs5e&KfJ;Ypu&cN1q`0_bF9|B@toUfJCtX#}YMgE8Rm<#r zkgbq8G#OT|**e{O42lio4SQPDgrNw<{lK6U**mx^JqP96{P|K!n^nmkuQRFX#IQ#pA@jTkhBh{Gsg9zAW%y$>Sn<`tDv$=}-E8 zKEcSZdYjF*%+s+dM*{g)e(`M`aT%EdvvgGYZLsaS!2OK)+gbO=+1VDi{JiO8`ZqAiIRhqV7eT6WUOC{n zJ3c05frsD(TXfnQRqWadQUkMl-|=EL7m2VGK>AG`E!jGUL7cUUlN9c;pwLuYtw^OX zAaD8Jv{HssUHaKV8~u-(z1SGL14AY<3RZS`4cL(C5-0N`;7hzDE9!zy8NJ*_xa!JS zhy`Rq>8a+vHkT$qwFUWSZ~KgTWP#WzTUuFms$54*0mMT}fZA@?_bqms8-C2vQvj)J zF#H@1#}Ab?2IDUgp5Yq346<}8eKccq8Y@>?7Kc>i)Dt74_fwqAZWTQuf!&$1 ze1=9+N)oGpxO{gOI4z1$&{kJr=+{>Dk^- z17CO%VAPj-3Hf-|DTghFT&6hK!0e$J1)@I-)WH-XJPYVV zp8m#u*}*Rtkm$&JK9X-?;6NMUenW(-KUH{k!d$ie^aam%D*D{@XXbCl$v=gM+AYuK z&OBiDu$=6%WaQAPLccSeV_G0lB9E_XEq^zL|8nPkUdeoKAcAW1%7d8y@$%Q)9VU<8 zu*--8tMqf|I7IDFMgDs%*J{s@C?R*Y`C=s!j+>TLmuGMSfLVKk1M(Ew66CxbhY-#1 zVE1#1bKG z8FV<%V-KRvgzz!Pen1bT$4aAK`SuR4;2QjGiqNX7JU0}Pjriq-Rd|wb{2!CC5xsD< z89uAO%~TKP-i$!0C$x+n<^{kaJpG5tsbsQjALqZ?ujw>DrB`i+EupEUD(UGQ1@~Z; zw~-YR%YYCV)h2uYdjF}QL>Ud@U|LvU;+OA17>g|1Y0H<6%ORlC%?G*5y_hb-$I&7f ziY-*6!^uF2%m>rcO<*BEigsjk5o(vQx*{j}Ia!7tjsB=JZYE19m@PY6aSYFp@zMzT z!X{s0Uo@NWSQP^&V|dP~R0io$LYh&e9}zA0LzE5qFDiO92zW;iAdSGX1;oD4PXar6 zH5d`osS$J4Z@le9JpF%RQmb2{)+OcXPXiaz!_MJV`1=M*WQ*v*t?#G_!w1d0HO$^q zYUh9T4*dP$=uQViNmPT0GPiYPVZP<#pGXlhA-ZaXt{;j4S{p)8-c}a`TPAyg;s4H# zt*-he9^-up2pA`D_94vb+pfuUO8{63_&o@-tB zZ|bN>vni+8RWTi5lkR8z)HOJoF(Ve&;AXX|GCudvRX%vK`d#Cxz;6SQ_`x+b{=oQa zW0Eex$lYwl{YsRf;C-G0CtjlAUSu1&x>d%ivxM=(Ak+%fm*yYN9{_iS!?Uc2-oR;{3!tAz0od-eKN z#+dzR_iUiYO4A?9bocDxogLCpBlHM~$gv9#;+TCbAMRzBQB!vD={Y~N$H#hBwy+#L z+}peR_YKTdxmn9OXPzFLUkRHML!$|Dg)-Dp!+-~YSY2l6HaC$yLzu(f{w79@2P`Ve$HTU>5ePz7O*!TMx(F52!Yfpl0d(bK1R_FJ!>CTIVItt*%M)(d; zv7+_u03J~`MGWUN6U#ae@BjI!&S{rJqrt%d`6Oj zHcnLe_4IE=S~)wQEJL#P(u!VE z7f>=xuJ+x?iy^Y!rU-(qwyBz9Qu#2Sy9Jkh{SMbyP5F^lioPmPM#7Y_E(glqJ?=0U z(;UD{p+YX>Wxg3FXx@Z2r*n&2fkm&cJ8Fu*7AeTvhS9w6_){p+Qd%~B*JRk)Ic^J9 zv8R=d7xPCjks4wxUd)ihkg=TZG!kzo?pY$&ii)vRZSDT9nWv$H&WCEjS8Q8k7PN5M z8Z0qNvq&cF*O{#%gjpUH11av{>)SbgvUYwyQ9AX_%slpVIfP#P?=pHgg#01S%C)Vk z5cpBF>~?+d=+pSY0UDv0$v1Q*Do&1n|0X0-tH2E+M~zurwUPT^^qlDk3v_4~pkUh~ zGAWkzk?`Nl4#jYL53{95heFg+I({1?N;)ClqWM#@PzdnNZa|it=mF{CnGw@Bc8Us! 
z$S3lRlkSI10JD)gt?zJ?0!Jv-Dc=~m7HBGEPR^QfDv!OHO=yh7vMhs{8}&WmOhE@M zltsA{4ubxwI|NU}J}#=h$Lg(U`#KQ3H(%nP1qwQ|M31-sb{lewHxwRwOmTUm#nnFI zg}Cspz1Xgb-?d`rHD7NKA=?mexvIYtMOGSxs!}2c*T_5#N6zG`jlEo8lBQ6ZcxQr5 zhw5;MJYUE}ni7SR-VJyLKA8ZST~;NiSa}1naTu-x z@CN8q;`5gy`k0vTdcmFzoGeGV4Mapdq88IxXwD z)jA!@F^-cj=Nyhc>lleKveqfotrEow+``A~JIu+_7-=0cscGnYP*t6wQkA$8nTN+c z_2G8)+)WElU-^Rgk^4zACY|~aXp)l91(DH&-&ox?U2tnPNy9CdJyd9;PUtAoV%Hj_TCtG$&6ZCSO{a{NE8{O*k@_5<}t7Wu4eg&^= z!!-`}`jnQJn}F*=n~j$Ep&=jCnACQ2 z11Sf(_p{1SVu)`ER{?jiiG#O@O2C(8VP;X@{JB9NbO=N+y0Do66(a94Vo$=PTfv3k zmVRC%>5gy?t_a)DP+~1QJvN4Vy+B;ZSTbl|whhS4;+=Zms%}K8*jd zDm#6+I+vs<=3TYf$Y|~J@;a60^l07D2FH^U%=-+N7asF^ziAq5R;AbaNy{5uBHVdGEAJ2U& zn_-Wc(I5BwTrgtej@Mv(_cf2vY1?oxSML8($eN6sSAA>fz)%Z%_tM8NC-pg7G~~u` zh2}_JqC4?K7DbJ*U~OsGBtIK{=Vxcm@hOz?V(9LC=fkTdHEpYj^+EeJ%o#t=Ia=u( zxZ#W@8a^j~I%Cgi76_>O^-3+joo+gB_nIq-^)`OQIE~`D%S9FN=!&>awjS1Kh`%H% z9RZ9{Z4ulwz3hOEj3X6yi8yI?3&`xUf8s7${r$ww=l*8Y++XP>wx&}G9!MDdl{+ln zYCzwSj9!Jipo8G8ztVlPS_~e6ozpdm%*=}nd%uT29r-7xj`(Cd6Y!=L1wSVlPb>E6 zFdZxAE7}I*<8HCZ6rar6I_rq-mdxdD#y@xjV^l)^U3TU_49q7rW@BHP%mjp*nc!kH zexuG^2tuA@trUC1*B1&3D<+e_-%X1EFiahx3ZS`o66+r4 zZa$2JPT&%yJNe!N&s^%pG<#PyhRG8eaCgdJO7g@D-a_J@wt7uQ(U}ZidjfMNW8&`M zM28-pUjM9KKW~goi~c$$TKMBKu&T*%B$K1QW5!`h+=5yC+BoybnVcOOIU)Y>^Z^{j z;Yei9(LuCZ%igc5d3-!nFX`C;FiykpOP%N0qfzKM5$aQdp6tnjp+{3W#$K`>c#0>F zDyW?;N;)>KZj&{fC6|ckh6~`O6ShPT8e)w(8)e$ls8_oM{Odoe4Z*qKLpO9c_M)b7 zmV|LT!zTV3u0r+t0|oyCiDJnfCX~D3C^~K)f8ehxYdnPz-C470OS@Z~&%v@gFLzQ6 zUfHh|^_R`qy~x6pazCXrMb@#{o&zI=DRCeF!cD@Gnl zber#4+r8LP7g6T6m6Kc|-RrlVL(xR5`~Xje$0@_-$m0$f)ys^=K-^lXW>J7HvFbV%z;EBQxjt> z$3a>~2PiN)%bwmlJvakn*jui2jJ#Ch_RID)l_}p;ZqCQVmT_Deo09NH{aIvjO`&6}qhnyv|3gj(gyyO(v%c zr5j)y)+dgjjz+CxvE8h!LLTXyDP&FPrsW%T;`l^GBKQPut7$=rH?p<24I5`s$# z_GzuIMp&i`yq?#CU8fx$cMf`~FE5@qTlho+b&Oz=ae9(_Ti12l9jaGE0I#oVcogX3 zSx<)ehI!2!%A$=B{N)+C26B|}8_xa5X2E^jzndOV;IuID)0b;~N}5Fr=#fA05fBa# z4&VA^{rlfO8&8&j{>^Yg%|2-^Lt{Q*bz$pAw|JpBUXTx3u_xEH(2hj`F+G*Yknn8k zNnd7XToIZ^2bra0dY=>){#GS7~T}q!RyQ zH;vPpC=@j3Z{`@TN4>Qqq`O^)zO1(4&X>S zRP$uC(?Y>+O7y&ryl*qzc(M*svY-cLLI~t61?(XUnQ}RSB{m5>j=R6*148KF9m+0WPlV4>By07QA zr2$ldWl4P4G>zt(yY8+)zJEzgsATPogYQWBM6>wGx{yMOXU>1$7rtzJGf521X3sQc zKS|paeB9TE_KlRt1YeYE>E>vn@Ez?}72aEM0a-$r(==$wxhqa5#cYU^f^zY*A!aVHIa%1dl^pr~=`NPd z{rxOxw*lYM;t@ww#vWn0j7fa)75qLO1%^{+n^wH$r;q*Kq|{BGyoIcCwPNjw(GTA_ z$mJ`>W3Q?*hg;j78M~@ujj~+i-CNG|(wW%oT}*#WLd~%g^5LIevnFDq8`LP zpih=f&>P+&my{Q7+VqtsmdR`qd{%^a$b-2cb+y9X%>zEW)}yTuxP1Gp0s(G@W9n?% zI*@&)H)G0>*zguyjRublaD`TlZ$F?+7_AyZBiOqq_aakapZ70oTIU$(i@0mzM|M-k z^l3AapS?Gu&OP>;-35LAMdBS`)1mn{43C*4OjvMP#vX?7=5L!ljt6hFEj85%mB0d z+~+=VU7xdi@kRZntt;Iyi0eD7Hr>px%V6A}Su9{3E>e-lVet^xx%MikQEo)!)6%RW#SDcv_jb3cyh&Dw`#f~}rB)enK+pwlA`}$KrHzegAPz@~c^hSW58M_4gUgb#laPpp z<%KW6zY7l!FYSruvgz36`b*yiL-a$Y5>9e9BnVK)^6*ZZHC9(g1Gf+`_LRO1Jj&GV z{@Xem^>02O|F;Ia%C!S@&0ku~pSy(OI%jb`$|s-z=kZx-w-6nxml&ye>av{1)|gVH zDFE%-HY9-QhxsG3OZeN9OjMTzar~C%6{KS5@+bv)*j+g@OzaZxqohdouPcU!b@UOk zb=iRH;Sed=iXPF0^aAoD))m3zO{~P*tc@icgRXxu`)n)aA}g2IP@?5=Y1Kp~?(o2^ z*qmB?z0^`S9lE$|HVf(VVcA56V*RuVdZC-mSqAx@bz5Ys zGTP-p(KXnW5%*aI6XDaA>s3D?GI`8M#e0S~U!SmZKCevUOmC*pnXK~n z>uyxsQF5ki^cs`WS9DkYOYODEY z{ezDt@y3&cc|m(5vP%A5*P$}L_L|Nfl5(>EvJ2M4vzMH5Zhp`W==dadW+j9(-}+L#mW z(85p5d3eqFctXrdXpdynA3k%3`ZJa@;l6Bs=0mF z|KW9Qx^~VV!qU~>MmcDNe{k3H-Z2NabAR;qpl|WWZLVHcBv@+qzTy4#bTM|WC{Yxg z^Zq604EA#PibY(opinHqzEi75+MEs&>x+BvRCfH(Tq!QMt-zafNkV{;T-mPR5nB>T za&@wDtIPh>^>!Y|V>F_xSU){7z^E$r4i&Y5Wzkv``win`vNyyp%qrH$#3>aKQR)K; zbtNa-dD=JawgCyq{x^^sXd`8(o>dorlgE_h?X#mEE~e9j*eg&`j!#$(N~E=7-2rlq z5{F~Uhbu3ekOJ%O)c*F%cD3dY?k&bj{LO)R`Gb#YFgr-?6bQeJW4r=!Kj4nKQ@EW4 zApX7>C>Mk(dp 
zgKO+uk&r`t?Q!C_-WJr`X(h zZsl0aRn3ctdW~qv_*tSdhvPL2$-93f?-iIA_F^6cRZIjDjgp%%pw_-j=**dO)*FBl zzh1Zb_+is|@>XQ$1?-LWUSUSo%pVp|<1Z~P4*k?MNj&j2^gF9u*NL$rePB)OuwQHa z`jZ97Gt=UVZ;?W*_eYxY6LjGQ+yXvcJ;5t{{D2{f4%!x&drLT z%w_1*4^vGJ=g!Z7G>q~74F%eVlN0inZGQ+EzSv0L#TgEj@m*T_L1xO$j!u}$?TgI? ztg*GgUe!mt9h$L7CtqT?{Y!v6@%Y?!7FgcBuX?xh7}%hXoF;J120h7y z$HKJhikeE+eKKNrYl&u8>+CUiE$FV*TeZBd<%^H^SJD-$vqav|2`=*=eYaZp1R;*C zaEO>uO!zZ2NRDsS+6-(t^$qxU9rU_>d9%*xxhjj#Dc#~KE4mT{`t<##MNskIaPO2@ zf7<#|wW)gjkiu9v-X{k6h4~K_lz#N1E$ZmsZmtmIALF)@sR~Sw<|$vRyLe=i<5b4< zZ+S%DZsC7)#?q8bD5~q*Bkjxs}_P6CV zjX?B|!s!4Q^bPBa(sPVO0M2a2Qm6r;O!QRpPQ&Dy?!EiHTJ%~{W4WlEDT{!Ubp)9% zM#e`Ad*L(CcEs&rV1<~6?~d@mR!jQz<8-X=ECktP&TVTZD5!&JLZ7>w7|ptKlht|N zCD=PQPLw;TD^OH)EjWJ)XA)1;rG?!hSIW=j=c;^})ZW2*~lOXcLh?AZ*>;nK!m*w7X~^`vmhKPLv<4q^v*{IkDV=j-{qzgB;8 z^Is839*zdnKCMM71YWfb93DXwaVM~$skggEAD(5oYgqF+=BtkMn3Aj<7Kt27&ju_j@#5HBI(u_u8P*Y@n4{zWEHo$ z=7%iS@;Ftxu79q6PNySzyvdJC1L>|k?i%w4($SN^yCiCteGGzJz2x#yi6}Hti47mW z&BeRTyN6dcf|Lrfp1|FxR{FpLfh4KWEN)XA_>dF31|i5*i2`hzfPhor zAc0%2%^E}e-%|49a`u2WE>wfX`1cdqq`0rpSQ3_4VHbDwN_l)OKOXCx+JO+|A&oOd zl%qMZoW9~<^loS^y2|-I@0b^PbW{`)VBh*rM3^l4Kn!}?fpi1GS&>do(dP>TpTqi4 zKV)Zinz2|#sPOm+^d?62yo($yB4~`YT_@ga?$Cvl7?hChXL73f`j|AogrPo2b2#3!%@Vmo9 zHt%qrq84Zbt&Neyl2a5fCI8&x>_*dEX54^|L`+Hd3vXq2Aw$*`b4>^yeKB5i1^T2u z-U_~z=35Y70_V~OOSLhtPS&sbNb(iJZLRQ@?iUW>s8;^pYb_Wwi) z@Tld-@ZJC1_!nBdtzv18r$Qk6{rrPfb%CiXN$LLoVEw>vf~nG3h$tysdm~Rmx0r6? z0e&$5$Om?gT)z58l?N`HnRpAa@Mae^2Sy!cicjNF6Hq@s&uwGsHfKMcU;1}vgNj*7 z>(q)IM$_T<5F==QD;y8x6Z@D7*T##}D;CtpEQrv2{^k5O3zNUeDx}4Lhlj*itpdJe z(KNHp@#*h9d%dex*!B=!g2vB?2lz@Dnn4fcF?}qR36YGN42$>f6!~VqSZQ~(=lo^S zp=rVwKzb2>;yPZ2e2#=sL>w7!JahGRamQqR>;lTaS2L~wJkKh(=qe-WjI^rn1wbPmHaD z{kdb=K?T4&lW~35y<(Hv0!c3@f5oqQMkihr;}KYDTSKtNgW z+mvHmcm9&ZMew9qs&hW*8`fn*)Xhv{)s)EHhvmU7x#vFhW7PkiC4CC$!)x^9zv0Ma zvFyFEX`JK71B0_{mRL_BuJ#j7-j>xWQgiabYp;*e^#(zHloPDlcl(c zF+-1jM)iYTcFq0;`j!`cUC>8Q=RwmPA{)V&`n_LYFAX=a6vGdwz}0UQe69wZXl##K9^yU!lh^dMlqmL*qmvw+oi{U<#o(4M&fE@SBPyY9T=G+!@dvSM z`EGT-P$vC(rGW_=NrsKFh7Ci?9u=M<&Z6^3^F9I%9Bm1~C<|iZV{m;Eh^n{_{HLg> zzBh_Qb&xVS>g<8wTZF71+Lfo zl7TWg83k5LOADCp6%75z)mrlf1wvW?pL~+gA0|*U;xczdO%0Q8Kb%mSS_WQ^g~WdpZeL9 zg{xun??%Ya-V-f?^DPUo^3)jIE=~STw-D|Ml$6M!xuH}_l20izi))#vN&RjK9u zgjg9jmH6y}gCZrX5#=^bBz5(}IK0NLoH=apg1dqs!3^(#_#=8RZ-fMXVmQeK6w*ig zc&k(O^Gv!)Jg8L=9o&@qBU#QgSW>Z;BOL<0FaM)t216fN?iW0f5*=1U&fSWPWyxbS9Fcnfi zZY~C&Ey3oIRI=Nt9@4YK{g0J2yY#!RwHLfO#1{%;Osw*-&jIdXPh#Am*OjoWvsFAb zxY~$SR^=#A>{WO^SlRCw_AIA$QWBn1n9)a{3kbH81NTh7$-*S!#@x@yIO|g_y-9q- z1DF+|W^HdnukC&4LHj^1b8x?A0%SY$u>ZBrfTr4E#-Sg_>e|{TO-)HU+|rsFKwHVF zxZ+f!W2NzfNm@VO{@((@YdA>ITbDd#YoDD))1juSSK%&W=l)O$x2}{*)m_u&iGwkKfFFHnl0v71{3j{JkMj*rgCiIOTbX4)!^Tst!3`&=u(w_}FMjvQ00vCGbe z6zsaDw5DN%({K|p9(SjAqbNX1(t?29)pOHmTbDHdW}{Q9;*MKJ7R9#dR$~?r&~$L? 
znorPd)gVb2gO;hJ{x%{1DZF~*q~C(e*N~${M)6}}e|fDfq06~sx&fATyVaKU{mrUh z&|1=Zsb<+-bjWVlJ&CakVSsy{S|OgfgLm^dJA|}cR~_-yLKn`R<2|N)6f#bqYxLf?IE>RW;h;U=#_%X?FTsSgLms>h|@?Ut> z=y{Djc8!-~!wD*k|03QXJ|QlfxX&Xe$)^!^sP$l0SXC&~6~I5!m#e*@KWA6}V7u!u zekRj$-J5_deiuMkE8;3z@F3`UKj9XficeEUok_<#g-s5p81Xz+d;5`2f90Y*)PIns zqLof1Zrv56G_uY|x3A{~=iOxkpBgbYXCrqOdodEH?DpPPa>xa4<{hJ6moiaog)7m> zL8?E~FDHY1gpvG$nJ_*4RgRr46DsTVPE{V#@pf*fwZGQ$eS$Ln@42kcM6(eYaJ z{#CWMw4@?a1*H7<32?g#ik}Y|1J0C9Zs^&g&4}#;Y)nez#`z#jga2cTCE`hY;^nTtoG+^5M(nxk7=Pmowj$?e60;IcpPSBdBk*|E{Tx}aq0dWo# z`l;|XV&!B1l$VDnVVt5b;u3Sj|7!tandY5ww(fDK)mkEjBiiKd(~_DPU{hYwJXYD5>ah;D^#?h=rT+P%?IsmiW9BYkW8M zQo!CD+fxd0d>O5g0#4b_>d)77b=`1R|0&Rxm$g1O8J@*^F9LL@*3PFmf)7mW5Sj#eSSp?^tb%=L5YF;C z?4kRe$nHVK$svo^jIV2m$K?Wl>caK=5WA#=N#vU7wfNE+kQVb_*)%#6*MVR%=$6;{ z&EP!f<88zH;xpspL3qXaZiw^y%Zsmv{n06O;prDyEG3P_|1xM5iFLuEMWElQJyJm; zQ=%`nDIlZnCz8mFH5D;pY3I&z7&3-zrsv!JjMl%IqNsd~6UG9O>_5=Cek$IO@;ZQj zrxF{?v_{M&a(m1d=H^+8`@QGt#1|oKVAXsCYmR3iRsQl)HV6ye5su2}hH=f0CM0le zXH0r7>qzCS_IY*+pRlQ(N#X3znw!fn6Ir%TWQ)voyj?+*Rq+_%>+b{YwyL(Y$j<1y zfWokk<^zJ7M-Q`|lQ}E<5sMo&^^ciGRmonT)iRK=@+|Brxb{xuBhvEY?w>FQUcDR~ z8l}+!?9^gN86lIQ#_{qHVvD9hBHh{ovG*NCZ$}rWr@kdt;fQOi2#>$^U6xb#Gq+KL zmh2K6@3+1|N0-ep-4#gRb-R4>aA-WKn)7C`3vf02%I*O=K0WnYwj7B{jzR$Cn#g2w z7Z>lO@6K-GTG$6+!2;K!x}nE5z04+;6mkwp|jU+6z2+6tCmD+ z=&R*Ad^M(uDcuJ06zs=}X*U%mS>*TC0zFTh=X%!23U34^ z-xr-DrEs>d30Wa9@6$>sos|i#xy=$MB_J?o_U~m}3w;vkq@EVzDsVSUCg!*NoUlrr z3}09g{fq0<8zhmZ8fB49uDkY$5jz4a=3|`m`8Q?tC5M87MCThl#bQN z)o@ca6Ix^^BKG&q0pl!+(54U_?7rj{pKVR?P$~wzKSwv}S~M5c$$a7FiJqq?@6Vq<|FgL& zyojy(irZ;{A_n5hzS%C7(crl_6=OsJ_UDF4WVHJ#CNwD=3wu1n5GtPtkDCL8L&}NB zu)CW+k=dSHgqYOpev(|27nBO3V;ppq5OgBKGynWqQBMUJ|AN70Nie5`pIvu)1qn$~ z8$xb1{Nbme-s+jLmZ5Oz8iUT*H2NkPDodwdY!kmeJV)aJH-p+w;!7rZ&?QID{yoZE zIms6mysUunf2()e4ma$5%d?G*7E@o=BkCCtx)%>*Z~t}@i*ImG#31hGL23RJR-C$y z9)H)RWp=M?ff+q6lS@=rQM`vxff6LTc)k8@hArsU8ul`&EY_|xd8IpFl3D;3rn&EU z$~lI)TKs{>KX;bieaOUc&CjIz$eCCf;*Y0LuH zz*IPkO*G2|&RjwB5JNYCsyyec}82_|4)EIf1-|0diIhHT% z&(8YBFgC#lT^XPI<|oH}D;jPcDhLYK{G2~78nCq-N^N$Q9|d-^aobnMnBY z*eZ+zL(q5`_MrH7y46A;UdOT<9eKm}kdXxIQ$rZJQ+%2FO3U+7*q`#;ov;<*{n5EQ-~np{M0K`0T}HL& z6&P*JD>{*yh!PEuPQE+ke|L$+?NV9oA?ixV=N~&SO{30C*p8RD5!uXllUIY64QZ-# zQxY)qLr4&9MpI7poQcEQfu|$OfN|J-#uT>T+taAQLN(vy%d%S9UK;NN7v3NceMKeP z@r!?XdcZ?RLe^%n&zl$sd|Llz1|KUEY_@$HjUA@PI?-g*3nZn5=roxmXfrkdbnJG5 z%=w!$Xh4i6-C+j+4vb-W%trqUS;P18Yl#i zY#v4nq(#&H*0iZ5)!T`BClV4ols2Fp!aQbrV1`DI9)a!Q>AAGIsSJE*9VaWq2S4@#&WS5Wxq(!c<5~4w&V&4hy&R6=Cl@=^a3yu!;tQljtNFQk}3{ zl03q(*FN!@=r)&H{(N>32@zM|aCj9GAJbnss+VuJbH6=ia`q64J8l6?w!4qC!=I`v zL|{y|oL$bJT(LTabV`tylLA>KrFR{bvb zcAB0>Y7}|-b`~KkvkvsSudJfzdf>uUtXHt;IdCVkF0-)kjV@ z3(o_;Q`9v-7t5=XhofAS0hZP0w1zgxJ4SjXDu3<%CcQ99J0&TyV>#m8EP>Oj9t#NY z%N}gCtfcfta4v)J2z=uWIXCLe()ATZ_8D;QqFM@u1Pr^JM(hPW`RGtCleD!^>+ug*|uxK{8mJ*JyF z&1vKuf;#UCITDOQN4=+GZVtu1i4;xJZl4TQ1_Hv?5erfSqfxH$!PND4)EtH7iBZ0m znfLPaqREj@KAeiK>rMPo2I5@_Pc7UtDYAr=`bt?`pXEp(&Jg21y3nWmDJBaW=5*#s zgOBO4?s(ZISx3LEC!k|Qf{_=S(<~5T0(!)kv7N%r z(>Mu%8XKK~{3^wY`s{wIr&1Nw`|h$f&)LsHzke@z1`Z9Z@+7`ILE7wAs&MEfU;qP{ zKLmUP@LcK1F_Q&gm$$Zl9M9zgr|R6D&20F4q4Q1o_sNo~CF$?^1;oWO38J1JtDfe6 zdF>n>DFDlCD9jV&@?^uN*lMfW6`ti%b%pLXPN-rL6Q2|?FA`=Khu}W3c$b`Q+`_)2 ziIJiG@2K*=Yj&C+XQ zf^Pg%PA$<@s1$a0Ev>A8L@~|jv`tp zq*&(oMM&u2(USz^pAto5++870#&xKRcT@W6ULlzf|E{n#SpJyz3`lYf~l&@>ThF zb@7$I+g}pJh(M(M!*H{_Ftpn27D6xkqdu$oj!+BavxZ(6!Xkj((Y@b4cG_8KcDo%* z3c6y-!(woF)gqF}$CmhnAij=!(zApfXg%8UeYdkbt_I_SQ-|H+Zj-)H9?SZw(+zw1 zg--gn9(yyKX#WP6wJwq^cS{sml<%zh3j=sm+pm0kf4*eSRwJcv_cV^0%JxrG|AEgL z8udpH63%mIn>^e>q~Vdhio9_3x=V5s5F-QUnnZv#nFhZz&Kuo^ 
zpT?OwW>0%R5SKsEwd@wKWR{bC+DwPAj&6-p9BJ2Cj9rxlmT>XI{sDoA1 z&8lJZun19-N7iNK$(1rD$tgIS#pS7tHjhP%<#!Wh`ob^!0H?Iw{*USBdGf&+-_7FY zt8JJn@`^6+&v|x+~3SK`uc%0h~`_hMzC9Bb&^M!8Y?SeV>0+GZ;Oq0OR zNgK}~P1)&1FPJzh_eq(=5|&e5N3j<$Q}apG+@R`mxB1t)Br>eLEnaDP*)MEwX7XXk z?=9Fp?1F(FbKo9J&>&&};{m@eO>q9nx`?0fY>T7M1*h#MqhhQr`%pCRtVvrkiq)^MZl)Ud4>huIC>gj~UJWr7m)ig~@)qOE)EE z-mmxy_q@5CR2@T~`~guEvD_b=9+fspC0Ou6)eA z#Zm!Vbp<6-Dn3lo?wTxl65z)JoF>-Uo2*=NbR@mTA`;=N7r9u$f82+^42>G`}p}ik_{$W8^k#Gx8017$_yqthz9}Ma1Su1 zw{vh%*-_>jx8p;s?AlWJa7Q3p+SZmXPZgNqdH##Sqkgv$GjLSeQknslTcc;sR3qt? z>QguepV61d?ae4 zCiJxnAJDo290T`P=t=bvgqT?b08*>!VD+?Btwp66f^d3OHlNvafM{VwRiVO71x+k zf%cQsC;r8kvQvFF>=*6`7cdM6^kF33YI$P~ky4#h{K5q}b} zWRn*aIaRUNT%AFSES!&Xo>zKp=53zHLq1p|l9&`UjSJfOeKlDjZF%%;D*e0~iWtlwYH#nUmQo~Y`wkcD^H*ZtE=;M#ly zokY~TrOWKS;dbcol3A5@Hyp7%I3+bDK#at{LNkqbXMxjz+9*H={v zcNCtJq#I>nI3MKG8v(ITx>t?tR4Sj9@MA zDKroXOXlWNOnl!j>c<~+i3bc-n>{}$t2sBX+t=IKhdnpl^}?yy;8f>3c=n1(dBnUe z|K0X}Au14OKY$iG52T6wDw&3T{5QhUL0P7UnB{w%B$Ywcd5q1sXyK8+Mr!%{`P`#W zT}9Pofow)w>}Z~1^mv6#-c__~_rn3)^KI%L53ZM)YDgq+?fvslcm}M8B(7W@bbOSQ z0T*fV!k9ZQ0YWHktpLbBl|4vuXvix-ETP+UlA-(MsU*tum*(>3!TS@59%e=bRy3+N zbaHP(J+%-6UYI?um`O`DtoN|#SkZ63sOXG;LhBU6-s5Bk8!)W$3?;HFs^jUZwY-;q zH^OZ!Mux_hFL~N`va~qgdwb6= zF2-9~wRPx;(X~E*mebVK)Y=2BtfbR@jKBusngDqfFx$%B0xox%@+Gt?@xLj$*q%NWlVo}`tt*y z9$K8XzlVjo!upQ#s3b&$*5=yLd^2oa;y^E3B1Yv-ZSmCr+&=)iXLV?|hZ$hdZ92cC z#>2{MUKSNAS_Aclf@z-h@B1zP>!OqSiyyp5r=R9|=IB!M^ap>~ZvgK5ob{^l!G0)m zP){FR~>R32LRN>G)UFQaIOI&At)eOqPaM_xMp?U&rao_p`uE1q;p*eWaB_=%Xw zeS8dwkZw!&;I*4h#bej5K_DOlAoa?u*A*wCD{FAo0G&B{mYKNA_jP)5AF>; zkaW})9!-4NPW=&L&E#B7DfTP>L$TF+y1>-Z)C(%@Eub-xGyoNmh}zu}M(>fdENYEU zZKiU{VI_OK!=8s=4!udGQA;k58d^zL`RjBFiK_|0-XOU2o;K8-V&h8-KSN1HETEQPS=&j<=NS?fliS zUs~C+6W5R$EPhO-mT>l4EK8u%Gnr%wy}Es$uN#f~ci~J!omWq!8q3X)2_vORzS2g{ z;4^aUU7ewYqxH{G8D6-nr+lWxmJEp&dn(Y7l+Qkku*EXvwX@OipMfGN<3<&;d-o9> zSyPP$bi&-uC8M#Z4(|HZBpj*9nxg}WMh1NDJ7PUMUAY?B2I&%JaQ{7mrXb9@_S(6} zpC~lBim?UiQE|q`_WJh)rCPP{KydXoDpD3j_{pz^1$&lPJd4{~;-S|ucl6Skj0v|u zSf^Tn!bfxgRZ7)lo%MsiQpm9BXkDz(J9{|zUz|=y*zn3ze!s!ORc2FuSY7H4C3<`A z++oX<{l)hpX)AuJ=}i(%W{roVa^o) z9!ql+s3RywtcS#fg*ht1?PtvHKEBh#kGWaeN3aztHH|R^)->yI;<=ia3S1WT=MpzhOa;#v`YhLb2kka&lB=YpjqG1|$NU|S#)8D}+xj~!Qsjgp5T zg@ZqbFgu|_$6-PES!#+{HIBMOzr7A=@Kj5pglW_LJkjcMz4XcewL~oU)C;t5*m{!9d`V<&-3-9K41xf=>>ttg_VU% z#d2;e7waGA<^cG85blc2kN-e03%=mz$4POuCi%w3MqoB?k;ky-lJlQ;5%9F+G8PA< zeg7^L7auPOs33u+BZ=X+aAZoK5Pnbqo~)*=y?uFWD=rNJz@Gol;65{>q_T1dWFMBV z&rtH9$32+4yE~Vsi!Qw6Nnki}_}|~lyW5NhNUPDk&V%uD4{ER9Z0n<+VK|_jUZn1m zR1?1-UCVvFxd4ez@Z$NuVFv)IzWv!8;Ajm5>DpS9ctgy6IJ`E`gk$Vg0!jH6!|L9GDe9a_q>dv2lOw* zu`)X0AWF}P3CrRV=|str_QpbFvXgpL)Au&m$LOZdITP|eF;Ge22s0O}Pe_3REE&b% znd%sNMxU9D{_nwgwf1QtwX}YuLd5(_3f%@o|oW|v%CNbmQUkN0r#YT5F@rQLSZVF+?;`~U&7E+ABQp(=B9T?249jC{P{tn z*@~i0{V=4yaql_Z6I)ANb)q4U*aUKPc`3Yr&t8$iRFV$0&5@kl<4xmYg+GeXVydHR zBK}Op!Te@FIaID-_Nb^78CR&_a+R07@_|gM& zA4Lrrp_jQKjS^N2{rQZ?sTW+2E{xhzlcRWvLA2t&H-6*)K4zmQak{>XzT5AX+7o}O zlf3EC!lu>Vuw~T`W@d6CT#QG3pXW5}H?k12$%b2waG5oO5lA3?zGEKJL(Oowm<2m5 zZxTN=n;JTM#5H)&^<#}nElCmP2v2ZTb7-!RkRQaW5d2|^Q*et}%%9;YxiF%C6gnW2 z^wUflPbIj{e|I-`r1rq&sn>&{pWj$*!RBK&=p@8;$`Kskq+GDjPeImgsCF(GBgnl$ zfu_J_M$aU_8g49f>Klk1vfCfta3a_s2~+08P`JigJ&M}Rl7ztNl;g-8k~6o=D;kch zJa+^&(pEm)11&zWi)Vn6b+G9LeSUqoI=FBR>a$9Dou^;R!5#XOwB+aT{s9wvMN*|G zuV}%830q>U>TIbg^hr4T-tcFOuvc4eso{VQ?2MwI$RJ`LaaZou?o`7`d2CB#rs$oE zsv?ip_WDJ@{#51MoFdH%_C`zvVw{FV%@6iAfo49WI7Lkye1g~tdK;cRYC`xGEo>Ax zSV@i?YT|^Fy5({jO8!_z?MX$MqIx?*Nzt&fv2?=K*oZJIdoH_(vJxZ*xUcB4sh8PR zc4m2Sn^}Uxk=O{@@v1nVaH&V4LNH5VWXD;otRzS1trV@g>#?yA;F#i;CzVGG81fN2 
zp0=L`m18VFoMdiSTW~ehwiWVcaV!hMp)u`-BN?p$KyZ?HBK9i4+-4Bu%bl(yJ>*|2 zB}epOUVpkz(7opg+`BOlC=58IXV3hyjs2LVPyk|KH{kp3xtaL@XoVuAXnt>Q`U2qc z?!z?Y906C_VCbxg&qc=?ASr~V9I*rLXp3agECBIA>32$_CVau8q^!K;Sr`|G25imH z>-XdRZ&?|@$yX}svYsPVQd0VF$(KQw(@~az(_Xar^IphWo=9!%xDES3S8ot}RdxlZ zfOPJ}oe!Hqm&0CZG!?M^r%BdXe%RZ>9Zz==U9D!aRx&p4_O)ARD%0TvWg~HiD zKr_!{P#1sU`H#Td(t;zaSyNL(O`3QE8GI`gzU)iassCIl3gUC#`EeQi<`2A%ZGL|P z6ijIn6@^ltnE4YNn%$pA{~+EtPd$;_hGU!W1`vHLlJpalNtGG9Of7AzOMO}wRxmJx zJ=^A_RHXH3zJ6y)@Dbb2H0#DWZZ@cAOq`*o15Y!7n|H)q%j8;f-i%tQs9WWcaOq9PsY zzOq5mVb1EThFB`RZwBgc%tKd>C?4k~vg9beUNo#G_)~DQi@DQL1lG{jwS_2Af=Xo=ya|T9t!*ZW&Xc$YcbJvOv%GNO$Uz5eMEeXA#utG z?2PPeAiYuZ?&9HG`H6niK~U+sOb191a)mtW@Z6aByP@2`S=7LftgFj78TruMH?kK^ zH4ot2VFds*0kCXJ9;l-52?(q*vtxnu0%l~7Bd<;WRi$mU;P=X2iJ z<;EZpHf)+!(ZU+v+S+Q_lmcwixSsjYOy}{0xmf)e

yqRvP9CW#kW4g4a##Su zyxM%3w-JbeKx6pAlmqu?XN;mOhvxl@j((riHz3`LOtp=1)^CIu)T+St;~Es{%rELDJhF1q)y zoEA#c^|`w9``#M(Rk>6FC@_+5B7{Gu-QOPQEI2Cfd8iKTZw0?N$SXwI;s^?Svy|3C zO>M9!Lh_sz!%1=oopJ}qv~$)z@()t<#Hl;xl4gtjTp|&6GWj6bl)Q0%y3*}o*>lUV z#@m;y0Q4O?k$j&FOpo-Hz5DOI!A!NUl(o={PX=h-K~EafmU)FZdBSLLo6#p-f8DEF z<6wjr^1Z=p3n$)w!M$m>1b_4wUh2xdF=p&5gjw9GlPm}j;VfOcjj3`!`?u<{ov4>e zNx%}cr^qr{rnxYbt-;fh_qz#=}v>JD#B zYAQ7!{G$rUzRvU{t{UzdxNjU_Jn>;;|8sus3)%Kgr+Im}1T7Sa`k$Vik#NE8XBOwa zN7=nCeiMNK;-tq7Kfi7oe=4xSw-0)pPws8_a4HaGc6HZD)a|+;ihZRwzTO|e=7RaD zp2795uDqtH$r@l%1Bo?Vyh?#UYIDHR2Svf2k^BQDjJT1khyLfa;J29NkHZVW*Mov4 z<%9NN=r($~y7D$Qj06M(9$sF{8ymT|+#d{rRe`l~K$O3#7Jp$*lv(CsV?+N5V9{g7 zkzDJ&7D4L*XLKZzCl_6ys>(`w-#$Qj*BtzE4zx7d3uvjN)~B{Vi8A+RNgM|evzmFs zB3hOaoH4r*a^NWPSiR5pdM~Olfjbe`>*B_F!KZ@BfQt}8Yx2`w&zb=CA_~z9PH3|r zAktR~L!u(z3qFR-_rn-LiKy9(!0tA(B$l;$0|z+@tkK0CjH4^wD6+GQ(fXJ}NtbeG zgQ14fU_{b_=k7 z84rSo`Ot8;qKFA1c?qf{mc*JQ2eLacIU2&4AN6nLUf4P+cX^3kpl-%`qp3K6yoxSEo_Z8~$;#m>1>n8Qc-7kR5!kA;D<&rG1=jUray) z8Nq|3Wb8n%5V@f92VNXnzmW&=Virg1aUU@zLTi%Os{VpbvG(sONp|47iiZY@5}zVG zgu!2tWnTT!Qy`=lN2YLaak-E!O+=483H54+*}h%6nmBoyWp1BqO{q{A5^}GJw-;(f zVdc+iI4B>4T{*SMm88rY`a=JEhTn$1m65+KB`>Wmju|-jj{}?-y*{6o%k0|b=4wE; z-FrEZ4|Jzo0iuAW!Fl+sBcV?wD9g%>SdJ>vhc9h{?RE=%U-0%Y7;-#^h_grAl9FCe z2$%yUU0pfr>L~v$`ghKCYoJnzI*}A!g7-3YjrV{m?%{=t_y3rB3$QA?=6hI5K$H>z z0qIT&>5>!#1O*W(0qIb>Q$pzwP^1Ku2I=n9Lr6$>Bi#+(9N+i-egAV^5046*xbMAZ z&#bj(&4UMt9=ku!=SUi622w8hzJLvX5yg|yvEOIxhrH|>-WLZnN-oK_G#Z67;yF6; z6`ikLbp;8aKS!Ypl8>aR7p8uqos)v^u@FEVufs1wSm*U%wfbku^=;>b=$x{F=n z82PlNl_dU@b(5`8GkI(-SIKMJ+nm?} zm}Z_lF_xp~x}8EUD4@d`X~HjFzIci_k6D~~+!$YeEb82oGLu_bh^jyN*WTOvV!l53 z3}yRW=Jb&U6u)alfdOYH?PH6_zrucP0HGFN$(k}&8_>B%dEvaN886+o_Abn zXDq!N_6xViy$_yH@(9fyrR=-jsCdF(afhR#%n963l0t#gRMO_Y z@QdmNv+v5uih>kN$U}etRYpUDvu2$KMDV`&evdh;{ES ziuIM<+UTQvZgq} zo&+B)`uj9y3eWvMQ1t^e`B}u76H?G1|7|$)pe*Mx@bmDUs&Rg!a-x3Y(K|`Tamr;6 z9X@J%PMMSKz$VsLueXWcVr<{%5pum7t!^pp_O=kEJmTd>ySVrurz!H|{QFEO7!-E3 z?>#cf*{(}qpl0|$MxW|ago=US;pT{esc*KnJMVQCXPVyCLZa?5cblhViY`J_N>^7m z`VlD$bFh?x0ybCOJ|P6ebRc8m;^JO&5|W_$TE9HEZP6{2b@rQ6@JPP@!b;ir>3|Gt zZVodmtD+kNg&iI~{qmca#C$0e?R(WNT(@Y%J2n2>_10NPKa?LSXo$ z-SKRH8ar{wHwFc+S0iuV%>)*o6H5}(O(j}l5Zzb*)Y-aBMIVY$!tz-h|1#K0dfYzm zKOg1a=Rpm;f$@2HNA{Iiw%FpQ5lv#O$p8F1uF%|0};OGdk$Mjza_+sFTsflP78Lu+D|Fagos=SNQmS7tl`T2C1>pH ztJL!Q%Df!5dwRVNd2Om!8J#vKe)fNR{2AEn$U16jFUSIslT*MtVCUx(uYQ5g-XK8# zEa5HAz`)>I7$Jp)OCi{Jda#D9_^J!eXiIW)L*P!uM0hOFU5G~ny_2MFgz>5JoYgC| z`eS{uCpFU-;jk}SNj`(t&30anm5mJo+Yv)(Iw0jEMQ8(jpLvXfxp=I{moCq?OkvD) z{eDqIw)t%bDA~EVJX^`~!&u)JE=&LzdW{RWmlXgAdW4Y)ETDG2bt&;3~ zvJUeU(eC(2bpY4~o=Avoyr9OPfy8zpzhbJnm>qm=-WQk-bl@sqLZ_Z2^T+Pgdth!6 z`#mcK2IN96P1D`o4Z=mA(Gt_l*b1wVc-k#P#<^7DnEnWcN}m>?Z>RT7dL=^pi~GeF z7*;nM|tRmVYgU=)8{Uh$Rl&%ig)d!LwE#P4Ci+Kmg zJAw+&r{mlm6fuXWUzu+pEq;*G4*=O@*A?D>v6GWy2bIE8 zUEQPu=ZXd%F;hNeWP1Lsy85(5#qZ{_3q-j4Joiy7!9EBHj=G3my>3eQZjitTE}ySq zrMVJpbE5iQgW;PoD#9QB{{364-w&$<^qSzc2KteW5W^R0L@%|qu>+(4tn4Q-{#dXk z3e#We55m~?;0F$rO{no-9sK;q84miR|C~+rw5jGR59_A=`($Nh*|nTZTdOLe!CG~XABJa4#2pe6nBgA%uoqLo&MYA=mL7EWT}k8K#lI9{ z7!QPoe*cz)sjNf0&BRv1C)DT`jZWi@*JYP$`y-0C-@At9-jfF=F~yz@VifFNR_Cc& zSFH%k1b#Eat>|dNaG@nqVcgQksI39k&+c%}ceaGT-fjJf_DfX*%F4ixGUr+8;lJEG z1d~b7bUSXD#@^yJ?OC_a%geKpoRil0lQ;Z~=v!e>1SVu&gM)+nLIvh8d5*%Y$Fp*R zA4tu%A&*{fml0~Q{+lh^-H1lM?hN^)6rNixq`%kfvQ&vMTJHD%T?s;M+d6YHW+cR( z9_uS}IC%WsdffbIJ-#B4t#>2SW7bVKyjAchmt*qs&{C=TXg#yb%AQG|IXL?HlZM*& zwMLzIp$mdf=n3mzw7Z|R7rM?nd%Jg3QEhz65BCb+lcAmcA`n&)?)7uuJ<+`*LqG7v zl&|<%`cNv6Q;q67*a5jD74MvVNfQIyQhk=2_Igg15Oe z2it0KH8Vx!W&5$$J#>@8Hn#ZdU!8yKdsVd?Sy?Y@Do!Gv-$FqQOS2ZX7oi^B}J^b}gs&&9u0l*NJgEBok@Lw-UY= 
zxgXgK73hkj8KKdiASD~{7FJhQw!0lkpML2Ne}`!{Q4>R0X1Bghi6f}aj8UOPq@}xqgStma@Jp7)ysM6ihWfzljOo5KAR6Dw;{Dfr3bKXP5&wZ!CBK4 zzm1|*Xw;OjH0;9T_9QSStNUA2%Smf9hI-0#X=7vKl;4_Fpjt~vNEoShWP{)fg11{O z)I)cD6%<*~=9eaDF92TI%*u#kE;q#PE`+JUJl@i}zl5Maps#p%(JSx%oB z5u(_GK~l0Owx`F5b$nd%=H`xiW6*Vj8P*w(SGMhMB2XnTlToViZ0p3$m{Gz}o}*oU zk~p^+?zl;)j8hIIZ(TnxU_+Dd4MP8*wffv{Ll#&%UX6>0NsTCYe-I=)Fj9}$o zXU0V_bmBGYQA%KZaQNj}OGr)*t4Y_VSoLVCo;9ST1x;!l5xcIx(#l5Pj?Y8{5{vt_;k>;vueuJulgXZY+ z`l-LkPI|*d@-$uFw9Qqhy7yU8viq5P#*n?lSbyy5&?=R&Sg(iISy{4a&l?H&#*(wR z<@ncx>67rEU8do;yF#s$?3 z2a8iilz^teOCh|Aq#U`0axBKmti~CMZg9hpw%1i)tlba6^**T*<=W;Z(lk|TPwCV5 z(T?f{7~HtIxw){^aiKiGM@B{#eIoVpb~{uWsT|%LvHw8Gb^jEF};hjS=G5jvOwRc7406o zW~0w>AA}p@n#nZ}RA*FmJL-lWxh9YX-WCC$gJ{%tzPM-=uTqq0R1>t}$X^Nj4`$q6 zdrYGCb5kBO-p8m6Dtb&9rm$A2WR`%O@D|exx-HO~LP^7$8Sik!P@SF4vf91MiUKdc zvaxaRFr+Le2L`~<$M~@ZH^X7R7i?;+MGTZdW=00RvTqGcfC34J%F*W1U%PPtLSsPR zy8Mlg6{;UH?n=9fA2C&uSWRF=+fT#{a6UqTIOAg|4Af@Cf3$IXLTu`7EbnH00rXkk zjZ=pd?fp!S@5elK@6+a}3{;u#qyML5Cz}X=$nU<1pl|KmmMb zNe7t>+{&#Vi`7Ruz;AzVov5-m?Ky*&1KtFBO&0VJnm6xFf5e{q8(L1X8ZY zo4u1~(+%EW=6J8g<}soe_7f_+sJV3&$m|l&#*5jJIeboz;xDw(X_%b%(|nVNgoM}Y z{J8X>wyZ2RIXO9Tcn|6TuRs z?H?)+561!Zv2cX$@RQm;{)3+M&y(!%$;D z$|oSOy#8S;aM^LXds3Kz5F0KO46bM?$b>V|UjAS1W$b1WggYF7v;C zsV6AJ!7F1+I@kjrwBN|X8ui9DmDKNnftG5=wUxhr6{Bgu_DsB$KZN$XBujP zC)@Fumr`JF1kq{2w&Px++u!U9#tx?)NV)KFgH~5pzrqG-!387y`^;=?Mvzgxh-(A6 zapgw&IC7k2R&FlZ3~=XjNzPLS@Li-6k&f$P>^rx(TAos8A6AigH{YnMy1Ngx@;1-KtBi!XB{xq3lIm8Olr|Rk-=_R3?;pDGo!!Ot6qkwulTH+8} z$nQ*)yEf+_%~x6~838Xk`{f=6$_RV#{_nf*WjJKdr);z_<|w*7gQoO_KkODc|Mp3* zx7Y8CMVN?&EPf<4G^X6vIN-j+9WsWgIFOwRU6V;yqx9TYL*5sqmkm(&U7jyX+=dGV z9hQ(NQ`mQLqv@?xEho=|y&b5)ij5J(ut?+~F@@Au$99m6LGOXtjB~ zpd}!e!tsblud+7@Y6HB-3V&H!>m0i<%dMY*n$FYHGi4g;BFOMz?5^vGV*jiABbje@ zi{#3zzd&p;soR2XRM2ufmY47N@fVJ>XeHaIC!dtY%aLJ$RUZ0OvjG44_l9&^k|Mkg zon2jpAUO7(o`IG32&~v2XWD8X#qDJ^{v^h};rtqHefej27^tH_v4)4?h_3Q3rUgrQ zAJ5TPD;k1wJBg*Bhvsy`S$xeE&KO>vJwupfRC{%K*0T9VcX$BW1RKiiQV(jsf4{Cg z0M+{4V$q{|1OhSeH@5dvDSHTT#Pp+!vKkt908fGla4WZv9gzmnCqKNv`61_gKSK(W za>8IHrSDZr-BEZ%R-lNT=QE}e29?qi+w{A-l z8=>V1dmEvnXZg?N!YO{vwgj1*o3H-a6XE7o^M1lp&ij;|7^fu&Z)>~$xw^UxENEy+ ztfS0?1sZwPHz80%ww0I1o~XhEhzx<4Jzrait!u_F<;(#EFE6heSO3Fval3^_h3z7g z_wP_MA|KrH8tTB3y{LRka}By5;SL10g^(DPjAj+)FR$IZGOI{oM^FfRyazJmBa%?A z)Zwf@St3V@p<$TR;O1AG1!$i_Pc1HvM(!H~SA&lJ@{S9#=}O=8K%<6&4A$2SQnBO9(_kfAJwQ%IX(doeKaSW;)L8EphahB0v z=kJ`~d=Ub`TU7LxOAT{93d1AS=t>R7z+}qRbmpr{0KI#?eV%gqMt$ zmpF_MnMVc?=Y})gc*mr-&Da#)=Q}QNc*#~;D1D1q{z{(Sc_a=M75 zi#D>o8cE7a?}fBA!`#{)xYnfLW$Y_?Eu>8f z@@pg(1P7uCE{z%1#*Bk-I`NW);x*+L&eiB@XXxCX0xm;cYLFL?O@KXik4W)+}fc^XJKJs1yQP_m(Vz9Q5OCAvx2p?bz#75 zVC6Yw-^vQSG9G)zA+G^BN^xu+1_`Ri9&X=$nEBE(6f^liV*zhbGG)?Gb4LtDmr zg_;7QeW%nSA~nOy8T1Me(&zS0(s1K*b*JSP<$(w1{}Kl8<+WD(szY>Q;)G?J4mEx% zyg)qS8_s$t1bBT_%J82* z1{LE6wu;Ki-wFx}hLiO`d-#H#_~L>qfgd79-~`(PF8c0wG&D56@OJeg1pLJt((k^> z{iCR&67JP|;35jCS4=`ejznQeaj_umWs9)~#q0wU6QO@B!cO(FnQv58Rh^zJ!!@A-n3eZ+f ze?jeE#S4b4%(%*zr05}SX>6FG)+1oJ62qu$2#~njm11=3{_4O%JbFALV0dlAY3|#f z+g^!d#b9447e*G)F>fxQ2NAc?rh?bJKZ8!Hh|k0`yUS8`Y@BJ*fcG2cZ9 zJ`8%l-|nnz|6zl&joPd&HP18S;4lB4pR$7XBPJ7?!1ef~ciV)0A8+=|RL2nL-MxLw zxCh6p{JI1g6`Gv#-hF?&<(K|`%i=B7kNu|qG&cX(eS+g9+5|J+D}Vo}zkiJK^;#8s zWfB3?eWv3otgn10{O`|lwWH~Qt_Ny>NkT$`72`eQLLd4k@sx<&T{}Q#;FkBLJgK{5 zX6d&Ct2yxI-Ttp_4sXV@yRolLz8r)LVc3e5_nghCM)xRhods$jC>r3St7P*2{d5Ld z_J-7)*{pEcVdnSmPoF&-U_%KuIk6$4l^v7I{X=H>({rJd05Trf%hduFc6K~~*Nfq5 zGAa<-RD1yr2(J2_GD)4im%p>Jv-<+NU0q$PQw13qzG?oV*NOuUj}E9gi?5XUhE-b! 
zZY&%e4ER_x2X2nvq}|+v;koL_w&31bSXvsTUBj*zE6mGlT{}RACMTzIai;zyPcirf z=x_KHNh6&``6i@|@TnY}7K@9E)oF?%%SkGxh3Iq9^sCFt>_pViOBoTgzwdC7$hY)%h{eBU zB#$78(bCe|*tCx-?aQitV{QGdv{c>O%ksMC-Me?|ZYXC47{nNVhS$of<9V%xD^&4V zQE~gkk6EmOiMV$5_6Hxv(<8)xL1A?F&*LW+7Oa^m zUT#5LE^xa&K(u^*Q0Tqb=!=E~h59H^uOYd`&!0b6>J&tNxPl-3)r5vAbX03E^idUs zGXTG;aL)92+$)gLH0E<`c7uy^8!9xa=N+)3Tj~+iB7&j<-(TPB`%)Z?Q(7wdu z#E!Fa#!rvwIi5tpSLq-gqxPE=xbmICg~N#Ht4n8~|G#%wN0j1*;c(SP4ycHPb#-;& zqxdiqor351@pcPd)mHtP|BNCf>Fr;7uSBUOtSRxLFxRMb&XZM}j+I_ z?OqxpPPx@>?Z9bm_)R{{@eBD@p;st2D8h;+AMdhXJ(N-Tm=x%Uc(-A5?XS5=%1hs; zGVcp0X5yO`%ih<=`iM#kF`?Gge3j`*t*NYOv8hRw)dxwJ|* zS@_BvT%h_pu8p=qlbyq;tbVXvJI)wFgVA#@J~Py(8;OG-Uly}Fo9PfLoD@m&uB&Ni z9MSVk#iV6(#SJ?d8m3K|!cKgV{~Xm zjZ14Ift7MgJ2czFXdnBlYnkq(%oa?j*(;aN-o4$s;j~8MK98~Sa%gjzGD2R0DZS3g z@Iq(sw10kQ<*TewU%V%baU{Y>=?!W#Y}#<@$T+`j-oN&5^?u;;>)n9F&hF~s2;^r^ z#okEiBa=s@JG}Viu~D-3rE39j?OE)ir3&wC37=|ec0f_uoLI8Fz4d0u-e^_&)q|x! zW{Zp`4rJs`F>z4^HIYY*;W<&O1DW=qpEWEQMFKX3^`{m}=yXYc)(+fGpk!Bub02sq zdjWUH2$Q=5qmTDsh6OVxr^&`x84)qD6Dek01?ky+e5~z?hKn$$z=LR{10jSN?apQ2 z(>!*$Lqzlg`F8m)VTzp#h^mB4dp_S|lqGG71UqVKStd@D-fH*S4{KgH<1_T5th zi!0K)5qGTF2t^c6=gzqpLnWn577P9_a^_W=@%Qe<~<{eADfVWnlZdB{W34`;hI7;U@15Xgt^$X z>1A1=4AJyEFgwYgeE>aLu5joDRtKFdPUZ_eJ<8&nSVbdztYr1prQ3=CaG{LZb(qm8 z^1=CJv}+Ycq!2CWvi+)N5jRL$U}^j8LXBY|EUu>@Ns=0$@H$8IThk&b9F#vJBUq(v zNyB+bWtQ@{ljX>(*`IodQw8Bs4P9Aol)hl6NhVLQE0_53g8kIdsMVsZon8NNso95| z+?+95Ig9!H0Fe;Pb1(1C3mr?%N+wlrT7+Gv5Vm#lTDuw6g}HH|s|9KAq2<&Kh3GL|kujZ4v){wyTix~4=VN{CB6M`p4FdcE z0!L?`j)(`){v}ZK1@Jy7gdN^P2s)a8=&6*zm))x;MI<{5gz=xN6tB}&L z!eROAG2(a}@x;EVN8;TdGlz8}lzFXB!ngl0=qTPMB&1DNWe#ozR7}c{ba`n>&f0gF zAetW9SzAFWnhp9nfN(td33kse)+ef-;W_eno;n~aO{QifPKtD=1iYZI;ZhiCT&wZ& zuwIEP(IrP;ta}DZswp2owwCu++{#ig#`{gEC4hP*iSVhDjaZ}%(imulr=D){f83KWSQH<7cUXt!L43Y} z=0ra9ht!K)x2e>&4uAF`>JP4wLreVNFqXg#k(gcophBLs12rD&EkS<&w0EyW1Tegp zY3x4Ne!yZoLTZd z%r!Y2gk>#e+cOCZ(A+3026L7U?{c+1)0Y}AODBjpvX@nZ(xU&2lQN)kcDMqBLt<`x z1!GuqNt0Q>`EMM|V?h1s=2q=RdFoN0Rp)IOMqG`^+U}Rq>cAPhm>^B=o!@EI)pwHj zm<&@wKCUw)d@*%~cMov`InA?T+RI%gnJOp8i3OkJgk8O^K8VuoJ=`KPpDRG-fOL6S z(Y%G7fUq;Q?h`Cn5!|ev=fr(tgHiLYe1+*omQ#d^*ITTsLlz%HW+P6E^kRO=JD)zvv2 zr1h_kw`V5+m+XRZnxIUGNlNM-DKfe%V53tARQ|;Mf^i@+fcx*e8Xg>cdr)kQe*Uh9 zMe5;;>0w%_+xH$~om;*`U{0dQHz~?gX6EA4q=e&XUzN-_(^${u+2Kz;c!D9(kf}U- zAE=*}NB8QOL>b>5iW6K5t{D|***>%KmzuZrHx1@?me+U59;Q8QF(Z~BiYmQedVe|9 z*W|_MDFfA6YAQ87iX**(-Wj2++S@9fYJ35Fto<5e6M288)Npx&&+Lc{=$c@Tb|Vj} zE9{dxBdtRTrw}64sQ0gze9T;+1|_Joln%Z^aoL*oW666@tOujh$oFJg;lAgvLImnf zN2PVv)pv1}e>c5l3XsfcOh6+8L!%((0 zQ>SnHV&HRl&d-=r2&Qsu-x3cw&1zFL)zw>*41B(S^S%C}yTtlv+(h_7b&|a|`RZ|?7Rmyt<2S-|`)B7M-zsT*t;(7i{Svk4u z00(dEEOZ+dsj8~}UJbFeu*fOES7mVd;et`fCi1YK{m}p6IU2UTAcrZf)~J@7UuXCy zVMmM3#|Z?P1KBk+5Z0hG!sX&%7#cz@{pb~tB^+4VeV6YrE*Y&>{)8L(T6tzNVoF~8 zx6fo$Bg_EI?C`iuxxyA;-%2!S33U8B^rmEV-w`+H9gGg#29?Chj%lyZa$;n>|K=kF z^OP5QbV5Q}`lc1*(metkA2R!dGuh}BXt}wIo_&kyHo+;H==%B7U~MS>d^f%UToUd> zsS$`rrD88lzq?u~hcbah<8T~6Eb&J9qUBp-NYnJ4#v}YHwuEclc%G!GZHm875m|y3 zH*IoRSWug$pZlXypDQ!OObv`yviL7utYFQP>#{WC$kb4KzJF_`B2}lDW}OlhfuGD0 zK`Z4f2Aw%UQ`XB%yYd%6nPLSlqNxFK zkgN7D<0jCu=#61*V01$X1$FJdeXf+6#+@(S#B*U<&z}z!(tHt?Mg(0q47b`p4#O-} zHEnkt1cCHi4%LGQdLyh9=&OOMeExe9#4&rRjrtMTj!-bnE3#zfu%4WpoQg!GF#{yo z+FeI6wz=Hy4DMyZVqA|zQ^SrRXFjMOu=~@ht&e+8SpA8h4J^0EB0zQmZV^9BY&*L> z+hY4iZsWvSor0gAzkSjLtR`X0({OV~!L4?v_WDIc&?kF~*+Z@B zOT+U#`Oa}Z(Q|d-cb`|EJ^J~G0UX>S4Q1ZXF>!E&f!b_#$K}V%{JMmR z2nr$anyf?#Y3u)np=jCIQF>d&@VtwFUXT`N6c%6Q>h~|gN~(CxZokbw$EKw8MtGka zfj&+W$wIx#VS-MCif0V{uvdf+)P?Iln4()yv{|`dd51rTq`ixJeIl0ihB^Nq7W$7iZ zbVvAkZ#6u??&g=#3x6$xx{bgOzK#Did5QY;uw|y01iy;^nhhgg-k>W%sWv(9ht1d* 
z`gBD1_+>~r=5Ou0$V|l?_%jMdl#0z}(1df5ya?&!jz(v{!Fl!YZB5UEl_KL;br0I{ zyMO83PWJgxZDU~(%+_R7$&DP=0fdRz;rDZXKwP2f({-i7LU;J;S&PM3S~=VPTM)=D zH`*;1Nt`VMiY;L5Dsgx-m~TMUOatS#%gU;E6hwh+`T6rF&DWZ>xXqVbI+aUyGea;KVGxf5P>-6kz#E$H4wD(EAh zcj*8W8(Hd%S^QJRw>>l5t2Up1UF&ul8}+!5#NN%~J-g1IKbvOXwu=5V(EMqf|J*XD zWAUxv{?AYJ3J>34kPUOBF3o@UA;fEl63*0HW6^>q|E`1 zD|4Hzdj>@90Y-@{LBoI4s1&0Iv!7zFsD3ttKvnA?6D>!cj3U^y9>^!N8F1Y%Y7oEq zO61j#MUSmJH>9GzGGI5L38J`Q%xV?kM|tSXOK#VE;_nSX^|zZ(o4L~V5h2i{^_E+> zs*K>r=wxZM#A4_qlG)vv>i<$vYU-Op*X)#pcg{LOiMfr{y!P$J@s-aRof(e5TWg{( z7S)mrvo4?U6P#wmYfRenA(Sy9>bi(|lDi$n#Su``{@p{5=m4SBJI?p(As!l0uWky> zAGl>_Wi^_Vte)ID6_Js3QMsj|k)53l<)YT0SAUXeP9gB!&YEww)t&DM|u#= z2blIkge&{8pdDNe286$6St_CHzPcP%=Bs zR=w5=&vlRdV*w#kK&A-)$ud1wn;MWLog!@z%8Fj*KZ+(Iv0i81k_T@_2&V-F_mo*O z`z9uoKt}T@tL9RMR}V|mN#ptR_CICmi)q|)XkgPCQM2qUB_-AA?q*~}Cm%r>_GM?2 zQZx9FRydeEV&u@mvB5N5Q|SaKWC!LHCwU5g9q75G5Pd3Yp512nwX*Wl$B#rnVi<|*1*bh79{Z&% zbS6Y6ks^83aMX&5{g&no1l9&K!Y^?CKZ|1J)}@{BjZFG6er)W2cko5vq`<^6V(kAoA4^p(J>- zd4&4G1C11;d!jBs^lxWd%!-u??ojt2X}Gm3OA8BV?%5pP8S;0NLxv;4b3hWs+z4eueFxoT)Rl<1xA~PhewJi=r{jV zF*{TKexTD}6MGM3qeODL9np=H|Q((EFP01IUHUDHn>_?YCzhPS`j%gjcl8 zVyRJBOnHtSOpc5Ic4ppiwAdL%C%wtd86T3CMk`tBdbBxtu~~=gmNzt8dMijOWS4Q- z034tGUYPljqR}H3NvbOs7ca|=&aSSTp9`Uk3eeeuueQrt19$#!u^wwjy#4t^f02nv zsVVArRfo(oX}Vq;?^r6$f@?y@&8zt{zGwP|3)=)@xLi9R-fU0(qYGIlQm7Bpn&bBa!9<_@k%a{9D0FK&#whhiRW~)}PscP`W*` zH9M*A#dxE{P&bO-I4g=uFWVg6mAhCBPCK2pa7wDV%{C&Q3lH13Pctj$7gB<~d~huf z44>f^Rk)*s-g%(0=rVOB@$p^*+FHe(&zZfE_)>@A?CbeWizZdR?)*=#==cx;KQ^x zsnxrk?|PvtnfU~RVk>=mHtC)}yL)@z9EHZyvoJ1p%wgzgXh3Wm7t0N`4*d?net+3B z!`STk=X;AHyf+@7iiifZ85(p((rhfL6j>Otw@a5)T8_4SN5qyq}P5qt<) z*$?Nzf)|r)=}n(9yI&kisyR9MXa5(!V)06$DjW`(G}&af zCMj{h%&{S`W1e7R*Udfin%XPV6C0c3^($X1j}O>yKU<^5^mjxhCi}I_{?^A;*J#^1 z^0w?)ItnTJJL)9fe5(#Em~fpOFPw8X)1_|e%|T^-zFTRvTYnHi?7em8Z&Vx%XV*T2 zFQ+8tCtwN^4J_`aZxi?yG3?*JqP#D}A`aY+>>pPZ8eHF+tbnsybfoAxD+(e1E58Qmt}>-~Me zYp|A!j-#QdsA$~wFRoS`kkF;&f%=zUWq6)L0D%%Px`F2Dx|4hS{4E*L=Qb7EPgE&a zUCfgCKyRdb7#pWrQ30j8qhlFoW$6AGck;AueniIk_L@k7rF=B~g^P$b&=Pb{oiI>+ zBkM!u%qDfowO_uZWz~$PhkOW5LQo)g%zwz@Dm!!49Cprd^(uAGbJ~y@Yv1NxwIWaH zCFygqT(04#j7pAis;L|2mm9-;>9-3h7=Wp*M-Bik0G%4YoJ4GxQ(@MT8RqQV8(I)) zTqS?BH$|&J2)v5hm8>u&b@DWv@1DEo{5|@#_we#>`AVja*N0|$7>PBh`+Pz^S}7Fb z0H-qoQJmY;XB?#gy~}iLVmR7TOcB%yqCYCYj1kOTLGWZ3k+-JrzVQHb+23%2Ro29fL9kVQ1TLD|c@B2HGBLR8nvtwyDQjQ}cyJ}zru^)*Zp zre|U*X(;#4EthCq_L82$!;XB-BeA0@JbQn>BfM?)t;30(Dhx4-rLCIMi!nWyo9G z^+lj?d>;Nx0$mS-^EWIVe-?Kw{EYdDK%mJa4nv^_{V-V(-k^=4^4uqY+74uTceH|2 z7NWSnmweuB{QS^c=J$Y^hGe~-9kn+K|uNgEP#D*u$nQE^5T5u4>3 z)}!Z|UZelh0@zfD$OzpwC)d{2cBd~5DfLK?WJ|EjEG~9mu6vQR2frj*VKgd-e^hC9 zR$SOYj#cH9CNz|+BN7!y6=I1g6CMoWNzRH7OO&P_dNRUOv%SLwi9L>NSFLj3$t;oVD* zX=-aD!UU$Wj-mj+0!@w^;mefXj-DhDklQ{pP@O;`!yK|lii%Qk#$xTR zaGkkow^461D!qbXMAwl1uSl;Y@FGHXlOg9oIu7eb49eQM-i~2@^w8y%0>Dis_8oN@Laz9_h^XG3bPON7?d-2AI9U*YmUt4d~CTV7r_lpY4I@DOm zQ>)%p1vk;@5c7X&*HTunBA^;1x()=??Bm0ES}k*JAw{EBgTq55_VQ=UoW=Cd>||U- zC*V!c)6?I#Us3`YYVh>d7LPz@gx5ZuBvw39b_Fz%qG!_Z+9D93P4hec^i}A`(6fh% z<$J!nh!EPU-D75FU*A2hzw(TtgHE~Fwkbwwi@-Vf-31pU*1xh@2xYpiXD7g08&Ab+ELZl#Z#oF&uLJO&bG`l-Ab;T za2W`rXJAm%)$P{QcYHG_=5_AU9zg}Tk`A;U>wtwy=sl6=`VG*ik?eJ94wPyZl_m&j z1STC~>bt@AQwHEgB(?ubzz+7$%8gQpr_q1}2Zh=~n-y36F)grop%g+-^)Er5x}CJ; z5fp$!g8CMyH*mz5GVb|^Z2N-$uzg^;@_>*%lJqCEREUD_MJT11aBB>RW5VGH*OrIA zys1|JPGU7%VS|l(mFxO5n`j(@soP8rKU6UE(1$#8IiTTO^#=a!|2<-Sga4Z{t==Y* z?Yr%Aa^VI4pO?8Vj4-G69o|?(4(otEK#Sls791eSq*HI-Xn6>lt&U8VNI~snwx-V)&8cma``tr1|>~x!Pohg`7i{E{VOQ`JwUI*{>dA6 zT2JnFj7lW_*VN*7)9+~bN6+4iQar=2(%i-WFqrGZC?g&!Cnat8xfPe{>(A-4J6dsY zuBk0{|Aqq@R7=noOlc(kGVgh=UUIidNOjWK&aYyM5+@5|i^Vq7d@gH#ZKYE+wr6V{ 
zFCvuldT8c9na$FtKm6i+*d}xrv4g#ZFhdNEksx@^=0pV!jo~LYT+Y&JU!*LL9gM8j zsNwZBA6qQ8H@j)G@k;_nE{M`^I3!`*T zpR5jToHL5vW!Gcdj@`RonH9T|p<}?X=mq^8Usvw`l4-WrAzfetB=__wAy^m9f#*oG zMH!FruaEB$4#G)pHlg8+dLZ;bUMHmPn_w8__lo%vmHTs=33|S}+08CIf(To(DyGcc z%>-vx&^$n~i2c)%fMs-a^tGvJ33MTN0>gl`l%)`S>r-UZ(LXl!3wXIteCAIbVzs0e z>x=1U5350ag%mYb*)Q*c{q+ur4D8{_rVXbW(p!l` zjsAr11IvxbfKGevPkD!Er7RA4?~MldoV;mN^!2V2^i3)mcESMwDtFB^;znPWub1$S zps^B~b6Yc%wBkm=3=UFCdIs68JDH0oBhakCrdbfms)o!M=+_B7;6mRNQe z7X2{lz|{i<`=yJ^`#B$*C9}>f~i-e+4iJ z;=Hl|Pgr>uC|YNa&LF*j$ae^n$wIZIx&NEm5b9?&yIlg~131!5`%x5LWhg#_ZKDYg~ywa|>s&9S9_73{fr3aaM^ zOUCWhNpRC#l@M#WJea8D0yd$ppP`*L*xVIZ#t-JMzXj$mb`e}1rM1YZsNWY+CYP+q z^MKmC`r!H*z`cj?n<17v0u3k1U5L;J5K7Iu5Teg}_rQ>=t+op}{QCRiBruxX9C$OD z4W}CLQ#6FCcoeH!f&YI2&PJrs1i~6WeXTgU=SutNkSqc3FSM3dLR;}&RlQu@tJ@JX z1^o++g#W_)jVq$1rDQ2Tgl`(N$5G8RhRpkP0y?=LFod;7crA~VLIF(f?N)x%^oZ^$ zL5c(cRTORwcM(6&oFnz(a6R8-%lk4}b-#dj+!8^{d|`-9H&8pvP$%aQ-($(pVC zmcdAN@#oA#jX#vw(;6BKymT?ZCrqYoHe6kBLHBGcV4;=u1%ICsF|_8?!%n~xKTg%P z+ow!NMX5A6vy0SOksFM@p+VWRohWJHLxNEh z?*%0|(ky)TtspWIc{Ma37A9vG{g(??-al{aXyxvR(T^lHW%K6G$CV2I~mBKQpkpL;I4m$u_jMclV$R2yJZa@mmqN1XjAgdjpC!K`+ z751^pFbFBi_x1tmdo43t1_Y&$SZv=~!MCv?)aa#?)MySZp&A<^3|DE`H?6QgSHH#Y zLP3mN{_{I7m1_kxfcc6QyaAdl$>t4k2pjS4xMaskwS*%?WxGv*gxjF za0by_xX*_P?1J8HZ^S49!pUcWH+m6!S&yPK+@~GNRoiboZ+iZmx8d>OZwkWOw;dlIgN=qr{^MwRdWFCHBb(K01u94P9#{~i%Z2q{ zL66_toi<(hja4O@2O!=OGck7-Vg_zDm3hKx+*;RW_%5I%h!}J!F4ok(js?YV*JF`5-7&&+IF%Nw$ihSvB_wMqx8Dg?DQ`|;F|B0%xXk1E9*}CYk5{&y`g#~yAfUduv z%H@z-KIGpITF(H{4wj@J-9ydm4)i`1;~O9^e+o>nsnA_XW-L)kJgB~r{tLhXh&e>% zWPKFS+*MrtlIk*ED?uMHpP+lQ;;y->X4lNk}Ag zWWx#0tD5d%8U;d7yVp$0xrDzCVTVSOY7Wo>8kN0tzthf6_7BrLWnKR8$D+-qV)g@{ zfU*ZI%}1&V6ovOyS$HEsj|6T0-;Zrb$g70$9>Ro=yyHUjPKtZfoapmQ{M zT^E2^il=b3%l8F-|9Fg4R{qT`1lkvXuuy-owx0saFGrGQ`TgskHm2Dfe|X|jI(u!1 zUIYn#{njUgEeQ^h@t_O_>1GP6WT|*wP7W;#OZIpzi!T|KPD)?7YWcPrl7898Ev8`U z910KE2c_-NnO2b@hD`-IsN9!34kqk-QTA7>Pz-Cq-xtL2G`zgLOC8^`SRFa?ClYH? zcz5@(TMNA^GC`qHk3DiNgL)}V&;M!byKMA-V#5D@`9SlpwtVeFYY^qv&nJZ8{!?*o z^h{b_k!(~ z6p@)*P#p{zDZd=Y7KMgJ6+rz}fKy1>sfiZZGYoIJ;XnFQF7w>FmYHiQBZw(lGg2O` zHX~xG$dJV%$O84_U6l;m#EvFfZV62MPG49FE4x)&ZG($H@f>?Qsh0M#^4yJ)BviiK zp`divMHI4k*kk$<=TuyS7E=kh=kEZXA$9nz$x7}piR@qxdHf8S}i9(b;l4O&S?KZNv z$gU`rT}Cz;385&mcVuR--|Kxo-`{yW`lE9?dUIdz>$+agxt#jAs+o%RX4let7B_Q= zuKXNaY&v5cf8sSF!QrEA(w2!ZQFru!0D6C%SjD7^@)Acb>6k}7)-$4pFdzFrv2*C1 zqS9Pmqv;DB$0H&!|4j{a#+sC(`LBtelYdOkd-#g=M+DzKe_fT<)ZG5zreDL)_akH; zCa{X9HoD3V#^3MweQuJ|5B@(`zIAB_)ZXDZsv%C1&GScPELe_N6>7ho^ZKKztzNz= zo^h}9dw-&5Xjry7o?q+*BWr^>%gjzmaAEas-0*m-4JTF-Ij9Wcr(I`wyg`IWOj-4&D}rF*R! 
z`{U^6Rf$q9Tyyq+xjt;0SjrUcub-dZ#A7CCvN3*4dq6X5^7tJq+tJp|7NOAI`yBmM zM4s=le5nh*ISQFl2yO7qLU{K6(q4Y&eLh+rB1_AP%YXw$C|3`WEe7cI3N z@`{zxx%iDPfcwJ|z-BP35fc+XlMj#<%y=7?HRc2N7B@*oKQquYgIV?Zbz&r8=lpad z=ojF>B_*PM{~qYd?S4nA^#T~A3y`mUj+ar!uD||==h{t2SB#SfHE!Q3D4^RPwc`}f zdim!8&AG`ji(Env0#S|sh3A+I*f#^j4XgnWTqq2tZ^6}Zn7^PHxa)outRIl=!&AW- zY65n{NHV5bNUm*NrHOt4Q&n|A)oHP7a<8m}%D59wGNvIK&`Drl_DG z1g^NkpsSZli0%Xg(k~Sq0D~N)glH=Ws_Fmpzk>8OK&TSf`anWLC++z_b9)}9cMj;Q(B@V?oo$%+8?9tcYQQ_r%3JRm;=t@vPW7O`de9w9ZOmpeC-M+9h<@NyDBC`O(}5Lx}n-$Svc+OJ!Wx1*|m~{ZGKcz=-@9- zAcGSwA#Dm8=M_zaop#uiT*W_K4b(~`2mjhRi7AdL*;ky1&+lGs4KQL{H+b7C^YEG- z*?BgmPz!{NPl(SiE0?*&QPwy2FR^3ZV(qiaJ}<=;RP#x2Qst2&l*!Tv860kp-1Kq$ zqb8-+62p}0#VbKB5qemcQUTPE}xB1l^N6J8s^^Rom*Pg5;C9rh*L9^FR9(14rlZFy3 z30VI&;E2?bcB4E~e3lftRtOS?#iIKoWfq8a6S2F3U*CSJDPUs6_i!Ntl7T28p{*Zy zWF%!4LKa1k@TBVMR1mj5uNshl4zAc8z&|QHiZ=y;tbosT8;)pwQ`6rLhtY-8J4ZkD zk7z#qCR7?q<>uqzJMV9Zam+Bi5vU}O;3G&sHmn#a-Qtu>7StPbFo_C65w~gJ;et-C zY^Hs#Z_)T1J*c}NXG{bj)fve3ybOAh7@?+ThdWEa6nqA@yB)}+I!G?#<>Q-$%%qm3 z9R!mGK`ZjS5GytdnO%h+Kd!!J1&oJ;;aV&h&Y(lQ1)5OF1J+7SO)czy;EK3|0d709(5HrMIt6{W#e>Ip04@%le}25h zE|ByEY#|d~V{rY*p8gyDKI&~3j0-m$U74lS23^mJ^pP`-$9vFKH6>L}vvzwWx^Hh%e)oy9@PC zi#YTjL+`2$&S`=U?NiQDsF`ux2s;Hul(fr6>si+JTp3Df zWq{&(LL_=dQa;MgW-gnzb;E)#eS+)}y8-VI)fuLI_6MzM)tPf1}n>P3nl=Od`%Bx6bZv1<+8AM!;cIuvU#!b5l~k;c!pgzlb=bWOV7pn|hK_`~5Hh1kx~3k5uC$!U zAJ!{ohgM#^(aIi<7Z?)wPJZ0RidObsiR}N{<{w8UpQI08PqnZW=9An0xN}Yw}5p^X5xb?O~%6wlCPDY?sk+ z+#ux_&qUQ4BrIH8@nugo+d&TmD*B}yZj##x3THk@^aIEXGo&PAyTEVtuj%{PRj)q= z;M>CoH6}e^r~vZ;LJ77nX%GT`|84-htwunA9IWY_k*mJjwu(ON?<|^g#rn}ZH`Vlw zYZe2_Bm8E5+Ftna0qVfh-Is*IYt}>YFZe)L*Wu)T)2M?S7I397{X=TO!SU32Klr_9 zC3IDlPAMfzT7c&k8ndDg&+3{!GLMN==y%mKOLf=VGro5U2tITUdgK zDziO@v|?%5+`T~r3s$_Z@IsQeoE+}_g3}pBK}a-lZrn8jjgY3MCiwI(sL>e|l!=S4 z*1E2>F7%;dUbT8Ch>M7660p**l#QB(kH<9;*Gj5*2u( zu#zvLoN19Jff#Qti}G zhu)C_=iKc4BbHbH2q^k^_=Zl|W4W-k&GNJyqQa6(rt5`>J}{Fn(P6mn2q;Sis90Q; zCDS4(Vp|U)LUEPFqCU+R{=F-t%Cj^w98ii$tCsVQ|8U~9<3RPbH|p$Ea8^p3#41WJ z9L4=HxBs{jJs5dTeDBO^^H{dSe|m~UcabfZtq*mxMzmwc>e}p?FQqcJEZtA#CE{28 z{ml|#Q#WzTI*NL58UBMuMa#>_TGB|~I2^pUp*4TjgMaC}$5bza&-!5aPb$j9Ck`U- zdHAqv2!mRpY4qBQ2eI*8?y`*(yr>v|)arV*F|AK4zXXpOzpeuNregRaiKY(rC)}5< z(%$5dTCuhAz%D;6MOXG|Fr_76qV<*S_p3l?QZiO0zwzMHQ!4I=cR_&IXNuXAVL^ru z$f4z4F3P+mbDAFt@Q&nDunm7{tx$Y}Ud8XeG+=MQ60Ej)1w%+?a$;gI ze=l(Vh8BVFVggRxZ+OS-$BVf?%LlM(lrdXCjUfpMaBq_n{|<<0mii1Bb>2L~oUMK>ES4-@hB@fV|yf5-X@!klKzVxpt{(pu!uUtJ1*5GhMl!f7lg zp4v4xnC1Y_aWYM)fEYTEj8nPNE}uux??h^nZc=U;QffN}Mh*a8+EzS8ft5;>6@*tf zG&Gd?$K3w$?&h7&(HN~U5D$bP4{FJTma#chhZVv$qQcRs{n|9IZm(yIHVj=_p%B8lcO z1%=Yf`8Wp?wOp1jqap*Gdc?`M)4?QxeYe~~Kx7fG$8B6(rQ}J9j@mqAkbpj7| zPojgXg)HI)Vko6gMV9XDM>|ZaWFbQJyj7?qdo~J+$~QH+9JpyytjcEehhCdD^Fiob zCbQ2c)ojXDb{K)t&SH<%*CBnf9lH;012=7KiQ0Nsb;j_znB1h4qmFK4whsv6naal5 zF*RVQsp zDSAf?Z6(~EH#6p2M|yDiXt7h^ZQB`3q@Fvy3H6Do<`XsjJEl>3i&xoXAL;y0j*m;E zg%d-Xt(BOoaX(rouqfF(@6=gR=a!l{S!wl|x8EUnHS3hoqmZiTl(4%T|zwJ|C zrzz-gc*yzFsZsaQ`+^d_l0vLolAM^)2bF)T~`!sH?>c$RRJl=dP?|j_w45 z%8p+Y4bta?`L>?{#|}~!k}haA_GVPe!A6UEdf79nFy4Q9F4*A>y5B%XEZR2)?9onik5pzs!SdROm zLM-fw(M~uGj0KW@cx-Dfikx<4Oing)j5JohG6Ihd$=lBCIgXB%kcn2^r{5IQE$~eq zdvbbStz(M-{iV(!%A-3%&wnxuEv%rtZ)B%UG2@l1=p*IN#J^+Tx*HVd&^s3?5QSFC z5_J}t4Xh;`EQ>+Ud=8Az!(m!3YN&{k-w>>xwtPuDn4zm0|FnvtAYI(0G>>C3FGz{! 