set cpu affinity and membind for better oob performance (huggingface#853)

* set num threads and memory binding for better OOB performance

* clean env var

* added core and memory binding util for improved performance

* add example usage in docstring

* change utility for best OOB to support world_size and rank >= 1

* fix style

* fix node_id value to account for rank_id starting at zero

* numa node assignment calculated from local size not from world size

* reorganized imports, moved checks to import_utils, replaced prints with logger

* raise Errors with missing pkg and unsupported OS

* added missing env var to list

* Update optimum/intel/utils/modeling_utils.py

* Update optimum/intel/utils/import_utils.py

* Update optimum/intel/utils/import_utils.py

* fix style quality error

---------

Co-authored-by: Ilyas Moutawwakil <[email protected]>
rbrugaro and IlyasMoutawwakil authored Aug 27, 2024
1 parent 403c696 commit 9a18ae0
Showing 4 changed files with 100 additions and 4 deletions.
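A minimal usage sketch of the utility this commit adds (not part of the diff; the launcher variables below are hypothetical stand-ins for what mpirun or torchrun would export, and the function itself requires Linux plus the optional `numa` package and `psutil`):

```python
import os

# Hypothetical: simulate the variables an MPI/torchrun launcher would set,
# which bind_cores_for_best_perf() reads via get_int_from_env (see diff below).
os.environ.setdefault("LOCAL_WORLD_SIZE", "1")
os.environ.setdefault("LOCAL_RANK", "0")

from optimum.intel.utils.modeling_utils import bind_cores_for_best_perf

# Pins OMP thread count, CPU affinity and NUMA memory binding for this rank.
bind_cores_for_best_perf()
```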
9 changes: 5 additions & 4 deletions docker/Dockerfile.intel
@@ -27,6 +27,8 @@ RUN --mount=type=cache,id=apt-dev,target=/var/cache/apt \
     libpng-dev \
     python3 \
     python3-pip \
+    python3-dev \
+    libnuma-dev \
     && rm -rf /var/lib/apt/lists/*"
 RUN /usr/sbin/update-ccache-symlinks
 RUN mkdir /opt/ccache && ccache --set-config=cache_dir=/opt/ccache
@@ -43,12 +45,11 @@ RUN python3 -m pip install --no-cache-dir \
     torchaudio==${TORCHAUDIO_VERSION} \
     -f https://download.pytorch.org/whl/torch_stable.html && \
     python3 -m pip install intel-extension-for-pytorch==$IPEX_VERSION && \
-    python3 -m pip install oneccl_bind_pt --extra-index-url https://pytorch-extension.intel.com/release-whl/stable/cpu/us/
+    python3 -m pip install oneccl_bind_pt --extra-index-url https://pytorch-extension.intel.com/release-whl/stable/cpu/us/ && \
+    python3 -m pip install --no-cache-dir numa
 
 ARG OMP_NUM_THREADS=1
 ENV OMP_NUM_THREADS=${OMP_NUM_THREADS}
 ARG KMP_BLOCKTIME=1
 ENV KMP_BLOCKTIME=${KMP_BLOCKTIME}
-ARG KMP_HW_SUBSET=1T
-ENV KMP_HW_SUBSET=${KMP_HW_SUBSET}
-ENV LD_PRELOAD="/usr/local/lib/libiomp5.so /usr/lib/x86_64-linux-gnu/libtcmalloc.so"
+ENV LD_PRELOAD="/usr/lib/x86_64-linux-gnu/libtcmalloc.so"
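As a quick sanity check (a sketch, not part of the commit), the threading and allocator defaults baked into the image above can be confirmed from inside a running container:

```python
import os

# Sketch: print the defaults set by the Dockerfile above. Variable names come
# from the Dockerfile; actual values depend on the build args used.
for var in ("OMP_NUM_THREADS", "KMP_BLOCKTIME", "LD_PRELOAD"):
    print(f"{var}={os.environ.get(var, '<unset>')}")
```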
1 change: 1 addition & 0 deletions optimum/intel/utils/__init__.py
@@ -22,6 +22,7 @@
     is_neural_compressor_available,
     is_neural_compressor_version,
     is_nncf_available,
+    is_numa_available,
     is_openvino_available,
     is_torch_version,
     is_transformers_available,
12 changes: 12 additions & 0 deletions optimum/intel/utils/import_utils.py
@@ -150,6 +150,14 @@
     except importlib_metadata.PackageNotFoundError:
         _accelerate_available = False
 
+_numa_available = importlib.util.find_spec("numa") is not None
+
+if _numa_available:
+    try:
+        importlib_metadata.version("numa")
+    except importlib_metadata.PackageNotFoundError:
+        _numa_available = False
+
 
 def is_transformers_available():
     return _transformers_available
@@ -272,6 +280,10 @@ def is_accelerate_available():
     return _accelerate_available
 
 
+def is_numa_available():
+    return _numa_available
+
+
 # This function was copied from: https://github.com/huggingface/accelerate/blob/874c4967d94badd24f893064cc3bef45f57cadf7/src/accelerate/utils/versions.py#L319
 def compare_versions(library_or_version: Union[str, Version], operation: str, requirement_version: str):
     """
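A usage sketch of the new check (mirroring how the library guards optional dependencies; the ImportError message matches the one raised by bind_cores_for_best_perf below):

```python
from optimum.intel.utils import is_numa_available

# Guard the optional dependency before importing it.
if is_numa_available():
    import numa  # safe: both the distribution and the module are present
else:
    raise ImportError("'numa' module not found, install with 'pip install numa'")
```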
82 changes: 82 additions & 0 deletions optimum/intel/utils/modeling_utils.py
@@ -12,16 +12,25 @@
 # See the License for the specific language governing permissions and
 # limitations under the License.
 
+import logging
+import math
+import os
+import platform
 import re
 from pathlib import Path
 from typing import List, Optional, Union
 
+import psutil
 import torch
 from huggingface_hub import HfApi, HfFolder
 
+from .import_utils import is_numa_available
+
 
 MULTI_QUERY_ATTN_MODELS = {"gpt_bigcode"}
 
+logger = logging.getLogger(__name__)
+
 
 def get_model_device(model: torch.nn.Module) -> torch.device:
     """
Expand Down Expand Up @@ -135,3 +144,76 @@ def replace_customized_linear_with_linear(model):
setattr(model, child_name, new_m)
else:
replace_customized_linear_with_linear(child)


def get_int_from_env(env_keys, default):
"""Returns the first positive env value found in the `env_keys` list or the default."""
for e in env_keys:
val = int(os.environ.get(e, -1))
if val >= 0:
return val
return default


def bind_cores_for_best_perf():
"""
Set number of threads per rank, numa cpu affinity and numa memory binding if not already set for better OOB performance.
Works for wold_size >= 1 and rank >= 1
Example:
.. code-block:: python
from optimum.intel.ipex import IPEXModelForCausalLM
from optimum.intel.utils.modeling_utils import bind_cores_for_best_perf
bind_cores_for_best_perf()
model = IPEXModelForCausalLM.from_pretrained("gpt2", torch_dtype=torch.bfloat16, export=True)
tokenizer = AutoTokenizer.from_pretrained("gpt2")
input_sentence = ["tell me a story about a trip to the moon"]
model_inputs = tokenizer(input_sentence, return_tensors="pt")
generation_kwargs = dict(max_new_tokens=500)
generated_ids = model.generate(**model_inputs, **generation_kwargs)
Returns:
None
"""
if platform.system() != "Linux":
logger.error("bind_cores_for_best_perf: OS not supported, this function can only be run on Linux systems.")
raise OSError("bind_cores_for_best_perf: OS not supported, this function can only be run on Linux systems.")
if not is_numa_available():
logger.error("'numa' module not found")
raise ImportError("'numa' module not found, install with 'pip install numa'")
import numa

local_size = get_int_from_env(
["LOCAL_WORLD_SIZE", "MPI_LOCALNRANKS", "OMPI_COMM_WORLD_LOCAL_SIZE", "MV2_COMM_WORLD_LOCAL_SIZE"], 1
)
rank_id = get_int_from_env(
["LOCAL_RANK", "MPI_LOCALRANKID", "OMPI_COMM_WORLD_LOCAL_RANK", "MV2_COMM_WORLD_LOCAL_RANK"], 0
)
nodes = numa.get_max_node() + 1
rank_per_node = math.ceil(local_size / nodes)
num_cpus_per_nodes = int(psutil.cpu_count(logical=False) / nodes)
node_id = int(rank_id / rank_per_node)
rank_offset_per_node = rank_id % rank_per_node
if os.getenv("OMP_NUM_THREADS") is None:
num_cpus_per_rank = max(int(num_cpus_per_nodes / rank_per_node), 1)
logger.info(f"Setting OMP_NUM_THREADS to {num_cpus_per_rank} for better performance")
else:
num_cpus_per_rank = int(os.getenv("OMP_NUM_THREADS"))
logger.info(f"OMP_NUM_THREADS already set to {num_cpus_per_rank}")
if len(numa.get_membind()) == nodes:
# if numa memory binding is not set, set it to the node where the rank is running
numa.set_membind([node_id])

torch.set_num_threads(num_cpus_per_rank)

if len(numa.get_affinity(0)) == psutil.cpu_count(logical=True):
# if numa affinity is unset (default value is set to all logical cores) set it to the physical cores assigned to the rank
cpu_start = num_cpus_per_rank * rank_offset_per_node
numa.set_affinity(
0,
list(numa.node_to_cpus(node_id))[cpu_start : cpu_start + num_cpus_per_rank],
)
logger.info(f"affinity={numa.get_affinity(0)}, membind = {numa.get_membind()}")
