Skip to content

Commit

Permalink
CUDA setup cleanup (#996)
Browse files Browse the repository at this point in the history
* Diagnostics: streamline debug printing code

* CUDA setup: Remove unused `backup_paths`

* CUDA setup: DRY OS detection

* CUDA setup: Streamline `manual_override()`

* CUDA setup: Use comment instead of string literal, simplify

* CUDA setup: remove duplicate sort

The "sort compute capabilities" fix from #703 (#527) would actually do nothing due to this.

* CUDA setup: make version number replacement logic more obvious
  • Loading branch information
akx authored Feb 4, 2024
1 parent acc7fb3 commit 259ad44
Show file tree
Hide file tree
Showing 2 changed files with 66 additions and 86 deletions.
69 changes: 23 additions & 46 deletions bitsandbytes/__main__.py
Original file line number Diff line number Diff line change
@@ -1,5 +1,5 @@
import glob
import os
from os.path import isdir
import sys
from warnings import warn

Expand All @@ -8,17 +8,9 @@
HEADER_WIDTH = 60


def find_file_recursive(folder, filename):
    """Recursively search *folder* for dynamic libraries matching *filename*.

    Args:
        folder: Root directory to search.
        filename: Glob prefix (e.g. ``'*cuda*'``); each common dynamic-library
            extension (``so``/``dll``/``dylib``) is appended to it.

    Returns:
        list[str]: All matching paths found under *folder*.

    Raises:
        RuntimeError: If the glob search fails for any reason.
    """
    import glob
    outs = []
    try:
        for ext in ["so", "dll", "dylib"]:
            # recursive=True is required for the '**' component to actually
            # descend into subdirectories; without it, '**' matches only a
            # single directory level, so the search was not recursive at all.
            out = glob.glob(os.path.join(folder, "**", filename + ext), recursive=True)
            outs.extend(out)
    except Exception as e:
        raise RuntimeError('Error: Something went wrong when trying to find file.') from e

    return outs
def find_dynamic_library(folder, filename):
    """Yield paths of dynamic libraries under *folder* matching *filename*.

    *filename* is a glob prefix (e.g. ``'*cuda*'``); each of the common
    dynamic-library extensions (``so``, ``dll``, ``dylib``) is appended to it
    in turn.
    """
    for ext in ("so", "dll", "dylib"):
        # recursive=True is required for '**' to descend into nested
        # directories; without it the pattern matches only one level deep.
        yield from glob.glob(os.path.join(folder, "**", filename + ext), recursive=True)


def generate_bug_report_information():
Expand All @@ -27,40 +19,25 @@ def generate_bug_report_information():
print_header("")
print('')

if 'CONDA_PREFIX' in os.environ:
paths = find_file_recursive(os.environ['CONDA_PREFIX'], '*cuda*')
print_header("ANACONDA CUDA PATHS")
print(paths)
print('')
if isdir('/usr/local/'):
paths = find_file_recursive('/usr/local', '*cuda*')
print_header("/usr/local CUDA PATHS")
print(paths)
print('')
if 'CUDA_PATH' in os.environ and isdir(os.environ['CUDA_PATH']):
paths = find_file_recursive(os.environ['CUDA_PATH'], '*cuda*')
print_header("CUDA PATHS")
print(paths)
print('')

if isdir(os.getcwd()):
paths = find_file_recursive(os.getcwd(), '*cuda*')
print_header("WORKING DIRECTORY CUDA PATHS")
print(paths)
print('')

print_header("LD_LIBRARY CUDA PATHS")
if 'LD_LIBRARY_PATH' in os.environ:
lib_path = os.environ['LD_LIBRARY_PATH'].strip()
for path in set(lib_path.split(os.pathsep)):
try:
if isdir(path):
print_header(f"{path} CUDA PATHS")
paths = find_file_recursive(path, '*cuda*')
print(paths)
except Exception as e:
print(f'Could not read LD_LIBRARY_PATH: {path} ({e})')
print('')
path_sources = [
("ANACONDA CUDA PATHS", os.environ.get("CONDA_PREFIX")),
("/usr/local CUDA PATHS", "/usr/local"),
("CUDA PATHS", os.environ.get("CUDA_PATH")),
("WORKING DIRECTORY CUDA PATHS", os.getcwd()),
]
try:
ld_library_path = os.environ.get("LD_LIBRARY_PATH")
if ld_library_path:
for path in set(ld_library_path.strip().split(os.pathsep)):
path_sources.append((f"LD_LIBRARY_PATH {path} CUDA PATHS", path))
except Exception as e:
print(f"Could not parse LD_LIBRARY_PATH: {e}")

for name, path in path_sources:
if path and os.path.isdir(path):
print_header(name)
print(list(find_dynamic_library(path, '*cuda*')))
print("")


def print_header(
Expand Down
83 changes: 43 additions & 40 deletions bitsandbytes/cuda_setup/main.py
Original file line number Diff line number Diff line change
Expand Up @@ -28,19 +28,17 @@

from .env_vars import get_potentially_lib_path_containing_env_vars

# OS-dependent constants: the CUDA runtime library names to probe for and the
# dynamic-library filename suffix used for the bitsandbytes binaries.
# (This span previously contained both the pre- and post-refactor variants of
# this logic plus an unused `backup_paths` list; consolidated to one version.)
if platform.system() == 'Windows':  # Windows
    CUDA_RUNTIME_LIBS = ["nvcuda.dll"]
    DYNAMIC_LIBRARY_SUFFIX = ".dll"
else:  # Linux or other
    # these are the most common libs names
    # libcudart.so is missing by default for a conda install with PyTorch 2.0 and instead
    # we have libcudart.so.11.0 which causes a lot of errors before
    # not sure if libcudart.so.12.0 exists in pytorch installs, but it does not hurt
    CUDA_RUNTIME_LIBS = ["libcudart.so", "libcudart.so.11.0", "libcudart.so.12.0", "libcudart.so.12.1", "libcudart.so.12.2"]
    DYNAMIC_LIBRARY_SUFFIX = ".so"

class CUDASetup:
_instance = None
Expand Down Expand Up @@ -108,22 +106,30 @@ def initialize(self):
self.error = False

def manual_override(self):
if torch.cuda.is_available():
if 'BNB_CUDA_VERSION' in os.environ:
if len(os.environ['BNB_CUDA_VERSION']) > 0:
warn(
f'\n\n{"=" * 80}\n'
'WARNING: Manual override via BNB_CUDA_VERSION env variable detected!\n'
'BNB_CUDA_VERSION=XXX can be used to load a bitsandbytes version that is different from the PyTorch CUDA version.\n'
'If this was unintended set the BNB_CUDA_VERSION variable to an empty string: export BNB_CUDA_VERSION=\n'
'If you use the manual override make sure the right libcudart.so is in your LD_LIBRARY_PATH\n'
'For example by adding the following to your .bashrc: export LD_LIBRARY_PATH=$LD_LIBRARY_PATH:<path_to_cuda_dir/lib64\n'
f'Loading CUDA version: BNB_CUDA_VERSION={os.environ["BNB_CUDA_VERSION"]}'
f'\n{"=" * 80}\n\n'
)
binary_name = self.binary_name.rsplit(".", 1)[0]
suffix = ".so" if os.name != "nt" else ".dll"
self.binary_name = binary_name[:-3] + f'{os.environ["BNB_CUDA_VERSION"]}.{suffix}'
if not torch.cuda.is_available():
return
override_value = os.environ.get('BNB_CUDA_VERSION')
if not override_value:
return

binary_name_stem, _, binary_name_ext = self.binary_name.rpartition(".")
# `binary_name_stem` will now be e.g. `/foo/bar/libbitsandbytes_cuda118`;
# let's remove any trailing numbers:
binary_name_stem = binary_name_stem.rstrip("0123456789")
# `binary_name_stem` will now be e.g. `/foo/bar/libbitsandbytes_cuda`;
# let's tack the new version number and the original extension back on.
self.binary_name = f"{binary_name_stem}{override_value}.{binary_name_ext}"

warn(
f'\n\n{"=" * 80}\n'
'WARNING: Manual override via BNB_CUDA_VERSION env variable detected!\n'
'BNB_CUDA_VERSION=XXX can be used to load a bitsandbytes version that is different from the PyTorch CUDA version.\n'
'If this was unintended set the BNB_CUDA_VERSION variable to an empty string: export BNB_CUDA_VERSION=\n'
'If you use the manual override make sure the right libcudart.so is in your LD_LIBRARY_PATH\n'
'For example by adding the following to your .bashrc: export LD_LIBRARY_PATH=$LD_LIBRARY_PATH:<path_to_cuda_dir/lib64\n'
f'Loading: {self.binary_name}'
f'\n{"=" * 80}\n\n'
)

def run_cuda_setup(self):
self.initialized = True
Expand All @@ -140,11 +146,10 @@ def run_cuda_setup(self):
package_dir = Path(__file__).parent.parent
binary_path = package_dir / self.binary_name

suffix = ".so" if os.name != "nt" else ".dll"
try:
if not binary_path.exists():
self.add_log_entry(f"CUDA SETUP: Required library version not found: {binary_name}. Maybe you need to compile it from source?")
legacy_binary_name = f"libbitsandbytes_cpu{suffix}"
legacy_binary_name = f"libbitsandbytes_cpu{DYNAMIC_LIBRARY_SUFFIX}"
self.add_log_entry(f"CUDA SETUP: Defaulting to {legacy_binary_name}...")
binary_path = package_dir / legacy_binary_name
if not binary_path.exists() or torch.cuda.is_available():
Expand Down Expand Up @@ -348,19 +353,18 @@ def get_compute_capabilities():

def evaluate_cuda_setup():
cuda_setup = CUDASetup.get_instance()
suffix = ".so" if os.name != "nt" else ".dll"
if 'BITSANDBYTES_NOWELCOME' not in os.environ or str(os.environ['BITSANDBYTES_NOWELCOME']) == '0':
cuda_setup.add_log_entry('')
cuda_setup.add_log_entry('='*35 + 'BUG REPORT' + '='*35)
cuda_setup.add_log_entry(('Welcome to bitsandbytes. For bug reports, please run\n\npython -m bitsandbytes\n\n'),
('and submit this information together with your error trace to: https://github.com/TimDettmers/bitsandbytes/issues'))
cuda_setup.add_log_entry('='*80)
if not torch.cuda.is_available(): return f'libbitsandbytes_cpu{suffix}', None, None, None

if not torch.cuda.is_available():
return f'libbitsandbytes_cpu{DYNAMIC_LIBRARY_SUFFIX}', None, None, None

cudart_path = determine_cuda_runtime_lib_path()
ccs = get_compute_capabilities()
ccs.sort()
cc = ccs[-1] # we take the highest capability
cc = get_compute_capabilities()[-1] # we take the highest capability
cuda_version_string = get_cuda_version()

cuda_setup.add_log_entry(f"CUDA SETUP: PyTorch settings found: CUDA_VERSION={cuda_version_string}, Highest Compute Capability: {cc}.")
Expand All @@ -380,12 +384,11 @@ def evaluate_cuda_setup():
# we use ls -l instead of nvcc to determine the cuda version
# since most installations will have the libcudart.so installed, but not the compiler

if has_cublaslt:
binary_name = f"libbitsandbytes_cuda{cuda_version_string}"
else:
"if not has_cublaslt (CC < 7.5), then we have to choose _nocublaslt"
binary_name = f"libbitsandbytes_cuda{cuda_version_string}_nocublaslt"
binary_name = f"libbitsandbytes_cuda{cuda_version_string}"
if not has_cublaslt:
# if not has_cublaslt (CC < 7.5), then we have to choose _nocublaslt
binary_name += "_nocublaslt"

binary_name = f"{binary_name}{suffix}"
binary_name = f"{binary_name}{DYNAMIC_LIBRARY_SUFFIX}"

return binary_name, cudart_path, cc, cuda_version_string

0 comments on commit 259ad44

Please sign in to comment.