From 269c88230ec258d6590c626014edfad023ff6a7e Mon Sep 17 00:00:00 2001
From: Alex Noble
Date: Sat, 31 Aug 2024 03:36:01 +0800
Subject: [PATCH 1/3] Fix training with a single GPU on a multi-GPU setup.

check_nvidia() spawns a new process for nvidia-smi, so the fact that the
visible GPU count may be limited by an OS environment variable is not
reflected in that process's output. Added a check for whether the visible
GPUs are limited via the OS environment: if multiple GPUs are still listed,
raise an error as before; if only one GPU is enabled, return output for
that GPU only.
---
 unsloth/tokenizer_utils.py | 8 ++++++++
 1 file changed, 8 insertions(+)

diff --git a/unsloth/tokenizer_utils.py b/unsloth/tokenizer_utils.py
index 044629ea..5192b0f7 100644
--- a/unsloth/tokenizer_utils.py
+++ b/unsloth/tokenizer_utils.py
@@ -1091,12 +1091,20 @@ def add_new_tokens(
 
 
 def check_nvidia():
+    index_for_cuda = -1
+    if "CUDA_VISIBLE_DEVICES" in os.environ:
+        index_for_cuda = os.environ["CUDA_VISIBLE_DEVICES"]
+        if "," in index_for_cuda:
+            raise RuntimeError("Unsloth currently does not support multi GPU setups - but we are working on it!")
+        index_for_cuda = int(index_for_cuda)
     # Unsloth doesn't work yet on AMD devices - we're working on it!
     output = np.array([0,])
     try:
         output = subprocess.check_output("nvidia-smi --query-gpu=memory.used --format=csv", shell = True)
         output = re.findall(rb'([\d]{1,})[\s]{1,}M', output)
         output = np.array([int(x.decode('utf-8'))/1024 for x in output])
+        if index_for_cuda != -1:
+            output = np.array([output[index_for_cuda],])
     except:
         if not torch.cuda.is_available():
             raise RuntimeError("Unsloth: We do not support AMD / Intel machines yet - it is a work in progress!")

From de8216c9d5ce477214f242c93a24b9f8256f2099 Mon Sep 17 00:00:00 2001
From: Alex Noble
Date: Sat, 31 Aug 2024 04:08:36 +0800
Subject: [PATCH 2/3] Add the fixed check to the trainer patcher.
---
 unsloth/tokenizer_utils.py | 9 +++++++++
 1 file changed, 9 insertions(+)

diff --git a/unsloth/tokenizer_utils.py b/unsloth/tokenizer_utils.py
index 5192b0f7..a9b163df 100644
--- a/unsloth/tokenizer_utils.py
+++ b/unsloth/tokenizer_utils.py
@@ -1168,11 +1168,20 @@ def patch_sft_trainer_tokenizer():
     "    )\n"\
     "pass\n"\
     "import subprocess, re, gc, numpy as np\n"\
+    "import os\n"\
+    "index_for_cuda = -1\n"\
+    "if \"CUDA_VISIBLE_DEVICES\" in os.environ:\n"\
+    "    index_for_cuda = os.environ[\"CUDA_VISIBLE_DEVICES\"]\n"\
+    "    if \",\" in index_for_cuda:\n"\
+    "        raise RuntimeError(\"Unsloth currently does not support multi GPU setups - but we are working on it!\")\n"\
+    "    index_for_cuda = int(index_for_cuda)\n"\
     "a = np.array([0,])\n"\
     "try:\n"\
     "    a = subprocess.check_output('nvidia-smi --query-gpu=memory.used --format=csv', shell = True)\n"\
     "    a = re.findall(rb'([\\d]{1,})[\\s]{1,}M', a)\n"\
     "    a = np.array([int(x.decode('utf-8'))/1024 for x in a])\n"\
+    "    if index_for_cuda != -1:\n"\
+    "        a = np.array([output[index_for_cuda],])\n"\
     "except:\n"\
     "    if not torch.cuda.is_available():\n"\
     "        raise RuntimeError('Unsloth: We do not support AMD / Intel machines yet - it is a work in progress!')\n"\

From 72cd790d92ccb970545a5c0229b4cabf0e200fb7 Mon Sep 17 00:00:00 2001
From: Alex Noble
Date: Sat, 31 Aug 2024 04:16:18 +0800
Subject: [PATCH 3/3] Update tokenizer_utils.py

Fixed a misnamed variable in the trainer patcher.
---
 unsloth/tokenizer_utils.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/unsloth/tokenizer_utils.py b/unsloth/tokenizer_utils.py
index a9b163df..f50110b0 100644
--- a/unsloth/tokenizer_utils.py
+++ b/unsloth/tokenizer_utils.py
@@ -1181,7 +1181,7 @@ def patch_sft_trainer_tokenizer():
     "    a = re.findall(rb'([\\d]{1,})[\\s]{1,}M', a)\n"\
     "    a = np.array([int(x.decode('utf-8'))/1024 for x in a])\n"\
     "    if index_for_cuda != -1:\n"\
-    "        a = np.array([output[index_for_cuda],])\n"\
+    "        a = np.array([a[index_for_cuda],])\n"\
     "except:\n"\
     "    if not torch.cuda.is_available():\n"\
     "        raise RuntimeError('Unsloth: We do not support AMD / Intel machines yet - it is a work in progress!')\n"\
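
For reference, the combined effect of the three patches can be written out as a
standalone function. This is an illustrative consolidation only, not part of the
series; the name check_nvidia_sketch is hypothetical, and it assumes nvidia-smi
is on PATH and that CUDA_VISIBLE_DEVICES, when set, holds a single integer GPU
index rather than a UUID.

    # Sketch of the behaviour introduced by PATCH 1/3 through 3/3
    # (illustrative only, not part of the series).
    import os
    import re
    import subprocess

    import numpy as np

    def check_nvidia_sketch():
        index_for_cuda = -1
        if "CUDA_VISIBLE_DEVICES" in os.environ:
            index_for_cuda = os.environ["CUDA_VISIBLE_DEVICES"]
            if "," in index_for_cuda:
                raise RuntimeError("Multi-GPU setups are not supported yet.")
            index_for_cuda = int(index_for_cuda)
        # nvidia-smi runs in a separate process, so it reports every physical
        # GPU even when CUDA_VISIBLE_DEVICES restricts this process to one.
        output = subprocess.check_output(
            "nvidia-smi --query-gpu=memory.used --format=csv", shell = True)
        output = re.findall(rb'([\d]{1,})[\s]{1,}M', output)
        output = np.array([int(x.decode('utf-8'))/1024 for x in output])
        if index_for_cuda != -1:
            # Keep only the entry for the single visible GPU.
            output = np.array([output[index_for_cuda],])
        return output  # memory used per visible GPU, in GiB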