diff --git a/figures/DDP.png b/figures/DDP.png new file mode 100644 index 0000000..c9c2a4c Binary files /dev/null and b/figures/DDP.png differ diff --git a/tutorials/parallel_computing.ipynb b/tutorials/parallel_computing.ipynb index 73d9f40..f1eed04 100644 --- a/tutorials/parallel_computing.ipynb +++ b/tutorials/parallel_computing.ipynb @@ -9,7 +9,159 @@ "author: Jing Zhang \n", "e-mail: zhangjingnm@hotmail.com \n", "date: 2024-09 \n", - "reference: http://www.idris.fr/eng/jean-zay/gpu/jean-zay-gpu-torch-multi-eng.html" + "reference: http://www.idris.fr/eng/jean-zay/gpu/jean-zay-gpu-torch-multi-eng.html \n", + "https://keras.io/guides/distributed_training_with_torch/" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "### Single GPU" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "implemention" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "import torch\n", + "import torch.nn as nn\n", + "import torch.optim as optim\n", + "from torchvision import datasets, transforms\n", + "from torch.utils.data import DataLoader\n", + "import timm\n", + "import os\n", + "import socket\n", + "\n", + "hostname = socket.gethostname()\n", + "num_gpus = torch.cuda.device_count()\n", + "num_epochs = 2\n", + "batch_size = 64\n", + "model_path = './best.pth'\n", + "\n", + "for i in range(num_gpus): print(f\"GPU {i}: {torch.cuda.get_device_name(i)}\")\n", + "print(f'{num_gpus} GPU in total on the node [{hostname}]\\n')\n", + "\n", + "# data augmentation and preprocessing\n", + "transform = transforms.Compose([\n", + " transforms.RandomHorizontalFlip(),\n", + " transforms.RandomCrop(32, padding=4),\n", + " transforms.ToTensor(),\n", + " transforms.Normalize(mean=[0.4914, 0.4822, 0.4465], std=[0.247, 0.243, 0.261])\n", + "])\n", + "transform_val = transforms.Compose([\n", + " transforms.ToTensor(),\n", + " transforms.Normalize(mean=[0.4914, 0.4822, 0.4465], std=[0.247, 0.243, 0.261])\n", + "])\n", + "\n", + "# load CIFAR-10 dataset\n", + "train_dataset = datasets.CIFAR10(root='./data', train=True, download=False, transform=transform)\n", + "valid_dataset = datasets.CIFAR10(root='./data', train=False, download=False, transform=transform_val)\n", + "\n", + "train_loader = DataLoader(train_dataset, batch_size=batch_size, shuffle=True)\n", + "valid_loader = DataLoader(valid_dataset, batch_size=batch_size, shuffle=False)\n", + "\n", + "def train_model(model, train_loader, valid_loader, num_epochs, optimizer, criterion):\n", + " best_acc = 0.0\n", + " for epoch in range(num_epochs):\n", + " model.train()\n", + " running_loss = 0.0\n", + " running_loss_count = 0\n", + " correct = 0\n", + " total = 0\n", + " \n", + " # train\n", + " for inputs, labels in train_loader:\n", + " inputs, labels = inputs.cuda(), labels.cuda()\n", + " # forward pass\n", + " outputs = model(inputs)\n", + " loss = criterion(outputs, labels)\n", + "\n", + " # backward propagation and optimization\n", + " optimizer.zero_grad()\n", + " loss.backward()\n", + " optimizer.step()\n", + "\n", + " running_loss += loss.item()\n", + " running_loss_count += 1\n", + " _, predicted = outputs.max(1)\n", + " total += labels.size(0)\n", + " correct += predicted.eq(labels).sum().item()\n", + "\n", + " epoch_loss = running_loss / running_loss_count\n", + " epoch_acc = 100. 
* correct / total\n",
+ "\n",
+ "        # valid\n",
+ "        model.eval()\n",
+ "        correct_val = 0\n",
+ "        total_val = 0\n",
+ "        running_loss_val = 0.0\n",
+ "        running_loss_count_val = 0\n",
+ "        with torch.no_grad():\n",
+ "            for inputs, labels in valid_loader:\n",
+ "                inputs, labels = inputs.cuda(), labels.cuda()\n",
+ "                outputs = model(inputs)\n",
+ "                loss = criterion(outputs, labels)\n",
+ "\n",
+ "                running_loss_val += loss.item()\n",
+ "                running_loss_count_val += 1\n",
+ "                _, predicted = outputs.max(1)\n",
+ "                total_val += labels.size(0)\n",
+ "                correct_val += predicted.eq(labels).sum().item()\n",
+ "\n",
+ "        epoch_loss_val = running_loss_val / running_loss_count_val\n",
+ "        epoch_acc_val = 100. * correct_val / total_val\n",
+ "        print(f'Epoch {epoch+1}/{num_epochs}: Train Loss: {epoch_loss:.4f}, Train Acc: {epoch_acc:.2f}, Valid Loss: {epoch_loss_val:.4f}, Valid Acc: {epoch_acc_val:.2f}')\n",
+ "        \n",
+ "        # save model\n",
+ "        if epoch_acc_val > best_acc:\n",
+ "            print(f'Saving best model with validation accuracy: {epoch_acc_val:.2f}%')\n",
+ "            torch.save(model.state_dict(), model_path)\n",
+ "            best_acc = epoch_acc_val\n",
+ "\n",
+ "if __name__ == \"__main__\":\n",
+ "    \n",
+ "    model = timm.create_model('resnet18', pretrained=True, num_classes=10)\n",
+ "    model = model.cuda()\n",
+ "    optimizer = optim.AdamW(model.parameters(), lr=1e-3)\n",
+ "    criterion = nn.CrossEntropyLoss()\n",
+ "\n",
+ "    train_model(model, train_loader, valid_loader, num_epochs, optimizer, criterion)"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "metadata": {},
+ "source": [
+ "output\n",
+ "```\n",
+ "GPU 0: NVIDIA RTX A5000\n",
+ "1 GPU in total on the node [cn39]\n",
+ "\n",
+ "Epoch 1/2: Train Loss: 1.1341, Train Acc: 59.92, Valid Loss: 0.7287, Valid Acc: 75.31\n",
+ "Saving best model with validation accuracy: 75.31%\n",
+ "Epoch 2/2: Train Loss: 0.7425, Train Acc: 74.28, Valid Loss: 0.6481, Valid Acc: 77.84\n",
+ "Saving best model with validation accuracy: 77.84%\n",
+ " \n",
+ " \n",
+ "This program takes 0d:0h:1m:20s\n",
+ "```"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "metadata": {},
+ "source": [
+ "## Multi-GPU on one node with SLURM"
 ]
 },
 {
@@ -19,6 +171,275 @@
 "### Data parallelism"
 ]
 },
+ {
+ "cell_type": "markdown",
+ "metadata": {},
+ "source": [
+ "implementation 0: Data Parallelism \n",
+ "easy to use but not recommended: DataParallel keeps a single process and replicates the model to every GPU on each forward pass, so DistributedDataParallel (implementation 1) is usually faster. \n",
+ "reference: https://pytorch.org/docs/stable/generated/torch.nn.DataParallel.html \n",
+ "https://torchhogehoge.hatenablog.com/entry/pytorch_DDP"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "# Multi GPU (DP)\n",
+ "import torch\n",
+ "import torchvision\n",
+ "import torch.nn as nn\n",
+ "\n",
+ "class NN(nn.Module):\n",
+ "    def __init__(self):\n",
+ "        super().__init__()\n",
+ "        self.n = nn.Sequential(\n",
+ "            nn.Flatten(),\n",
+ "            nn.Linear(28*28, 128),\n",
+ "            nn.ReLU(),\n",
+ "            nn.Linear(128, 10)\n",
+ "        )\n",
+ "\n",
+ "    def forward(self, x):\n",
+ "        return self.n(x)\n",
+ "data = torchvision.datasets.MNIST(root=\"data\", train=True, download=True, transform=torchvision.transforms.ToTensor())\n",
+ "data_loader = torch.utils.data.DataLoader(data, batch_size=64*torch.cuda.device_count(), shuffle=True, num_workers=2, pin_memory=True)\n",
+ "\n",
+ "model = NN().cuda()\n",
+ "model = torch.nn.DataParallel(model)\n",
+ "criterion = nn.CrossEntropyLoss()\n",
+ "optimizer = torch.optim.SGD(model.parameters(), lr=1e-3)\n",
+ "\n",
+ "for epoch in range(2):\n",
+ "    total_loss = 0\n",
+ "    for imgs, labels in data_loader:\n",
+ "        predict = model(imgs.cuda())\n",
+ "        loss = criterion(predict, labels.cuda())\n",
+ "        total_loss += loss.item()\n",
+ "        optimizer.zero_grad()\n",
+ "        loss.backward()\n",
+ "        optimizer.step()\n",
+ "\n",
+ "    print(f\"{epoch:3d}: {total_loss:.4f}\")\n",
+ "torch.save(model.module.state_dict(), 'model.pth')"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "metadata": {},
+ "source": [
+ "implementation 1: Distributed Data Parallelism \n",
+ "reference: https://pytorch.org/docs/stable/notes/ddp.html#ddp \n",
+ "https://www.youtube.com/watch?v=bwNtfxEDjGA \n",
+ "![DDP](../figures/DDP.png)"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "import torch\n",
+ "import torch.nn as nn\n",
+ "import torch.optim as optim\n",
+ "from torchvision import datasets, transforms\n",
+ "from torch.utils.data import DataLoader\n",
+ "import torch.utils.data.distributed as data_dist\n",
+ "import torch.distributed as dist\n",
+ "from torch.nn.parallel import DistributedDataParallel as DDP\n",
+ "import hostlist\n",
+ "import timm\n",
+ "import os\n",
+ "import socket\n",
+ "\n",
+ "hostname = socket.gethostname()\n",
+ "num_gpus = torch.cuda.device_count()\n",
+ "num_epochs = 2\n",
+ "batch_size = 64\n",
+ "model_path = './best.pth'\n",
+ "\n",
+ "for i in range(num_gpus): print(f\"GPU {i}: {torch.cuda.get_device_name(i)}\")\n",
+ "print(f'{num_gpus} GPU in total on the node [{hostname}]\\n')\n",
+ "\n",
+ "def setup_device(current_gpu_index, num_gpus):\n",
+ "    # Device setup\n",
+ "    os.environ['MASTER_ADDR'] = str(hostname) # or localhost\n",
+ "    os.environ['MASTER_PORT'] = str(12345 + int(num_gpus)) # to avoid port conflict on the same node\n",
+ "    gpu = torch.device(\"cuda:{}\".format(current_gpu_index))\n",
+ "    dist.init_process_group(backend=\"nccl\", world_size=num_gpus, rank=current_gpu_index)\n",
+ "    torch.cuda.set_device(gpu)\n",
+ "\n",
+ "def cleanup():\n",
+ "    dist.destroy_process_group()\n",
+ "\n",
+ "def prepare_dataloader(dataset, current_gpu_index, num_gpus, batch_size):\n",
+ "    sampler = data_dist.DistributedSampler(dataset, num_replicas=num_gpus, rank=current_gpu_index, shuffle=False)\n",
+ "    dataloader = DataLoader(dataset, sampler=sampler, batch_size=batch_size, shuffle=False)\n",
+ "    return dataloader\n",
+ "\n",
+ "def 
per_device_launch_fn(current_gpu_index, num_gpu):\n", + " # Setup the process groups\n", + " setup_device(current_gpu_index, num_gpu) \n", + " print(f'current_gpu_index: {current_gpu_index}')\n", + " \n", + " # data augmentation and preprocessing\n", + " transform = transforms.Compose([\n", + " transforms.RandomHorizontalFlip(), \n", + " transforms.RandomCrop(32, padding=4), \n", + " transforms.ToTensor(), \n", + " transforms.Normalize(mean=[0.4914, 0.4822, 0.4465], std=[0.247, 0.243, 0.261]) \n", + " ])\n", + " transform_val = transforms.Compose([\n", + " transforms.ToTensor(), \n", + " transforms.Normalize(mean=[0.4914, 0.4822, 0.4465], std=[0.247, 0.243, 0.261]) \n", + " ])\n", + " # load CIFAR-10 dataset\n", + " train_dataset = datasets.CIFAR10(root='./data', train=True, download=False, transform=transform)\n", + " valid_dataset = datasets.CIFAR10(root='./data', train=False, download=False, transform=transform_val)\n", + "\n", + " # prepare the dataloader\n", + " train_loader = prepare_dataloader(train_dataset, current_gpu_index, num_gpu, batch_size)\n", + " valid_loader = prepare_dataloader(valid_dataset, current_gpu_index, num_gpu, batch_size)\n", + "\n", + " # Put model on device\n", + " model = timm.create_model('resnet18', pretrained=True, num_classes=10, in_chans=3)\n", + " optimizer = torch.optim.AdamW(model.parameters(), lr=1e-3)\n", + " loss_fn = nn.CrossEntropyLoss()\n", + " \n", + " # Put model on device\n", + " model = model.to(current_gpu_index)\n", + " ddp_model = DDP(model, device_ids=[current_gpu_index], output_device=current_gpu_index) \n", + " \n", + " train_model(ddp_model, train_loader, valid_loader, num_epochs, optimizer, loss_fn)\n", + " cleanup()\n", + "\n", + "def train_model(model, train_loader, valid_loader, num_epochs, optimizer, criterion):\n", + " best_acc = 0.0\n", + " for epoch in range(num_epochs):\n", + " model.train()\n", + " running_loss = 0.0\n", + " running_loss_count = 0\n", + " correct = 0\n", + " total = 0\n", + " # train\n", + " for inputs, labels in train_loader:\n", + " inputs, labels = inputs.cuda(non_blocking=True), labels.cuda(non_blocking=True)\n", + "\n", + " # forward pass\n", + " outputs = model(inputs)\n", + " loss = criterion(outputs, labels)\n", + "\n", + " # backword propagation and optmization\n", + " optimizer.zero_grad()\n", + " loss.backward()\n", + " optimizer.step()\n", + "\n", + " # calculate loss and accuracy\n", + " running_loss += loss.item() # average loss of batch_size\n", + " running_loss_count += 1\n", + " _, predicted = outputs.max(1) # return max value and its index, outputs:(batch_size, num_classes)\n", + " total += labels.size(0) # (batch_size,)\n", + " correct += predicted.eq(labels).sum().item() # .item() Convert a tensor to a scalar\n", + "\n", + " epoch_loss = running_loss / running_loss_count\n", + " epoch_acc = 100. 
* correct / total\n", + " # valid\n", + " model.eval()\n", + " running_loss_val = 0.0\n", + " running_loss_count_val = 0\n", + " correct_val = 0\n", + " total_val = 0\n", + " with torch.no_grad():\n", + " for inputs, labels in valid_loader:\n", + " inputs, labels = inputs.cuda(non_blocking=True), labels.cuda(non_blocking=True)\n", + "\n", + " outputs = model(inputs)\n", + " loss = criterion(outputs, labels)\n", + "\n", + " running_loss_val += loss.item()\n", + " running_loss_count_val += 1\n", + " _, predicted = outputs.max(1)\n", + " total_val += labels.size(0)\n", + " correct_val += predicted.eq(labels).sum().item()\n", + "\n", + " epoch_loss_val = running_loss_val / running_loss_count_val\n", + " epoch_acc_val = 100. * correct_val / total_val\n", + " print(f'Epoch {epoch+1}/{num_epochs}: Train Loss: {epoch_loss:.4f}, Train Acc: {epoch_acc:.2f}, Valid Loss: {epoch_loss_val:.4f}, Valid Acc: {epoch_acc_val:.2f}')\n", + " # save model at rank=0\n", + " if dist.get_rank() == 0:\n", + " if epoch_acc_val > best_acc:\n", + " print(f'Saving best model with validation accuracy: {epoch_acc_val:.2f}%')\n", + " torch.save(model.state_dict(), model_path)\n", + " best_acc = epoch_acc_val\n", + "\n", + "if __name__ == \"__main__\":\n", + " torch.multiprocessing.start_processes(\n", + " per_device_launch_fn,\n", + " args=(num_gpus,),\n", + " nprocs=num_gpus,\n", + " join=True,\n", + " start_method=\"spawn\",\n", + " )" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "output\n", + "```\n", + "GPU 0: NVIDIA RTX A5000\n", + "GPU 1: NVIDIA RTX A5000\n", + "GPU 2: NVIDIA RTX A5000\n", + "GPU 3: NVIDIA RTX A5000\n", + "4 GPU in total on the node [cn39]\n", + "\n", + "GPU 0: NVIDIA RTX A5000\n", + "GPU 1: NVIDIA RTX A5000\n", + "GPU 2: NVIDIA RTX A5000\n", + "GPU 3: NVIDIA RTX A5000\n", + "4 GPU in total on the node [cn39]\n", + "\n", + "current_gpu_index: 2\n", + "Epoch 1/2: Train Loss: 1.3159, Train Acc: 54.38, Valid Loss: 0.8592, Valid Acc: 69.44\n", + "Epoch 2/2: Train Loss: 0.8251, Train Acc: 71.74, Valid Loss: 0.6585, Valid Acc: 76.80\n", + "GPU 0: NVIDIA RTX A5000\n", + "GPU 1: NVIDIA RTX A5000\n", + "GPU 2: NVIDIA RTX A5000\n", + "GPU 3: NVIDIA RTX A5000\n", + "4 GPU in total on the node [cn39]\n", + "\n", + "current_gpu_index: 1\n", + "Epoch 1/2: Train Loss: 1.3193, Train Acc: 54.21, Valid Loss: 0.8807, Valid Acc: 69.68\n", + "Epoch 2/2: Train Loss: 0.8282, Train Acc: 71.26, Valid Loss: 0.6728, Valid Acc: 76.80\n", + "GPU 0: NVIDIA RTX A5000\n", + "GPU 1: NVIDIA RTX A5000\n", + "GPU 2: NVIDIA RTX A5000\n", + "GPU 3: NVIDIA RTX A5000\n", + "4 GPU in total on the node [cn39]\n", + "\n", + "current_gpu_index: 0\n", + "Epoch 1/2: Train Loss: 1.3200, Train Acc: 53.60, Valid Loss: 0.8733, Valid Acc: 69.96\n", + "Saving best model with validation accuracy: 69.96%\n", + "Epoch 2/2: Train Loss: 0.8254, Train Acc: 71.49, Valid Loss: 0.6761, Valid Acc: 77.56\n", + "Saving best model with validation accuracy: 77.56%\n", + "GPU 0: NVIDIA RTX A5000\n", + "GPU 1: NVIDIA RTX A5000\n", + "GPU 2: NVIDIA RTX A5000\n", + "GPU 3: NVIDIA RTX A5000\n", + "4 GPU in total on the node [cn39]\n", + "\n", + "current_gpu_index: 3\n", + "Epoch 1/2: Train Loss: 1.3211, Train Acc: 54.14, Valid Loss: 0.8391, Valid Acc: 70.36\n", + "Epoch 2/2: Train Loss: 0.8263, Train Acc: 71.31, Valid Loss: 0.6591, Valid Acc: 77.00\n", + " \n", + " \n", + "This program takes 0d:0h:0m:51s\n", + "```" + ] + }, { "cell_type": "markdown", "metadata": {}, @@ -30,140 +451,297 @@ "cell_type": "markdown", "metadata": {}, "source": 
[ - "### Multi-process configuration with SLURM" + "implemention" ] }, { "cell_type": "code", - "execution_count": 4, + "execution_count": null, "metadata": {}, - "outputs": [ - { - "name": "stdout", - "output_type": "stream", - "text": [ - "Files already downloaded and verified\n", - "Files already downloaded and verified\n", - "Epoch 1/10: Train Loss: 1.1410, Train Acc: 60.10, Test Loss: 0.8326, Test Acc: 71.25%\n", - "Epoch 2/10: Train Loss: 0.7750, Train Acc: 73.15, Test Loss: 0.7030, Test Acc: 75.96%\n", - "Epoch 3/10: Train Loss: 0.6593, Train Acc: 76.96, Test Loss: 0.6416, Test Acc: 77.75%\n", - "Epoch 4/10: Train Loss: 0.5870, Train Acc: 79.59, Test Loss: 0.5958, Test Acc: 79.26%\n", - "Epoch 5/10: Train Loss: 0.5420, Train Acc: 81.16, Test Loss: 0.5572, Test Acc: 80.90%\n", - "Epoch 6/10: Train Loss: 0.4965, Train Acc: 82.73, Test Loss: 0.5543, Test Acc: 81.10%\n", - "Epoch 7/10: Train Loss: 0.4654, Train Acc: 83.87, Test Loss: 0.5373, Test Acc: 81.57%\n", - "Epoch 8/10: Train Loss: 0.4363, Train Acc: 84.82, Test Loss: 0.5188, Test Acc: 82.19%\n", - "Epoch 9/10: Train Loss: 0.4177, Train Acc: 85.38, Test Loss: 0.5282, Test Acc: 81.94%\n", - "Epoch 10/10: Train Loss: 0.3896, Train Acc: 86.30, Test Loss: 0.5135, Test Acc: 82.77%\n" - ] - } - ], + "outputs": [], "source": [ "import torch\n", "import torch.nn as nn\n", "import torch.optim as optim\n", - "from torchvision import datasets, transforms, models\n", + "from torchvision import datasets, transforms\n", "from torch.utils.data import DataLoader\n", + "import timm\n", + "import os\n", + "import socket\n", "\n", - "# data augmentation and preprocessing\n", - "transform = transforms.Compose([\n", - " transforms.RandomHorizontalFlip(), \n", - " transforms.RandomCrop(32, padding=4), \n", - " transforms.ToTensor(), \n", - " transforms.Normalize(mean=[0.4914, 0.4822, 0.4465], std=[0.247, 0.243, 0.261]) \n", - "])\n", + "hostname = socket.gethostname()\n", + "num_gpus = torch.cuda.device_count()\n", + "num_epochs = 2\n", + "batch_size = 64\n", + "model_path = './best.pth'\n", "\n", - "# load CIFAR-10 dataset\n", - "train_dataset = datasets.CIFAR10(root='./data', train=True, download=True, transform=transform)\n", - "test_dataset = datasets.CIFAR10(root='./data', train=False, download=True, transform=transform)\n", + "for i in range(num_gpus): print(f\"GPU {i}: {torch.cuda.get_device_name(i)}\")\n", + "print(f'{num_gpus} GPU in total on the node [{hostname}]\\n')\n", "\n", - "train_loader = DataLoader(train_dataset, batch_size=64, shuffle=True, num_workers=2)\n", - "test_loader = DataLoader(test_dataset, batch_size=64, shuffle=False, num_workers=2)\n", + "model = timm.create_model('resnet18', pretrained=True, num_classes=10)\n", "\n", - "# load pretrained ResNet18 model\n", - "model = models.resnet18(weights='ResNet18_Weights.DEFAULT')\n", + "# split model mannualy into different parts (GPU)\n", + "class SplitResNet18(nn.Module):\n", + " def __init__(self):\n", + " super(SplitResNet18, self).__init__()\n", + " # GPU 0\n", + " self.conv1_to_maxpool = nn.Sequential(model.conv1, model.bn1, model.act1, model.maxpool).to('cuda:0')\n", + " # GPU 1 \n", + " self.layer1_to_layer2 = nn.Sequential(model.layer1, model.layer2).to('cuda:1')\n", + " # GPU 2\n", + " self.layer3_to_layer4 = nn.Sequential(model.layer3, model.layer4).to('cuda:2')\n", + " # GPU 3\n", + " self.global_pool_to_fc = nn.Sequential(model.global_pool, model.fc).to('cuda:3')\n", "\n", - "# modify last fully connected layer, to adapt for CIFAR-10\n", - "num_ftrs = 
model.fc.in_features\n", - "model.fc = nn.Linear(num_ftrs, 10) # 10 classes in CIFAR-10\n", + " def forward(self, x):\n", + " # input to GPU 0 \n", + " x = x.to('cuda:0')\n", + " x = self.conv1_to_maxpool(x)\n", + " x = x.to('cuda:1')\n", + " x = self.layer1_to_layer2(x)\n", + " x = x.to('cuda:2')\n", + " x = self.layer3_to_layer4(x)\n", + " x = x.to('cuda:3')\n", + " x = self.global_pool_to_fc(x)\n", + " \n", + " return x\n", "\n", - "# move model to GPU\n", - "device = torch.device(\"cuda\" if torch.cuda.is_available() else \"cpu\")\n", - "model = model.to(device)\n", + "# data augmentation and preprocessing\n", + "transform = transforms.Compose([\n", + " transforms.RandomHorizontalFlip(),\n", + " transforms.RandomCrop(32, padding=4),\n", + " transforms.ToTensor(),\n", + " transforms.Normalize(mean=[0.4914, 0.4822, 0.4465], std=[0.247, 0.243, 0.261])\n", + "])\n", + "transform_val = transforms.Compose([\n", + " transforms.ToTensor(),\n", + " transforms.Normalize(mean=[0.4914, 0.4822, 0.4465], std=[0.247, 0.243, 0.261])\n", + "])\n", "\n", - "criterion = nn.CrossEntropyLoss()\n", - "optimizer = optim.AdamW(model.parameters(), lr=0.0001)\n", - "\n", - "def train(model, train_loader, criterion, optimizer, device):\n", - " model.train()\n", - " running_loss = 0.0\n", - " correct = 0\n", - " total = 0\n", - "\n", - " for inputs, labels in train_loader:\n", - " inputs, labels = inputs.to(device), labels.to(device)\n", - "\n", - " # forward pass\n", - " outputs = model(inputs)\n", - " loss = criterion(outputs, labels)\n", - "\n", - " # backword propagation and optmization\n", - " optimizer.zero_grad()\n", - " loss.backward()\n", - " optimizer.step()\n", - "\n", - " # calculate loss and accuracy\n", - " running_loss += loss.item() * inputs.size(0)\n", - " _, predicted = outputs.max(1) # get predicted class\n", - " total += labels.size(0)\n", - " correct += predicted.eq(labels).sum().item()\n", - "\n", - " epoch_loss = running_loss / len(train_loader.dataset)\n", - " epoch_acc = 100. 
* correct / total\n", - " return epoch_loss, epoch_acc\n", - "\n", - "def evaluate(model, test_loader, criterion, device):\n", - " model.eval()\n", - " running_loss = 0.0\n", - " correct = 0\n", - " total = 0\n", - "\n", - " with torch.no_grad():\n", - " for inputs, labels in test_loader:\n", - " inputs, labels = inputs.to(device), labels.to(device)\n", + "# load CIFAR-10 dataset\n", + "train_dataset = datasets.CIFAR10(root='./data', train=True, download=False, transform=transform)\n", + "valid_dataset = datasets.CIFAR10(root='./data', train=False, download=False, transform=transform_val)\n", "\n", + "train_loader = DataLoader(train_dataset, batch_size=batch_size, shuffle=True)\n", + "valid_loader = DataLoader(valid_dataset, batch_size=batch_size, shuffle=False)\n", + "\n", + "def train_model(model, train_loader, valid_loader, num_epochs, optimizer, criterion):\n", + " best_acc = 0.0\n", + " for epoch in range(num_epochs):\n", + " model.train()\n", + " running_loss = 0.0\n", + " running_loss_count = 0\n", + " correct = 0\n", + " total = 0\n", + " \n", + " device = next(model.parameters()).device\n", + " print(f'current device: {device}')\n", + " # train\n", + " for inputs, labels in train_loader:\n", + " inputs, labels = inputs.to(device, non_blocking=True), labels.to(device, non_blocking=True)\n", + " # forward pass\n", " outputs = model(inputs)\n", + " outputs = outputs.to(device)\n", " loss = criterion(outputs, labels)\n", "\n", - " running_loss += loss.item() * inputs.size(0)\n", + " # backward propagation and optimization\n", + " optimizer.zero_grad()\n", + " loss.backward()\n", + " optimizer.step()\n", + "\n", + " running_loss += loss.item()\n", + " running_loss_count += 1\n", " _, predicted = outputs.max(1)\n", " total += labels.size(0)\n", " correct += predicted.eq(labels).sum().item()\n", "\n", - " epoch_loss = running_loss / len(test_loader.dataset)\n", - " epoch_acc = 100. * correct / total\n", - " return epoch_loss, epoch_acc\n", + " epoch_loss = running_loss / running_loss_count\n", + " epoch_acc = 100. * correct / total\n", + "\n", + " # valid\n", + " model.eval()\n", + " correct_val = 0\n", + " total_val = 0\n", + " running_loss_val = 0.0\n", + " running_loss_count_val = 0\n", + " with torch.no_grad():\n", + " for inputs, labels in valid_loader:\n", + " inputs, labels = inputs.to(device, non_blocking=True), labels.to(device, non_blocking=True)\n", + " outputs = model(inputs)\n", + " outputs = outputs.to(device)\n", + " loss = criterion(outputs, labels)\n", "\n", - "num_epochs = 10\n", + " running_loss_val += loss.item()\n", + " running_loss_count_val += 1\n", + " _, predicted = outputs.max(1)\n", + " total_val += labels.size(0)\n", + " correct_val += predicted.eq(labels).sum().item()\n", "\n", - "for epoch in range(num_epochs):\n", - " train_loss, train_acc = train(model, train_loader, criterion, optimizer, device)\n", - " test_loss, test_acc = evaluate(model, test_loader, criterion, device)\n", + " epoch_loss_val = running_loss_val / running_loss_count_val\n", + " epoch_acc_val = 100. 
* correct_val / total_val\n", + " print(f'Epoch {epoch+1}/{num_epochs}: Train Loss: {epoch_loss:.4f}, Train Acc: {epoch_acc:.2f}, Valid Loss: {epoch_loss_val:.4f}, Valid Acc: {epoch_acc_val:.2f}')\n", + " \n", + " # save model\n", + " if epoch_acc_val > best_acc:\n", + " print(f'Saving best model with validation accuracy: {epoch_acc_val:.2f}%')\n", + " torch.save(model.state_dict(), model_path)\n", + " best_acc = epoch_acc_val\n", "\n", - " print(f'Epoch {epoch+1}/{num_epochs}: Train Loss: {train_loss:.4f}, Train Acc: {train_acc:.2f}, Test Loss: {test_loss:.4f}, Test Acc: {test_acc:.2f}%')" + "if __name__ == \"__main__\":\n", + " \n", + " model = SplitResNet18()\n", + " optimizer = optim.AdamW(model.parameters(), lr=1e-3)\n", + " criterion = nn.CrossEntropyLoss()\n", + "\n", + " train_model(model, train_loader, valid_loader, num_epochs, optimizer, criterion)" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "output\n", + "```\n", + "GPU 0: NVIDIA RTX A5000\n", + "GPU 1: NVIDIA RTX A5000\n", + "GPU 2: NVIDIA RTX A5000\n", + "GPU 3: NVIDIA RTX A5000\n", + "4 GPU in total on the node [cn39]\n", + "\n", + "current device: cuda:0\n", + "Epoch 1/2: Train Loss: 1.1441, Train Acc: 60.03, Valid Loss: 0.7721, Valid Acc: 73.44\n", + "Saving best model with validation accuracy: 73.44%\n", + "current device: cuda:0\n", + "Epoch 2/2: Train Loss: 0.7518, Train Acc: 74.11, Valid Loss: 0.6648, Valid Acc: 76.91\n", + "Saving best model with validation accuracy: 76.91%\n", + " \n", + " \n", + "This program takes 0d:0h:1m:20s\n", + "```" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "### SLURM information\n", + "\n", + "```\n", + "$ sinfo\n", + "PARTITION AVAIL TIMELIMIT NODES STATE NODELIST\n", + "GPU11Go up 2-00:00:00 13 mix cn[01-07,09-11,15,17,41]\n", + "GPU11Go up 2-00:00:00 1 idle cn16\n", + "GPU24Go up 1-00:00:00 9 idle cn[18,22,24-25,35,38-40,53]\n", + "GPU48Go up 1-00:00:00 1 down* cn60\n", + "GPU48Go up 1-00:00:00 3 mix cn[37,48,57]\n", + "GPU48Go up 1-00:00:00 3 idle cn[20,42,52]\n", + "GPU96Go up 1-00:00:00 3 mix cn[51,58-59]\n", + "GPU96Go up 1-00:00:00 1 alloc cn50\n", + "Tests-GPU24Go up 5:00 1 down* cn54\n", + "WS-CPU1* up 4-00:00:00 3 down* cn[32-33,46]\n", + "WS-CPU1* up 4-00:00:00 7 mix cn[08,12-14,19,26,34]\n", + "WS-CPU1* up 4-00:00:00 6 alloc cn[21,27-29,31,36]\n", + "WS-CPU2 up 4-00:00:00 12 mix cn[01-07,09-11,15,17]\n", + "Serveurs-CPU up 10-00:00:0 8 mix cn[30,43-45,47,49,55-56]\n", + "```\n", + "\n", + "```\n", + "#!/bin/bash \n", + "#SBATCH --partition=GPU24Go # partition name\n", + "##SBATCH --nodes=2 # total number of nodes\n", + "#SBATCH --nodelist=cn39 # Or specify a specific node check via sinfo\n", + "#SBATCH --ntasks-per-node=1 # number of tasks per node, if >1 will repeat task\n", + "#SBATCH --gres=gpu:4 # number of GPUs reserved per node\n", + "#SBATCH --cpus-per-task=4 # number of CPUs per task\n", + "##SBATCH --mem-per-cpu=1500MB # memory of per CPU\n", + "#SBATCH -J multigpu # job name\n", + "#SBATCH --out=%J/result.txt # log file\n", + "#SBATCH --error=%J/error.txt # log file\n", + "##SBATCH --array=0-1 # array 2 jobs\n", + "#SBATCH --array=1 # array 1 job\n", + "```" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "SLURM in python" ] }, { "cell_type": "code", - "execution_count": 14, + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "hostname = socket.gethostname()\n", + "num_gpus = torch.cuda.device_count()\n", + "print(f'{num_gpus} GPU in total on this node 
{hostname}')\n", + "num_epochs = 2\n", + "batch_size = 64\n", + "\n", + "for i in range(num_gpus):\n", + " print(f\"GPU {i}: {torch.cuda.get_device_name(i)}\")\n", + "\n", + "# get SLURM variables\n", + "global_rank = int(os.environ['SLURM_PROCID']) # The global ranking of the current process among all processes\n", + "local_rank = int(os.environ['SLURM_LOCALID']) # The local rank of the current process on its node\n", + "number_tasks = int(os.environ['SLURM_NTASKS']) # The total number of tasks in the job\n", + "cpus_per_task = int(os.environ['SLURM_CPUS_PER_TASK']) # The number of CPUs allocated to each task\n", + "nodelist = os.environ['SLURM_JOB_NODELIST'] # the list of nodes\n", + "nodelist = hostlist.expand_hostlist(os.environ['SLURM_JOB_NODELIST']) # expand nodelist\n", + "gpu_ids = os.environ['SLURM_STEP_GPUS'].split(\",\") # The list of GPU IDs allocated to the job\n", + "\n", + "print(f' global_rank: {global_rank}')\n", + "print(f' local_rank: {local_rank}')\n", + "print(f' number_tasks: {number_tasks}')\n", + "print(f'cpus_per_task: {cpus_per_task}')\n", + "print(f' nodelist: {nodelist}')\n", + "print(f' gpu_ids: {gpu_ids}')" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "output\n", + "```\n", + "4 GPU in total on this node cn39\n", + "GPU 0: NVIDIA RTX A5000\n", + "GPU 1: NVIDIA RTX A5000\n", + "GPU 2: NVIDIA RTX A5000\n", + "GPU 3: NVIDIA RTX A5000\n", + " global_rank: 0\n", + " local_rank: 0\n", + " number_tasks: 1\n", + "cpus_per_task: 4\n", + " nodelist: ['cn39']\n", + " gpu_ids: ['0', '1', '2', '3']\n", + "```" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "### GPU information" + ] + }, + { + "cell_type": "code", + "execution_count": 16, "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ - "1 GPUs in total on this machine\n", + "1 GPU in total on this machine\n", "GPU 0: Quadro RTX 5000\n", - "Node name (hostname): DESKTOP-7GS0DEP\n" + "Node name (hostname): DESKTOP-7GS0DEP\n", + "PyTorch version: 2.4.1+cu124\n", + "CUDA version: 12.4\n" ] } ], @@ -172,7 +750,7 @@ "import torch \n", "\n", "num_gpus = torch.cuda.device_count()\n", - "print(f'{num_gpus} GPUs in total on this machine')\n", + "print(f'{num_gpus} GPU in total on this machine')\n", "\n", "for i in range(num_gpus):\n", " print(f\"GPU {i}: {torch.cuda.get_device_name(i)}\")\n", @@ -180,7 +758,9 @@ "import socket\n", "hostname = socket.gethostname()\n", "print(f\"Node name (hostname): {hostname}\")\n", - "\n" + "\n", + "print('PyTorch version:', torch.__version__) # PyTorch version\n", + "print('CUDA version:', torch.version.cuda) # CUDA version" ] }, { @@ -215,7 +795,14 @@ "print(f\"Node name (hostname): {hostname}\")\n", "\n", "print('PyTorch version:', torch.__version__) # PyTorch version\n", - "print('CUDA version:', torch.version.cuda) # CUDA version" + "print('CUDA version:', torch.version.cuda) # CUDA version" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## Multi-GPU on multi nodes with SLURM (todo)" ] } ], @@ -235,7 +822,7 @@ "name": "python", "nbconvert_exporter": "python", "pygments_lexer": "ipython3", - "version": "3.11.5" + "version": "3.11.6" }, "orig_nbformat": 4, "vscode": {