re_run_failed_training.py

import neptune
import torch
import torch.nn as nn
import torch.optim as optim
from torchvision import datasets, transforms

# Step 1: Get the ID of the failed run

## Connect to the project in read-only mode
project = neptune.init_project(
    project="common/showroom", api_token=neptune.ANONYMOUS_API_TOKEN, mode="read-only"
)

## Fetch only inactive runs with the tag "showcase-run"
runs_table_df = project.fetch_runs_table(
    state="inactive", tag=["showcase-run"], columns=["sys/failed"]
).to_pandas()

## Extract the ID of the latest failed run
failed_run_id = runs_table_df[runs_table_df["sys/failed"] == True]["sys/id"].values[0]
print("Failed run ID:", failed_run_id)

# Step 2: Reconnect to the failed run
## Read-only mode is enough here, since the run is only used to fetch metadata.
failed_run = neptune.init_run(
    project="common/showroom",
    api_token=neptune.ANONYMOUS_API_TOKEN,
    with_id=failed_run_id,
    mode="read-only",
)

# Step 3: Use the fetch() method to retrieve the relevant metadata
## Fetch hyperparameters
failed_run_params = failed_run["config/hyperparameters"].fetch()
## Fetch dataset path
dataset_path = failed_run["dataset/path"].fetch()
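
## Optional sketch: if the exact field paths are not known up front, the run's
## metadata structure can be inspected first with get_structure().
# print(failed_run.get_structure())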

# Step 4: Create a new run
## Create a new Neptune run that will be used to log metadata in the re-run session.
new_run = neptune.init_run(
    project="common/showroom",
    api_token=neptune.ANONYMOUS_API_TOKEN,
    tags=["re-run", "successful training"],
)

# Step 5: Log the hyperparameters and dataset details from the failed run to the new run
## You can now continue working and logging metadata to the brand-new run,
## using the Neptune API as usual.
new_run["config/hyperparameters"] = failed_run_params
new_run["dataset/path"] = dataset_path

## Load the dataset
data_tfms = {
    "train": transforms.Compose(
        [
            transforms.RandomHorizontalFlip(),
            transforms.ToTensor(),
            transforms.Normalize([0.485, 0.456, 0.406], [0.229, 0.224, 0.225]),
        ]
    ),
}

trainset = datasets.CIFAR10(dataset_path, transform=data_tfms["train"], download=True)
trainloader = torch.utils.data.DataLoader(
    trainset, batch_size=failed_run_params["bs"], shuffle=True, num_workers=0
)

## Model
class BaseModel(nn.Module):
    def __init__(self, input_sz, hidden_dim, n_classes):
        super().__init__()
        self.main = nn.Sequential(
            nn.Linear(input_sz, hidden_dim * 2),
            nn.ReLU(),
            nn.Linear(hidden_dim * 2, hidden_dim),
            nn.ReLU(),
            nn.Linear(hidden_dim, hidden_dim // 2),
            nn.ReLU(),
            nn.Linear(hidden_dim // 2, n_classes),
        )

    def forward(self, input):
        x = input.view(-1, 32 * 32 * 3)
        return self.main(x)

## The hidden dimension is set equal to the input size here, as in the original script.
model = BaseModel(
    failed_run_params["input_sz"],
    failed_run_params["input_sz"],
    failed_run_params["n_classes"],
).to(failed_run_params["device"])

criterion = nn.CrossEntropyLoss()
optimizer = optim.SGD(model.parameters(), lr=failed_run_params["lr"])
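
## Optional: plain-text summaries of the model and optimizer can also be logged to the
## new run for reference (string fields; the paths below are just examples).
# new_run["config/model"] = str(model)
# new_run["config/optimizer"] = str(optimizer)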

## Training loop: log per-batch loss and accuracy to the new run
for i, (x, y) in enumerate(trainloader, 0):
    x, y = x.to(failed_run_params["device"]), y.to(failed_run_params["device"])
    optimizer.zero_grad()

    outputs = model(x)
    _, preds = torch.max(outputs, 1)
    loss = criterion(outputs, y)
    acc = torch.sum(preds == y.data) / len(x)

    ## Log plain Python floats rather than tensors
    new_run["training/batch/loss"].append(loss.item())
    new_run["training/batch/acc"].append(acc.item())

    loss.backward()
    optimizer.step()
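
## Optional: stop the runs explicitly so all metadata is synced before the script ends.
## (In scripts, Neptune also stops runs automatically at exit.)
failed_run.stop()
new_run.stop()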