What is Model Offloading?
Model offloading, as described in DeepSpeed's ZeRO papers, is the technique of moving parts of a model's parameters, gradients, and optimizer states from GPU memory to CPU memory or NVMe storage during training. This makes it possible to train models that are larger than the available GPU memory, by leveraging the typically much larger capacity of CPU RAM or fast storage devices.
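A minimal sketch of the idea in plain PyTorch (not DeepSpeed itself; the two nn.Linear layers and their sizes are illustrative stand-ins for parts of a larger model): the parameters stay in CPU memory, and each part is moved to the GPU only while it is actually computing.

import torch
from torch import nn

# Two hypothetical layers standing in for "parts" of a larger model.
layer_a = nn.Linear(1024, 1024)   # lives in CPU memory by default
layer_b = nn.Linear(1024, 1024)

x = torch.randn(8, 1024)
if torch.cuda.is_available():
    layer_a.to('cuda')               # bring only the active layer onto the GPU
    h = layer_a(x.to('cuda'))
    layer_a.to('cpu')                # offload its parameters back to CPU memory
    torch.cuda.empty_cache()         # release the cached GPU blocks

    layer_b.to('cuda')               # swap the next layer in
    y = layer_b(h)
    layer_b.to('cpu')
    torch.cuda.empty_cache()

DeepSpeed automates this kind of movement (and much more) for parameters, gradients, and optimizer states; below we do it by hand on NanoGPT.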
What is NanoGPT?
NanoGPT was developed by Andrej Karpathy, formerly Director of AI at Tesla.
The simplest, fastest repository for training/finetuning medium-sized GPTs.
Andrej Karpathy
It is basically just two files, roughly 300 lines of code each, all written from scratch. It is great for learning the ins and outs of a large language model.
Converting NanoGPT to support offloading
# %%
from torch import nn
import torch.nn.functional as F
import torch
from torch.autograd import Function
import numpy as np
import time, os
from nanoGPT.model import GPT, Block, GPTConfig
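# Load the training and validation splits as memory-mapped uint16 token arrays
# (the data_dir path below is specific to the author's machine).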
data_dir = '/media/j/wdata/git/PYTHON_IMPORT/nanoGPT/data/all_news_char'
train_data = np.memmap(os.path.join(data_dir, 'train.bin'), dtype=np.uint16, mode='r')
val_data = np.memmap(os.path.join(data_dir, 'val.bin'), dtype=np.uint16, mode='r')
def get_batch(split, block_size, batch_size, device_type, device):
    data = train_data if split == 'train' else val_data
    ix = torch.randint(len(data) - block_size, (batch_size,))
    x = torch.stack([torch.from_numpy((data[i:i+block_size]).astype(np.int64)) for i in ix])
    y = torch.stack([torch.from_numpy((data[i+1:i+1+block_size]).astype(np.int64)) for i in ix])
    if device_type == 'cuda':
        # pin arrays x,y, which allows us to move them to GPU asynchronously (non_blocking=True)
        x, y = x.pin_memory().to(device, non_blocking=True), y.pin_memory().to(device, non_blocking=True)
    else:
        x, y = x.to(device), y.to(device)
    return x, y
# %%
if __name__ == '__main__':
    device = 'cuda'
    model_args = dict(n_layer=4, n_head=64, n_embd=768, block_size=4096, bias=True, vocab_size=128, dropout=0.1)
    gptconf = GPTConfig(**model_args)
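    # The 4-layer GPT is instantiated four times and split by setting the submodules a given
    # part does not own to None. This assumes GPT.forward has been modified to skip submodules
    # that are None and, for parts 2-4, to accept the previous part's hidden states directly
    # instead of token ids.
    # Part 1: keep the token/position embeddings, dropout and transformer block 0.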
    model_part_1 = GPT(gptconf)
    model_part_1.transformer.h[1] = None
    model_part_1.transformer.h[2] = None
    model_part_1.transformer.h[3] = None
    model_part_1.transformer.ln_f = None
    model_part_1.lm_head = None
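    # Part 2: keep only transformer block 1. The extra 'rrr' parameter below (50,000 x 10,000
    # floats, roughly 2 GB in fp32) is never used in the forward pass; it appears to be
    # deliberate ballast that inflates this part so the memory effect of offloading is visible.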
    model_part_2 = GPT(gptconf)
    model_part_2.transformer.rrr = torch.nn.Parameter(torch.randn(50000, 10000))
    model_part_2.transformer.wte = None
    model_part_2.transformer.wpe = None
    model_part_2.transformer.drop = None
    model_part_2.transformer.h[0] = None
    model_part_2.transformer.h[2] = None
    model_part_2.transformer.h[3] = None
    model_part_2.transformer.ln_f = None
    model_part_2.lm_head = None
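    # Part 3: keep only transformer block 2.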
    model_part_3 = GPT(gptconf)
    model_part_3.transformer.wte = None
    model_part_3.transformer.wpe = None
    model_part_3.transformer.drop = None
    model_part_3.transformer.h[0] = None
    model_part_3.transformer.h[1] = None
    model_part_3.transformer.h[3] = None
    model_part_3.transformer.ln_f = None
    model_part_3.lm_head = None
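    # Part 4: keep transformer block 3, the final layer norm and the LM head,
    # so this part produces the logits and, given targets, the loss.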
    model_part_4 = GPT(gptconf)
    model_part_4.transformer.wte = None
    model_part_4.transformer.wpe = None
    model_part_4.transformer.drop = None
    model_part_4.transformer.h[0] = None
    model_part_4.transformer.h[1] = None
    model_part_4.transformer.h[2] = None
    # Choose a loss function (not actually used below; the GPT model computes its own
    # cross-entropy loss when targets are passed in)
    loss_fn = torch.nn.CrossEntropyLoss()
    # Choose an optimizer
    weight_decay = 1e-1
    learning_rate = 9e-3
    beta1 = 0.9
    beta2 = 0.99
    params = list(model_part_1.parameters()) + list(model_part_2.parameters()) + list(model_part_3.parameters()) + list(model_part_4.parameters())
    decay_params = [p for p in params if p.dim() >= 2]
    nodecay_params = [p for p in params if p.dim() < 2]
    optim_groups = [
        {'params': decay_params, 'weight_decay': weight_decay},
        {'params': nodecay_params, 'weight_decay': 0.0}
    ]
    num_decay_params = sum(p.numel() for p in decay_params)
    num_nodecay_params = sum(p.numel() for p in nodecay_params)
    print(f"num decayed parameter tensors: {len(decay_params)}, with {num_decay_params:,} parameters")
    print(f"num non-decayed parameter tensors: {len(nodecay_params)}, with {num_nodecay_params:,} parameters")
    # Create the AdamW optimizer (the fused CUDA version is not applicable here,
    # since the parameters are on the CPU when optimizer.step() runs)
    optimizer = torch.optim.AdamW(optim_groups, lr=learning_rate, betas=(beta1, beta2))
    start_time = time.time()
    # Training loop
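    # Each iteration streams the four parts through the GPU one at a time: move a part to
    # CUDA, run its forward pass, then (except for part 1, whose offload is commented out)
    # move it back to the CPU and release the cached blocks before loading the next part.
    # The printed values are the currently allocated GPU memory in MiB.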
    for i in range(50):
        x, targets = get_batch('train', 128, 50, device, device)
        print(f'x, targets {torch.cuda.memory_allocated() / 1024**2:0.2f}')
        model_part_1.to('cuda')
        print(f"for model_part_1.to('cuda') {torch.cuda.memory_allocated() / 1024**2:0.2f}")
        logits_1, _ = model_part_1(x)
        # model_part_1.to('cpu')
        torch.cuda.empty_cache()
        print(f"for model_part_1 empty_cache {torch.cuda.memory_allocated() / 1024**2:0.2f}")
        model_part_2.to('cuda')
        print(f"for model_part_2.to('cuda') {torch.cuda.memory_allocated() / 1024**2:0.2f}")
        logits_2, _ = model_part_2(logits_1)
        model_part_2.to('cpu')
        torch.cuda.empty_cache()
        print(f"for model_part_2.to('cpu') {torch.cuda.memory_allocated() / 1024**2:0.2f}")
        model_part_3.to('cuda')
        print(f"for model_part_3.to('cuda') {torch.cuda.memory_allocated() / 1024**2:0.2f}")
        logits_3, _ = model_part_3(logits_2)
        model_part_3.to('cpu')
        torch.cuda.empty_cache()
        print(f"for model_part_3.to('cpu') {torch.cuda.memory_allocated() / 1024**2:0.2f}")
        model_part_4.to('cuda')
        print(f"for model_part_4.to('cuda') {torch.cuda.memory_allocated() / 1024**2:0.2f}")
        logits_4, loss = model_part_4(logits_3, targets)
        model_part_4.to('cpu')
        torch.cuda.empty_cache()
        print(f"for model_part_4.to('cpu') {torch.cuda.memory_allocated() / 1024**2:0.2f}")
        # Zero the gradients
        optimizer.zero_grad()
        # Backpropagation
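        # The backward pass is done manually, segment by segment, with torch.autograd.grad:
        # the gradient of the loss is pushed back through part 4, then chained into part 3,
        # part 2 and part 1 via grad_outputs, swapping each part onto the GPU only while its
        # segment of the graph is being differentiated.
        # NOTE: torch.autograd.grad only returns gradients for the tensors listed in `inputs`,
        # so as written only the wte/wpe embedding weights end up with a .grad; the transformer
        # block parameters keep grad=None and are therefore skipped by optimizer.step().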
        model_part_4.to('cuda')
        print(f"back model_part_4.to('cuda') {torch.cuda.memory_allocated() / 1024**2:0.2f}")
        logits_4.grad = torch.autograd.grad(outputs=loss, inputs=logits_4, retain_graph=True, create_graph=False)[0]
        model_part_3.to('cuda')
        print(f"back model_part_3.to('cuda') {torch.cuda.memory_allocated() / 1024**2:0.2f}")
        logits_3.grad = torch.autograd.grad(outputs=logits_4, inputs=logits_3, grad_outputs=logits_4.grad, retain_graph=True, create_graph=False)[0]
        model_part_4.to('cpu')
        torch.cuda.empty_cache()
        print(f"back model_part_4.to('cpu') {torch.cuda.memory_allocated() / 1024**2:0.2f}")
        model_part_2.to('cuda')
        print(f"back model_part_2.to('cuda') {torch.cuda.memory_allocated() / 1024**2:0.2f}")
        logits_2.grad = torch.autograd.grad(outputs=logits_3, inputs=logits_2, grad_outputs=logits_3.grad, retain_graph=True, create_graph=False)[0]
        model_part_3.to('cpu')
        torch.cuda.empty_cache()
        print(f"back model_part_3.to('cpu') {torch.cuda.memory_allocated() / 1024**2:0.2f}")
        model_part_1.to('cuda')
        print(f"back model_part_1.to('cuda') {torch.cuda.memory_allocated() / 1024**2:0.2f}")
        logits_1.grad = torch.autograd.grad(outputs=logits_2, inputs=logits_1, grad_outputs=logits_2.grad, retain_graph=True, create_graph=False)[0]
        model_part_2.to('cpu')
        torch.cuda.empty_cache()
        print(f"back model_part_2.to('cpu') {torch.cuda.memory_allocated() / 1024**2:0.2f}")
        model_part_1.transformer.wpe.weight.grad, model_part_1.transformer.wte.weight.grad = torch.autograd.grad(outputs=logits_1, inputs=[model_part_1.transformer.wpe.weight, model_part_1.transformer.wte.weight], grad_outputs=logits_1.grad, retain_graph=False, create_graph=False)
        model_part_1.to('cpu')
        torch.cuda.empty_cache()
        print(f"back model_part_1.to('cpu') {torch.cuda.memory_allocated() / 1024**2:0.2f}")
        # Update the parameters
        optimizer.step()
        print(loss)
    print(f'Time taken: {time.time() - start_time} seconds')
# %%