import torch
import torch.nn as nn
from torch.utils.data import DataLoader
from torchvision import datasets, transforms
from tqdm import tqdm
 
def get_dataloaders(batch_size=64):
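  # MNIST train/test loaders; ToTensor scales pixels to [0, 1] floats.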
  transform = transforms.Compose([transforms.ToTensor()])
  train = datasets.MNIST(root="data", train=True, download=True, transform=transform)
  test = datasets.MNIST(root="data", train=False, download=True, transform=transform)
  return DataLoader(train, batch_size=batch_size, shuffle=True), DataLoader(test, batch_size=batch_size)
 
 
class MLP(nn.Module):
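  # A small two-layer perceptron: flattened 28x28 image -> hidden -> 10 logits.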
  def __init__(self, hidden=128):
    super().__init__()
    self.net = nn.Sequential(
      nn.Flatten(),
      nn.Linear(28*28, hidden),
      nn.ReLU(),
      nn.Linear(hidden, 10),
    )
 
  def forward(self, x):
    return self.net(x)
 
def train_model(epochs=1, lr=1e-3, device=None):
  device = device or ("cuda" if torch.cuda.is_available() else "cpu")
  # Seed before building the model and loaders so weight init and
  # shuffling are reproducible.
  torch.manual_seed(42)
  if device == "cuda":
    torch.cuda.manual_seed_all(42)
  model = MLP().to(device)
  opt = torch.optim.Adam(model.parameters(), lr=lr)
  loss_fn = nn.CrossEntropyLoss()
  train_loader, _ = get_dataloaders()
  # AMP (a no-op on CPU) + cosine LR schedule over the full run
  scaler = torch.cuda.amp.GradScaler(enabled=(device == "cuda"))
  scheduler = torch.optim.lr_scheduler.CosineAnnealingLR(opt, T_max=epochs)
  model.train()
  for epoch in range(epochs):
    total, correct = 0, 0
    for x, y in tqdm(train_loader, desc=f"epoch {epoch+1}"):
      x, y = x.to(device), y.to(device)
      opt.zero_grad(set_to_none=True)
      # Run the forward pass under autocast so AMP actually uses half precision.
      with torch.cuda.amp.autocast(enabled=(device == "cuda")):
        logits = model(x)
        loss = loss_fn(logits, y)
      # One scaled backward pass, then unscale so clipping sees true gradient norms.
      scaler.scale(loss).backward()
      scaler.unscale_(opt)
      torch.nn.utils.clip_grad_norm_(model.parameters(), max_norm=1.0)
      scaler.step(opt)
      scaler.update()
      preds = logits.argmax(dim=1)
      total += y.size(0)
      correct += (preds == y).sum().item()
    acc = correct / max(1, total)
    scheduler.step()
    print(f"epoch {epoch+1}: acc={acc:.3f}")
  return model
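

# A minimal evaluation sketch for the test split that get_dataloaders
# returns but the training loop above never consumes. evaluate() is a
# hypothetical helper added for illustration, not part of the original script.
@torch.no_grad()
def evaluate(model, loader, device=None):
  device = device or next(model.parameters()).device
  model.eval()
  total, correct = 0, 0
  for x, y in loader:
    x, y = x.to(device), y.to(device)
    preds = model(x).argmax(dim=1)
    total += y.size(0)
    correct += (preds == y).sum().item()
  return correct / max(1, total)


if __name__ == "__main__":
  # Example usage: train for one epoch, then report held-out accuracy.
  model = train_model(epochs=1)
  _, test_loader = get_dataloaders()
  print(f"test acc: {evaluate(model, test_loader):.3f}")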