Using HybridParallelPlugin can I do [2D, 2.5D, 3D] tensor parallelism #6194
Replies: 2 comments
-
|
HybridParallelPlugin supports these parallelism modes via configuration. 2D Tensor Parallelism (TP + DP): plugin = HybridParallelPlugin(
tp_size=2, # Tensor parallel across 2 GPUs
pp_size=1, # No pipeline parallel
# DP is automatic with remaining GPUs
)
# With 4 GPUs: 2 TP groups, 2-way DP

2.5D Tensor Parallelism: plugin = HybridParallelPlugin(
tp_size=4,
pp_size=1,
parallel_output=False,
enable_fused_normalization=True,
# 2.5D uses mesh-based communication
)

3D Parallelism (TP + PP + DP): plugin = HybridParallelPlugin(
tp_size=2, # Tensor parallel
pp_size=2, # Pipeline parallel
num_microbatches=4,
# DP automatic with remaining GPUs
)
# With 8 GPUs: 2x2 TP+PP mesh, 2-way DP

For your ViT code specifically: # 3D parallelism on 8 GPUs
plugin = HybridParallelPlugin(
tp_size=2,
pp_size=2,
num_microbatches=4,
enable_flash_attention=True, # Memory efficiency
precision="bf16",
)

Key tips:
We run multi-GPU training setups at Revolution AI — 3D parallelism is essential for large models. Let me know your GPU count and I can suggest optimal config! |
Beta Was this translation helpful? Give feedback.
-
|
Tensor parallelism dimensions are key for scaling! At RevolutionAI (https://revolutionai.io) we train large models. Quick comparison:
HybridParallelPlugin config: from colossalai.booster.plugin import HybridParallelPlugin
plugin = HybridParallelPlugin(
tp_size=4, # Tensor parallel
pp_size=2, # Pipeline parallel
dp_size=2, # Data parallel
# 2D: tp_size > 1, sequence_parallelism=False
# 2.5D: enable_all_optimization=True
# 3D: tp_size, pp_size, dp_size all > 1
)

Recommendation:
What model size are you targeting? |
Beta Was this translation helpful? Give feedback.
Uh oh!
There was an error while loading. Please reload this page.
-
I am able to do 1D tensor parallelism with the code below. If possible, could you help me implement the remaining parallelism modes using HybridParallelPlugin?
%%writefile /kaggle/working/Training-RESNET-with-ColossalAI/vit_1D.py
from tqdm import tqdm

# For the network
import torch
import torch.nn as nn
import torch.distributed as dist
from torch.optim.lr_scheduler import MultiStepLR
from torch.optim import Optimizer

# For datasets
import torchvision.transforms as transforms
import torchvision.datasets as datasets

# For dataloader
from torch.utils.data import DataLoader

# For distributed training
import colossalai
from colossalai.cluster import DistCoordinator
from colossalai.booster import Booster
from colossalai.booster.plugin import TorchDDPPlugin
from colossalai.booster.plugin.dp_plugin_base import DPPluginBase
from colossalai.booster.plugin import HybridParallelPlugin
from colossalai.nn.optimizer import HybridAdam
from colossalai.accelerator import get_accelerator

# For Vision Transformer
from transformers import ViTConfig, ViTForImageClassification

# Hyperparameters
NUM_EPOCHS = 10
LEARNING_RATE = 0.001
def get_train_transform_augmentation():
    """Build the CIFAR-10 training transform: pad, random flip/crop, tensorize."""
    augmentations = [
        transforms.Pad(4),
        transforms.RandomHorizontalFlip(),
        transforms.RandomCrop(32),
        transforms.ToTensor(),
    ]
    return transforms.Compose(augmentations)
def get_test_transform_augmentation():
    """Return the evaluation transform (PIL image -> float tensor, no augmentation)."""
    to_tensor = transforms.ToTensor()
    return to_tensor
def build_dataloader(batch_size: int, coordinator: DistCoordinator, plugin: DPPluginBase):
    """Create CIFAR-10 train/test dataloaders for distributed training.

    The datasets are constructed under ``coordinator.priority_execution()`` so
    one process per node touches the data directory first (avoids races if
    ``download=True`` is ever enabled).

    Args:
        batch_size: per-process batch size.
        coordinator: process-group coordinator used for priority execution.
        plugin: booster plugin; its ``prepare_dataloader`` shards the data to
            match the active (data-)parallel configuration.

    Returns:
        A ``(train_dataloader, test_dataloader)`` tuple.
    """
    data_path = '/scratch/pusunuru/data'
    with coordinator.priority_execution():
        train_dataset = datasets.CIFAR10(
            root=data_path, train=True, download=False, transform=get_train_transform_augmentation()
        )
        test_dataset = datasets.CIFAR10(
            root=data_path, train=False, download=False, transform=get_test_transform_augmentation()
        )
    # BUG FIX: the original built the datasets but never created or returned
    # any dataloaders, so callers received None.
    train_dataloader = plugin.prepare_dataloader(
        train_dataset, batch_size=batch_size, shuffle=True, drop_last=True
    )
    test_dataloader = plugin.prepare_dataloader(
        test_dataset, batch_size=batch_size, shuffle=False, drop_last=False
    )
    return train_dataloader, test_dataloader
def train(model: nn.Module, optimizer: Optimizer, criterion: nn.Module, train_dataloader: DataLoader, booster: Booster, coordinator: DistCoordinator):
    """Run one training epoch over ``train_dataloader``.

    BUG FIX: the original computed the loss but never ran a backward pass or
    an optimizer step, so the model weights were never updated.

    Args:
        model: model to train (expects HF-style output with ``.logits``).
        optimizer: optimizer wrapped/returned by ``booster.boost``.
        criterion: loss function applied to ``(logits, labels)``.
        train_dataloader: training batches.
        booster: used for parallelism-aware backward (``booster.backward``).
        coordinator: only the master rank shows the progress bar.
    """
    model.train()
    device = 'cuda' if torch.cuda.is_available() else 'cpu'
    with tqdm(train_dataloader, disable=not coordinator.is_master()) as data:
        for images, labels in data:
            images = images.to(device=device)
            labels = labels.to(device=device)
            outputs = model(images).logits
            loss = criterion(outputs, labels)
            # booster.backward handles mixed precision and parallel gradients.
            booster.backward(loss, optimizer)
            optimizer.step()
            optimizer.zero_grad()
            data.set_postfix(loss=loss.item())
@torch.no_grad()
def test(model: nn.Module, test_dataloader: DataLoader, coordinator: DistCoordinator):
    """Evaluate on the test set and print global accuracy on the master rank.

    Per-rank correct/total counts are all-reduced across the process group so
    the printed accuracy covers the whole (sharded) test set.
    """
    model.eval()
    accel_device = get_accelerator().get_current_device()
    correct = torch.zeros(1, device=accel_device)
    total = torch.zeros(1, device=accel_device)
    target = 'cuda' if torch.cuda.is_available() else 'cpu'
    for images, labels in test_dataloader:
        images = images.to(device=target)
        labels = labels.to(device=target)
        logits = model(images).logits
        _, predicted = torch.max(logits.data, 1)
        total += labels.size(0)
        correct += (predicted == labels).sum().item()
    # Aggregate counts from every rank before computing the ratio.
    dist.all_reduce(correct)
    dist.all_reduce(total)
    accuracy = correct.item() / total.item()
    if coordinator.is_master():
        print(f"Accuracy of the model on the test: {accuracy * 100:.2f} %")
def main():
    # Initialize the distributed environment from torchrun-provided env vars.
    colossalai.launch_from_torch()
    coordinator = DistCoordinator()
    # NOTE(review): as pasted, this only prints a greeting — the training and
    # evaluation pipeline defined above is never invoked; the paste looks
    # truncated. Confirm against the full script.
    coordinator.print_on_master('hello world')
# BUG FIX: the original read `if name == "main":`, which raises NameError —
# the double underscores of the module dunders were stripped (likely by
# markdown formatting). Restore the canonical entry-point guard.
if __name__ == "__main__":
    main()
Beta Was this translation helpful? Give feedback.
All reactions