InfiniTensor · Chamberlain0w0 · Feb 4, 2026 · Mar 11, 2026 · Mar 12, 2026 · May 13, 2026
diff --git a/example/common/utils.cc b/example/common/utils.cc
@@ -1,5 +1,8 @@
 #include "example/common/utils.h"
 
+#include "gflags/gflags.h"
+#include "glog/logging.h"
+
 namespace infini_train {
 
 float ConvertBF16ToFloat(void *ptr) {
@@ -61,4 +64,11 @@ void ReadVectorShardFloat(std::ifstream &ifs, float *dst, int64_t len, int64_t s
     ifs.seekg(base + std::streamoff(len * sizeof(float)));
 }
 
+void ValidateDistributedOptimizerFlags(bool use_distributed_optimizer) {
+    gflags::CommandLineFlagInfo zero_stage_info;
+    CHECK(gflags::GetCommandLineFlagInfo("zero_stage", &zero_stage_info));
+    CHECK(use_distributed_optimizer || zero_stage_info.is_default)
+        << "--zero_stage requires --use_distributed_optimizer=true.";
+}
+
 } // namespace infini_train
diff --git a/example/common/utils.h b/example/common/utils.h
@@ -30,4 +30,6 @@ void ReadVectorAllFloat(std::ifstream &ifs, float *dst, int64_t len);
 
 void ReadVectorShardFloat(std::ifstream &ifs, float *dst, int64_t len, int64_t start, int64_t cnt);
 
+void ValidateDistributedOptimizerFlags(bool use_distributed_optimizer);
+
 } // namespace infini_train
diff --git a/example/gpt2/main.cc b/example/gpt2/main.cc
@@ -36,6 +36,7 @@
 
 #include "example/common/tiny_shakespeare_dataset.h"
 #include "example/common/tokenizer.h"
+#include "example/common/utils.h"
 #include "example/gpt2/checkpoint_loader.h"
 #include "example/gpt2/config.h"
 
@@ -58,6 +59,7 @@ DEFINE_uint32(text_length, 64, "the length of the generated text");
 // optimization
 DEFINE_double(learning_rate, 1e-4, "learning rate warmup iterations");
 DEFINE_bool(use_distributed_optimizer, false, "Whether to enable DistributedOptimizer(only take effects when DP>1)");
+DEFINE_int32(zero_stage, 1, "ZeRO stage (1/2/3), default 1 (only take effects when use_distributed_optimizer=true)");
 // evaluation
 DEFINE_uint32(val_loss_every, 0, "every how many steps to evaluate val loss?");
 DEFINE_uint32(sample_every, 0, "how often to sample from the model?");
@@ -114,6 +116,7 @@ const std::unordered_map<std::string, nn::TransformerConfig> kModelToConfigs = {
 DEFINE_validator(model, [](const char *, const std::string &value) { return kSupportedModels.contains(value); });
 DEFINE_validator(device,
                  [](const char *, const std::string &value) { return value == kDeviceCPU || value == kDeviceCUDA; });
+DEFINE_validator(zero_stage, [](const char *, int32_t value) { return value >= 1 && value <= 3; });
 
 void Train(const nn::parallel::Rank &rank) {
     using namespace nn::parallel;
@@ -252,8 +255,8 @@ void Train(const nn::parallel::Rank &rank) {
         model = std::make_shared<nn::parallel::PipelineParallel>(model, pp_world_size, num_micro_batches, shapes,
                                                                  pp_rank, device, model_config.GetChunkSize());
         if (ddp_world_size > 1) {
-            auto ddp_config
-                = DistributedDataParallelConfig{.use_distributed_optimizer = FLAGS_use_distributed_optimizer};
+            auto ddp_config = DistributedDataParallelConfig{
+                .use_distributed_optimizer = FLAGS_use_distributed_optimizer, .zero_stage = FLAGS_zero_stage};
             auto *mutable_chunks = dynamic_cast<nn::parallel::PipelineParallel *>(model.get())->mutable_chunks();
             for (int chunk_id = 0; chunk_id < mutable_chunks->size(); ++chunk_id) {
                 (*mutable_chunks)[chunk_id]
@@ -265,7 +268,8 @@ void Train(const nn::parallel::Rank &rank) {
         // before wrapping the model with DistributedDataParallel (DDP).
         // Otherwise, DDP’s gradient hooks may be lost because new parameter tensors
         // are created during the conversion.
-        auto ddp_config = DistributedDataParallelConfig{.use_distributed_optimizer = FLAGS_use_distributed_optimizer};
+        auto ddp_config = DistributedDataParallelConfig{.use_distributed_optimizer = FLAGS_use_distributed_optimizer,
+                                                        .zero_stage = FLAGS_zero_stage};
         model = std::make_shared<DistributedDataParallel>(model, rank, ddp_config);
     }
 
@@ -447,6 +451,7 @@ void Train(const nn::parallel::Rank &rank) {
 int main(int argc, char *argv[]) {
     gflags::ParseCommandLineFlags(&argc, &argv, true);
     google::InitGoogleLogging(argv[0]);
+    ValidateDistributedOptimizerFlags(FLAGS_use_distributed_optimizer);
 
     auto precision_config = utils::PrecisionCheckConfig::Parse(FLAGS_precision_check);
     nn::parallel::global::InitAllEnv(FLAGS_nthread_per_process, FLAGS_tensor_parallel, FLAGS_sequence_parallel,

diff --git a/example/llama3/main.cc b/example/llama3/main.cc
@@ -35,6 +35,7 @@
 
 #include "example/common/tiny_shakespeare_dataset.h"
 #include "example/common/tokenizer.h"
+#include "example/common/utils.h"
 #include "example/llama3/checkpoint_loader.h"
 #include "example/llama3/config.h"
 
@@ -57,6 +58,7 @@ DEFINE_uint32(text_length, 64, "the length of the generated text");
 // optimization
 DEFINE_double(learning_rate, 1e-5, "learning rate warmup iterations");
 DEFINE_bool(use_distributed_optimizer, false, "Whether to enable DistributedOptimizer(only take effects when DP>1)");
+DEFINE_int32(zero_stage, 1, "ZeRO stage (1/2/3), default 1 (only take effects when use_distributed_optimizer=true)");
 // evaluation
 DEFINE_uint32(val_loss_every, 0, "every how many steps to evaluate val loss?");
 DEFINE_uint32(sample_every, 0, "how often to sample from the model?");
@@ -100,6 +102,7 @@ constexpr char kDtypeBF16[] = "bfloat16";
 DEFINE_validator(model, [](const char *, const std::string &value) { return kSupportedModels.contains(value); });
 DEFINE_validator(device,
                  [](const char *, const std::string &value) { return value == kDeviceCPU || value == kDeviceCUDA; });
+DEFINE_validator(zero_stage, [](const char *, int32_t value) { return value >= 1 && value <= 3; });
 
 void Train(const nn::parallel::Rank &rank) {
     using namespace nn::parallel;
@@ -222,8 +225,8 @@ void Train(const nn::parallel::Rank &rank) {
         model = std::make_shared<nn::parallel::PipelineParallel>(model, pp_world_size, num_micro_batches, shapes,
                                                                  pp_rank, device, model_config.GetChunkSize());
         if (ddp_world_size > 1) {
-            auto ddp_config
-                = DistributedDataParallelConfig{.use_distributed_optimizer = FLAGS_use_distributed_optimizer};
+            auto ddp_config = DistributedDataParallelConfig{
+                .use_distributed_optimizer = FLAGS_use_distributed_optimizer, .zero_stage = FLAGS_zero_stage};
             auto *mutable_chunks = dynamic_cast<nn::parallel::PipelineParallel *>(model.get())->mutable_chunks();
             for (int chunk_id = 0; chunk_id < mutable_chunks->size(); ++chunk_id) {
                 (*mutable_chunks)[chunk_id]
@@ -236,7 +239,8 @@ void Train(const nn::parallel::Rank &rank) {
         // Otherwise, DDP’s gradient hooks may be lost because new parameter tensors
         // are created during the conversion.
 
-        auto ddp_config = DistributedDataParallelConfig{.use_distributed_optimizer = FLAGS_use_distributed_optimizer};
+        auto ddp_config = DistributedDataParallelConfig{.use_distributed_optimizer = FLAGS_use_distributed_optimizer,
+                                                        .zero_stage = FLAGS_zero_stage};
         model = std::make_shared<DistributedDataParallel>(model, rank, ddp_config);
     }
 
@@ -422,6 +426,7 @@ void Train(const nn::parallel::Rank &rank) {
 int main(int argc, char *argv[]) {
     gflags::ParseCommandLineFlags(&argc, &argv, true);
     google::InitGoogleLogging(argv[0]);
+    ValidateDistributedOptimizerFlags(FLAGS_use_distributed_optimizer);
 
     auto precision_config = utils::PrecisionCheckConfig::Parse(FLAGS_precision_check);
     nn::parallel::global::InitAllEnv(FLAGS_nthread_per_process, FLAGS_tensor_parallel, FLAGS_sequence_parallel,

diff --git a/infini_train/include/autograd/function_hook.h b/infini_train/include/autograd/function_hook.h
@@ -17,6 +17,12 @@ namespace infini_train::autograd {
 class PostAccumulateGradHook {
 public:
     virtual void operator()(const std::shared_ptr<Tensor> &tensor) = 0;
+
+    // ZeRO-2: Use this function to take over AccumulateGrad::Backward
+    virtual bool TryBypassAccumulate(const std::shared_ptr<Tensor> &, const std::shared_ptr<Tensor> &, bool, float) {
+        return false;
+    }
+
     virtual ~PostAccumulateGradHook() = default;
 };
 

diff --git a/infini_train/include/nn/parallel/ddp/distributed_data_parallel_config.h b/infini_train/include/nn/parallel/ddp/distributed_data_parallel_config.h
@@ -40,6 +40,12 @@ class DistributedDataParallelConfig {
     // In this case, grad reduce is triggered immediately when a grad is ready or till all grads are ready.
     bool overlap_grad_reduce = true;
 
+    // ZeRO-DP Stage for memory optimization (Only take effects when use_distributed_optimizer=true)
+    //   ZeRO-1: Optimizer states partitioning, by default
+    //   ZeRO-2: Gradients partitioning
+    //   ZeRO-3: Parameters partitioning
+    int zero_stage = 1;
+
     // Whether to overlap parameter all-gather with forward compute.
     bool overlap_param_gather = true;
 
@@ -59,7 +65,7 @@ class DistributedDataParallelConfig {
     // Maximum number of parameters in each ParamAndGradBucket.
     // NOTE(zbl): This is distinct from DDP Reducer's MB-based bucket caps.
     // TODO(zbl): To unify the definition of bucket_size argument for users
-    size_t bucket_size_in_elements = 40000000;
+    size_t bucket_size_in_elements = 1000000;
 
     // Whether to pad bucket sizes to improve NCCL bus bandwidth utilization.
     bool pad_buckets_for_high_nccl_busbw = false;

diff --git a/infini_train/include/nn/parallel/ddp/param_and_grad_buffer.h b/infini_train/include/nn/parallel/ddp/param_and_grad_buffer.h
@@ -22,8 +22,8 @@ namespace infini_train::nn::parallel {
 class ParamAndGradBucket {
 public:
     ParamAndGradBucket(const std::vector<std::shared_ptr<Tensor>> &params, const std::shared_ptr<Tensor> &param_data,
-                       const std::shared_ptr<Tensor> &grad_data, size_t offset, size_t num_elements_unpadded,
-                       float gradient_scaling_factor, size_t bucket_id);
+                       DataType param_dtype, const std::shared_ptr<Tensor> &grad_data, DataType grad_dtype,
+                       size_t offset, size_t num_elements_unpadded, float gradient_scaling_factor, size_t bucket_id);
 
     size_t bucket_id() const { return bucket_id_; }
 
@@ -33,6 +33,10 @@ class ParamAndGradBucket {
 
     const std::shared_ptr<Tensor> &grad_data() const { return grad_data_; }
 
+    DataType param_dtype() const { return param_dtype_; }
+
+    DataType grad_dtype() const { return grad_dtype_; }
+
     size_t offset() const { return offset_; }
 
     size_t num_elements_unpadded() const { return num_elements_unpadded_; }
@@ -49,6 +53,8 @@ class ParamAndGradBucket {
     std::vector<std::shared_ptr<Tensor>> params_;
     std::shared_ptr<Tensor> param_data_;
     std::shared_ptr<Tensor> grad_data_;
+    DataType param_dtype_;
+    DataType grad_dtype_;
 
     size_t offset_ = 0;
     size_t num_elements_unpadded_ = 0;
@@ -73,6 +79,11 @@ class ParamAndGradBucketGroup {
     // Start grad reduce
     void StartGradSync();
 
+    // Accumulate a parameter grad into bucket buffer
+    // ZeRO-2: Use this funtion to take over autograd::AccumulateGrad::Backward
+    void AccumulateParamGrad(const std::shared_ptr<Tensor> &parameter, const std::shared_ptr<Tensor> &grad,
+                             bool overwrite, float learning_rate);
+
     // Wait for gradient reduce to complete
     void FinishGradSync();
 
@@ -87,6 +98,9 @@ class ParamAndGradBucketGroup {
 
     const std::vector<std::shared_ptr<ParamAndGradBucket>> &buckets() const { return buckets_; }
 
+    // ZeRO-2: Get a bucket's local grad shard buffer
+    std::shared_ptr<Tensor> GetLocalGradShardBuffer(size_t bucket_idx) const;
+
     const DistributedDataParallelConfig &config() const { return ddp_config_; }
 
 private:
@@ -98,12 +112,19 @@ class ParamAndGradBucketGroup {
 
     std::unordered_set<Tensor *> params_;
     std::unordered_set<Tensor *> params_with_grad_;
+    // Tensor -> (Bucket, Bucket Index)
+    std::unordered_map<Tensor *, std::pair<std::shared_ptr<ParamAndGradBucket>, size_t>> param_to_bucket_;
 
     // TODO(zbl): Implement CoalescedWork for aggregate works
     //            According to Megatron-LM's _coalescing_manager
     std::vector<std::shared_ptr<Work>> grad_reduce_work_list_;
+    std::vector<size_t> grad_reduce_bucket_indices_;
     std::vector<std::shared_ptr<Work>> param_gather_work_list_;
 
+    // ZeRO-2: persistent grad shard buffers and temporary full grad buffers
+    std::vector<std::shared_ptr<Tensor>> grad_shard_buffer_list_;
+    std::vector<std::shared_ptr<Tensor>> temp_full_grad_buffer_list_;
+
     std::shared_ptr<ParamAndGradBucketGroup> next_param_gather_bucket_group_ = nullptr;
 
     std::vector<std::vector<std::shared_ptr<Tensor>>> param_buffer_shard_list_;

diff --git a/infini_train/src/autograd/accumulate.cc b/infini_train/src/autograd/accumulate.cc
@@ -33,8 +33,15 @@ AccumulateGrad::Backward(const std::vector<std::shared_ptr<Tensor>> &grad_output
                             "running before autograd). The grad is not cast and will be used as-is.";
         }
 
+        const bool overwrite = tensor_->ConsumeGradOverwriteFlag();
+        auto hook = tensor_->post_accumulate_grad_hook();
+        if (hook && hook->TryBypassAccumulate(tensor_, grad_output, overwrite, learning_rate_)) {
+            tensor_->ResetAccumulator();
+            return {};
+        }
+
         if (grad) {
-            if (tensor_->ConsumeGradOverwriteFlag()) {
+            if (overwrite) {
                 // If the tensor is marked to overrite its current grad on next grad update
                 // See notes in `infini_train::nn::parallel::Reducer::PrepareForBackward()`
                 // NOTE(zbl): must copy, cannot change grad buffer address
@@ -48,7 +55,6 @@ AccumulateGrad::Backward(const std::vector<std::shared_ptr<Tensor>> &grad_output
             auto new_grad = std::make_shared<Tensor>(*grad_output.get(), 0, grad_output->Dims());
             tensor_->set_grad(new_grad);
         }
-        auto hook = tensor_->post_accumulate_grad_hook();
         if (hook != nullptr) {
             (*hook)(tensor_->grad());
         }
Original file line number	Diff line number	Diff line change
Expand Up		@@ -30,4 +30,6 @@ void ReadVectorAllFloat(std::ifstream &ifs, float *dst, int64_t len);

		void ReadVectorShardFloat(std::ifstream &ifs, float *dst, int64_t len, int64_t start, int64_t cnt);

		void ValidateDistributedOptimizerFlags(bool use_distributed_optimizer);

		} // namespace infini_train