NVIDIA-NeMo · Edresson · Apr 15, 2026 · Apr 16, 2026 · Apr 19, 2026 · Apr 22, 2026
diff --git a/examples/tts/conf/magpietts/easy_magpietts.yaml b/examples/tts/conf/magpietts/easy_magpietts.yaml
@@ -17,6 +17,7 @@ model:
   disable_lm_text_head: false
   disable_subword_embedding: false
   use_bpe_char_tokenizer: true
+  cas_encoder_n_layers: 1
 
   # HuggingFace backend config (used when decoder_type: "huggingface")
   transformer_hf_backend: "Qwen/Qwen2.5-1.5B"

diff --git a/examples/tts/conf/magpietts/easy_magpietts_lhotse.yaml b/examples/tts/conf/magpietts/easy_magpietts_lhotse.yaml
@@ -15,6 +15,7 @@ model:
   disable_lm_text_head: false
   disable_subword_embedding: false
   use_bpe_char_tokenizer: true
+  cas_encoder_n_layers: 1
 
   # HuggingFace backend config (used when decoder_type: "huggingface")
   transformer_hf_backend: "Qwen/Qwen2.5-1.5B"

diff --git a/examples/tts/conf/magpietts/easy_magpietts_lhotse_multiturn.yaml b/examples/tts/conf/magpietts/easy_magpietts_lhotse_multiturn.yaml
@@ -0,0 +1,231 @@
+name: Magpie-TTS-DecoderOnly-EN
+
+quadratic_duration: 20
+
+# Adjust batch_duration / bucket_batch_size (under model.train_ds and model.validation_ds) based on GPU memory.
+# When doing weighted sampling with multiple manifests, this defines how many training steps are in an epoch.
+# If null, then weighted sampling is disabled.
+
+model:
+  use_lhotse: true
+
+  # Decoder backend selection
+  # Options: "huggingface" (default), "nemotron_h"
+  decoder_type: "huggingface"
+
+  # HuggingFace backend config (used when decoder_type: "huggingface")
+  transformer_hf_backend: "Qwen/Qwen2.5-1.5B"
+
+  # NemotronH config (used when decoder_type: "nemotron_h")
+  # Hybrid Mamba2/MoE/Attention model (~3B total, ~600-800M active). Layer types via hybrid_override_pattern:
+  # 'M' = Mamba2 layer, '*' = Attention layer, '-' = MLP layer, 'E' = MoE layer
+  nemotron_h_config:
+    hidden_size: 1536  # Should match embedding_dim
+    num_hidden_layers: 48
+    vocab_size: 131072
+    # Attention config
+    num_attention_heads: 12
+    num_key_value_heads: 4
+    attention_dropout: 0.0
+    attention_bias: false
+    max_position_embeddings: 8192
+    # Mamba config
+    mamba_num_heads: 64
+    mamba_head_dim: 24
+    ssm_state_size: 128
+    conv_kernel: 4
+    n_groups: 8
+    chunk_size: 256
+    mamba_hidden_act: "silu"
+    use_conv_bias: true
+    use_bias: false
+    # MLP config
+    intermediate_size: 4096
+    mlp_hidden_act: "silu"
+    mlp_bias: false
+    # MoE config (scaled from Nemotron-3-Nano-30B-A3B)
+    n_routed_experts: 48
+    num_experts_per_tok: 6
+    moe_intermediate_size: 1024
+    moe_shared_expert_intermediate_size: 2048
+    n_group: 1
+    topk_group: 1
+    routed_scaling_factor: 2.5
+    norm_topk_prob: true
+    # Layer pattern: (M E M E M *) x 8 => 24 Mamba, 16 MoE, 8 Attention (48 layers, matches num_hidden_layers)
+    hybrid_override_pattern: "MEMEM*MEMEM*MEMEM*MEMEM*MEMEM*MEMEM*MEMEM*MEMEM*"
+    # Normalization
+    layer_norm_epsilon: 1e-5
+    residual_in_fp32: true
+
+  use_text_conditioning_encoder: true # If true, distilbert will be used to encode context_text if provided.
+  context_duration_min: 5.0
+  context_duration_max: 5.0
+  load_cached_codes_if_available: true
+
+  embedding_dim: 1536
+  hidden_dim: 1536
+  audio_embedding_dim: 1536  # Can set a smaller dimension for audio embeddings to reduce parameters. Set equal to hidden_dim for no projection.
+  codecmodel_path: ???
+
+  # Local transformer parameters for autoregressive codebook prediction within a frame
+  local_transformer_type: "autoregressive" # "none", "autoregressive"
+  # The args below are only relevant if local_transformer_type is "autoregressive"
+  local_transformer_loss_scale: 1.0
+  phoneme_loss_weight: 1.0
+  local_transformer_n_layers: 3
+  local_transformer_n_heads: 12
+  local_transformer_hidden_dim: 1536
+
+  cfg_unconditional_prob: 0.05
+
+  # Multi-mode training configuration
+  training_modes:
+    - text_input_mode: "streaming" # Options: "full", "streaming"
+      streaming_phonemes_delay: 0
+      streaming_speech_delay: 1
+
+  frame_stacking_factor: 2
+  phoneme_stacking_factor: 1
+  phoneme_confidence_unk_threshold: 0.0 # If max phoneme probability is below this threshold at inference-time, replace the predicted timestep with UNK to reduce error propagation.
+  dropout_text_input_prob: 0.1
+  phoneme_corruption_batch_prob: 0.1
+  phoneme_corruption_timestep_ratio: 0.15
+  phoneme_corruption_unk_mode_prob: 0.5
+  phoneme_corruption_type: "repeat_skip_unk" # "repeat_skip_unk" or "complete_channel"
+  phoneme_turn_dropout_batch_prob: 0.0  # prob of applying turn dropout to a sample
+  phoneme_turn_dropout_turn_prob: 0.0  # prob of dropping each phoneme turn within a sample
+  phoneme_turn_max_words_to_drop: 0  # turns with <= this many words keep phoneme tokens as pad_id
+
+  phoneme_tokenizer:
+    _target_: nemo.collections.common.tokenizers.text_to_speech.tts_tokenizers.IPABPETokenizer
+    tokenizer_path: ???
+
+  text_tokenizers:
+    nemotron_nano_30b:
+      _target_: AutoTokenizer
+      pretrained_model: "nvidia/NVIDIA-Nemotron-3-Nano-30B-A3B-BF16"
+
+  train_ds:
+    use_lhotse: ${model.use_lhotse}
+    volume_norm: true
+    dataset:
+      multi_config: true
+      shuffle: true
+      seed: 42
+      shard_seed: "trng"
+
+      sampler_fusion: randomized_round_robin
+      sampler_weights:
+        tts_data: 0.5
+        duplex_data: 0.5
+      tts_data:  # Sampler group; key matches sampler_weights.tts_data above.
+        min_duration: 0.2
+        min_context_speaker_similarity: 0.6
+        max_cer: 0.03
+        batch_duration: ???  # Required. In seconds; adjust based on your GPU memory.
+        quadratic_duration: ${quadratic_duration}  # Interpolated from the top-level quadratic_duration key.
+        use_bucketing: true
+        num_buckets: 20
+        bucket_buffer_size: 10_000
+        shuffle_buffer_size: 10_000
+        num_cuts_for_bins_estimate: 10_000
+        shard_seed: "trng"
+        drop_last: true
+        shuffle: true
+        num_workers: 6
+        pin_memory: true
+
+        input_cfg:
+        - type: lhotse_shar
+          shar_path: ???  # Required.
+          weight: 1.0
+          tags:
+            tokenizer_names: ["english_phoneme"]  # NOTE(review): model.text_tokenizers in this file only defines "nemotron_nano_30b" - confirm "english_phoneme" resolves.
+
+      duplex_data:  # Sampler group; key matches sampler_weights.duplex_data above.
+        input_cfg: ???  # Required: path to the duplex dataset input config YAML in your environment.
+        use_bucketing: true
+        num_buckets: 20
+        bucket_buffer_size: 1_000
+        shuffle_buffer_size: 1_000
+        num_cuts_for_bins_estimate: 1_000
+        max_duration: 300  # 5 min max duration
+        bucket_duration_bins: [4.0, 8.9, 10.2, 11.6, 13.2, 15.0, 17.0, 19.3, 25.0, 31.5, 38.5, 46.0, 55.5, 66.5, 79.5, 93.3, 110.0, 130.0, 156.8, 203.3]
+        bucket_batch_size:    [75, 33, 29, 25, 23, 20, 18, 15, 12, 10, 8, 7, 5, 4, 3, 3, 2, 2, 1, 1]
+
+
+  validation_ds:
+    use_lhotse: ${model.use_lhotse}
+    volume_norm: true
+
+    dataset:
+      min_duration: 0.2
+      min_context_speaker_similarity: 0.6
+      max_cer: 0.03
+      batch_duration: ???   # Required. Prefer a smaller batch_duration for validation than for training.
+      quadratic_duration: ${quadratic_duration}
+      use_bucketing: false
+      force_finite: true
+      force_map_dataset: true
+      drop_last: false
+      shuffle: false
+      num_workers: 2
+      pin_memory: true
+      seed: 42
+      shard_seed: "randomized"
+
+      input_cfg:
+      - type: lhotse_shar
+        shar_path: ???
+        weight: 1.0
+        tags:
+          tokenizer_names: ["english_phoneme"]  # NOTE(review): model.text_tokenizers in this file only defines "nemotron_nano_30b" - confirm "english_phoneme" resolves.
+
+  optim:
+    _target_: torch.optim.AdamW
+    lr: 1e-4
+
+    sched:
+      name: ExponentialLR
+      gamma: 0.998
+
+trainer:
+  num_nodes: 1
+  devices: -1
+  accelerator: gpu
+  strategy: ddp_find_unused_parameters_true
+  precision: bf16-mixed
+  max_steps: ???  # Required.
+  accumulate_grad_batches: 1
+  enable_checkpointing: false # Provided by exp_manager
+  logger: false # Provided by exp_manager
+  log_every_n_steps: 100
+  limit_train_batches: 1_000
+  val_check_interval: 1_000
+  num_sanity_val_steps: 0
+  benchmark: false
+  use_distributed_sampler: false  # required because Lhotse has its own handling
+  gradient_clip_val: 2.5
+
+exp_manager:
+  exp_dir: null
+  name: ${name}
+  create_tensorboard_logger: true
+  create_wandb_logger: false
+  wandb_logger_kwargs:
+    entity: null
+    name: ${name}
+    project: null
+    group: null
+    resume: true
+  create_checkpoint_callback: true
+  checkpoint_callback_params:
+    monitor: val_loss
+    mode: min
+    save_top_k: 5
+    save_best_model: true
+    always_save_nemo: true
+    filename: '${name}--{${exp_manager.checkpoint_callback_params.monitor}:.4f}-{step}-{epoch}'
+  resume_if_exists: true
+  resume_ignore_no_checkpoint: true
diff --git a/examples/tts/easy_magpietts.py b/examples/tts/easy_magpietts.py
@@ -55,6 +55,9 @@ def main(cfg):
     else:
         raise NotImplementedError(f"Only train, onlinepo_train and test modes are supported. Got {mode}")
 
+    if cfg.get("pretrained_model", None):
+        model.restore_from_pretrained_checkpoint(cfg.pretrained_model)
+
     model.maybe_init_from_pretrained_checkpoint(cfg=cfg)
 
     if mode in ['train', 'onlinepo_train']: