diff --git a/examples/configs/sft_avlm.yaml b/examples/configs/sft_avlm.yaml
new file mode 100644
index 0000000000..85e7d25414
--- /dev/null
+++ b/examples/configs/sft_avlm.yaml
@@ -0,0 +1,29 @@
+defaults:
+  - sft_vlm_3B.yaml
+
+sft:
+  val_batches: 2
+  val_global_batch_size: 8
+
+policy:
+  max_total_sequence_length: 32768
+  train_global_batch_size: 8
+  dtensor_cfg:
+    tensor_parallel_size: 1
+  dynamic_batching:
+    enabled: true
+  tokenizer:
+    video:
+      num_frames: 16
+
+data:
+  # dataset
+  train:
+    dataset_name: daily-omni
+    split: train
+    split_validation_size: 0.05  # use 5% of the training data as validation data
+    seed: 42  # seed for train/validation split when split_validation_size > 0
+  validation: null
+  # default settings for all datasets
+  default:
+    prompt_file: null
diff --git a/examples/run_sft.py b/examples/run_sft.py
index 45fb43036f..4e80414f8d 100644
--- a/examples/run_sft.py
+++ b/examples/run_sft.py
@@ -66,6 +66,7 @@ def setup_data(tokenizer: AutoTokenizer, data_config: DataConfig):
     print("\n▶ Setting up data...")
     # setup train dataset
     task_data_processors = {}
+    task_data_preprocessors = {}
     data_list = []
 
     if isinstance(data_config["train"], dict):
@@ -85,6 +86,8 @@ def setup_data(tokenizer: AutoTokenizer, data_config: DataConfig):
             add_generation_prompt=data_config["add_generation_prompt"],
         )
         task_data_processors[data.task_name] = (data.task_spec, data_processor)
+        if hasattr(data, "preprocessor") and data.preprocessor is not None:
+            task_data_preprocessors[data.task_name] = data.preprocessor
 
     merged_data = concatenate_datasets([data.dataset for data in data_list])
     dataset = AllTaskProcessedDataset(
@@ -92,12 +95,14 @@ def setup_data(tokenizer: AutoTokenizer, data_config: DataConfig):
         tokenizer,
         None,
         task_data_processors,
+        task_data_preprocessors=task_data_preprocessors,
         max_seq_length=data_config["max_input_seq_length"],
     )
     print(f"  ✓ Training dataset loaded with {len(dataset)} samples.")
 
     # setup validation dataset
     val_task_data_processors = {}
+    val_task_data_preprocessors = {}
     val_data_list = []
 
     # validation dataset from train dataset (when train dataset's split_validation_size > 0)
@@ -107,6 +112,10 @@ def setup_data(tokenizer: AutoTokenizer, data_config: DataConfig):
             # bind task_name to task_data_processors
             task_name = data.task_name
             val_task_data_processors[task_name] = task_data_processors[task_name]
+            if task_name in task_data_preprocessors:
+                val_task_data_preprocessors[task_name] = task_data_preprocessors[
+                    task_name
+                ]
 
     # validation dataset from config
     if "validation" in data_config and data_config["validation"] is not None:
@@ -130,6 +139,8 @@ def setup_data(tokenizer: AutoTokenizer, data_config: DataConfig):
                 val_data.task_spec,
                 val_data_processor,
             )
+            if hasattr(val_data, "preprocessor") and val_data.preprocessor is not None:
+                val_task_data_preprocessors[val_data.task_name] = val_data.preprocessor
 
     val_dataset = None
     if len(val_data_list) > 0:
@@ -139,6 +150,7 @@ def setup_data(tokenizer: AutoTokenizer, data_config: DataConfig):
             tokenizer,
             None,
             val_task_data_processors,
+            task_data_preprocessors=val_task_data_preprocessors,
             max_seq_length=data_config["max_input_seq_length"],
         )
         print(f"  ✓ Validation dataset loaded with {len(val_dataset)} samples.")
diff --git a/nemo_rl/algorithms/utils.py b/nemo_rl/algorithms/utils.py
index cc99033aba..8e632ca5ee 100644
--- a/nemo_rl/algorithms/utils.py
+++ b/nemo_rl/algorithms/utils.py
@@ -320,6 +320,39 @@ def get_tokenizer(
         processor.bos_token_id = tokenizer.bos_token_id
         # copy name_or_path from tokenizer to processor for logging
         processor.name_or_path = tokenizer.name_or_path
+        if hasattr(processor, "feature_extractor") and "audio" in tokenizer_config:
+            if (
+                "sampling_rate" in tokenizer_config["audio"]
+                and tokenizer_config["audio"]["sampling_rate"]
+                != processor.feature_extractor.sampling_rate
+            ):
+                new_sampling_rate = tokenizer_config["audio"]["sampling_rate"]
+                warnings.warn(
+                    f"Overriding audio sampling rate from {processor.feature_extractor.sampling_rate} to {new_sampling_rate}"
+                )
+                processor.feature_extractor.sampling_rate = new_sampling_rate
+        if hasattr(processor, "video_processor") and "video" in tokenizer_config:
+            if (
+                "fps" in tokenizer_config["video"]
+                and tokenizer_config["video"]["fps"] != processor.video_processor.fps
+            ):
+                # override the video loading fps
+                new_fps = tokenizer_config["video"]["fps"]
+                warnings.warn(
+                    f"Overriding video fps from {processor.video_processor.fps} to {new_fps}"
+                )
+                processor.video_processor.fps = new_fps
+            # fps and num_frames cannot co-exist, but let it crash later
+            if (
+                "num_frames" in tokenizer_config["video"]
+                and tokenizer_config["video"]["num_frames"]
+                != processor.video_processor.num_frames
+            ):
+                new_num_frames = tokenizer_config["video"]["num_frames"]
+                warnings.warn(
+                    f"Overriding video num_frames from {processor.video_processor.num_frames} to {new_num_frames}"
+                )
+                processor.video_processor.num_frames = new_num_frames
 
     return tokenizer if processor is None else processor
 
diff --git a/nemo_rl/data/datasets/processed_dataset.py b/nemo_rl/data/datasets/processed_dataset.py
index add422e199..1971e7a12f 100644
--- a/nemo_rl/data/datasets/processed_dataset.py
+++ b/nemo_rl/data/datasets/processed_dataset.py
@@ -21,6 +21,7 @@
 from nemo_rl.data.datasets.utils import assert_no_double_bos
 from nemo_rl.data.interfaces import (
     DatumSpec,
+    TaskDataPreProcessFnCallable,
     TaskDataProcessFnCallable,
     TaskDataSpec,
 )
@@ -52,6 +53,9 @@ def __init__(
             dict[str, tuple[TaskDataSpec, TaskDataProcessFnCallable]]
             | TaskDataProcessFnCallable
         ),
+        task_data_preprocessors: Optional[
+            Union[dict[str, TaskDataPreProcessFnCallable], TaskDataPreProcessFnCallable]
+        ] = None,
         max_seq_length: Optional[int] = None,
     ):
         self.dataset = dataset
@@ -59,6 +63,7 @@ def __init__(
         # TODO @yukih: will be removed once eval datasets are adapted
         self.default_task_data_spec = default_task_data_spec
         self.task_data_processors = task_data_processors
+        self.task_data_preprocessors = task_data_preprocessors
         self.max_seq_length = max_seq_length
         self._bos_checked = False
 
@@ -95,6 +100,20 @@ def __getitem__(self, idx: int) -> DatumSpec:
         """Return a single prompt."""
         entry = self.dataset[idx]
 
+        # preprocessing
+        task_data_preprocessor = None
+        if self.task_data_preprocessors:
+            if isinstance(self.task_data_preprocessors, dict):
+                task_name = entry["task_name"]
+                if task_name in self.task_data_preprocessors:
+                    task_data_preprocessor = self.task_data_preprocessors[task_name]
+            else:
+                task_data_preprocessor = self.task_data_preprocessors
+
+        if task_data_preprocessor is not None:
+            entry = task_data_preprocessor(entry)
+
+        # processing
         if isinstance(self.task_data_processors, dict):
             task_name = entry["task_name"]
 
diff --git a/nemo_rl/data/datasets/raw_dataset.py b/nemo_rl/data/datasets/raw_dataset.py
index decd722736..f425cef8d3 100644
--- a/nemo_rl/data/datasets/raw_dataset.py
+++ b/nemo_rl/data/datasets/raw_dataset.py
@@ -15,7 +15,11 @@
 from datasets import Dataset
 
 from nemo_rl.data import PreferenceDatasetConfig, ResponseDatasetConfig
-from nemo_rl.data.interfaces import TaskDataProcessFnCallable, TaskDataSpec
+from nemo_rl.data.interfaces import (
+    TaskDataPreProcessFnCallable,
+    TaskDataProcessFnCallable,
+    TaskDataSpec,
+)
 from nemo_rl.data.processors import PROCESSOR_REGISTRY
 
 
@@ -27,6 +31,7 @@ class RawDataset:
     val_dataset: Dataset | None
     processor: TaskDataProcessFnCallable
     task_spec: TaskDataSpec
+    preprocessor: TaskDataPreProcessFnCallable | None = None
 
     def split_train_validation(self, test_size: float, seed: int):
         if test_size > 0:
diff --git a/nemo_rl/data/datasets/response_datasets/__init__.py b/nemo_rl/data/datasets/response_datasets/__init__.py
index 961b7b9ba8..eb48bb5204 100644
--- a/nemo_rl/data/datasets/response_datasets/__init__.py
+++ b/nemo_rl/data/datasets/response_datasets/__init__.py
@@ -15,11 +15,15 @@
 from nemo_rl.data import ResponseDatasetConfig
 from nemo_rl.data.datasets.response_datasets.aime24 import AIME2024Dataset
 from nemo_rl.data.datasets.response_datasets.clevr import CLEVRCoGenTDataset
+from nemo_rl.data.datasets.response_datasets.daily_omni import DailyOmniDataset
 from nemo_rl.data.datasets.response_datasets.dapo_math import (
     DAPOMath17KDataset,
     DAPOMathAIME2024Dataset,
 )
 from nemo_rl.data.datasets.response_datasets.deepscaler import DeepScalerDataset
+from nemo_rl.data.datasets.response_datasets.general_conversations_dataset import (
+    GeneralConversationsJsonlDataset,
+)
 from nemo_rl.data.datasets.response_datasets.geometry3k import Geometry3KDataset
 from nemo_rl.data.datasets.response_datasets.helpsteer3 import HelpSteer3Dataset
 from nemo_rl.data.datasets.response_datasets.nemogym_dataset import NemoGymDataset
@@ -39,6 +43,8 @@
     # built-in datasets
     "AIME2024": AIME2024Dataset,
     "clevr-cogent": CLEVRCoGenTDataset,
+    "daily-omni": DailyOmniDataset,
+    "general-conversation-jsonl": GeneralConversationsJsonlDataset,
     "DAPOMath17K": DAPOMath17KDataset,
     "DAPOMathAIME2024": DAPOMathAIME2024Dataset,
     "DeepScaler": DeepScalerDataset,
@@ -84,6 +90,8 @@ def load_response_dataset(data_config: ResponseDatasetConfig):
 __all__ = [
     "AIME2024Dataset",
     "CLEVRCoGenTDataset",
+    "DailyOmniDataset",
+    "GeneralConversationsJsonlDataset",
     "DAPOMath17KDataset",
     "DAPOMathAIME2024Dataset",
     "DeepScalerDataset",
diff --git a/nemo_rl/data/datasets/response_datasets/daily_omni.py b/nemo_rl/data/datasets/response_datasets/daily_omni.py
new file mode 100644
index 0000000000..b2307e337f
--- /dev/null
+++ b/nemo_rl/data/datasets/response_datasets/daily_omni.py
@@ -0,0 +1,140 @@
+# Copyright (c) 2025, NVIDIA CORPORATION.  All rights reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+import os
+from typing import Any
+
+from huggingface_hub import snapshot_download
+
+from nemo_rl.data.datasets.raw_dataset import RawDataset
+from nemo_rl.data.datasets.utils import (
+    get_huggingface_cache_path,
+    load_dataset_from_path,
+)
+
+
+class DailyOmniDataset(RawDataset):
+    """Wrapper around the Daily-Omni video question-answering dataset.
+
+    Args:
+        split: Split name for the dataset; only "train" is supported
+        split_validation_size: Forwarded to `split_train_validation` as `test_size`
+        seed: Forwarded to `split_train_validation` as `seed`
+    """
+
+    task_name = "daily-omni"
+
+    def __init__(
+        self,
+        split: str = "train",
+        split_validation_size: float = 0,
+        seed: int = 42,
+        **kwargs,
+    ):
+        # only the "train" split is available for this dataset
+        SPLIT_TO_HF_NAME = {
+            "train": "liarliar/Daily-Omni",
+        }
+        if split not in SPLIT_TO_HF_NAME:
+            raise ValueError(f"Invalid split: {split}. Please use 'train'.")
+
+        self.hf_cache_dir = get_huggingface_cache_path(SPLIT_TO_HF_NAME[split])
+        if not self.hf_cache_dir:
+            # download the dataset
+            self.hf_cache_dir = snapshot_download(
+                repo_id=SPLIT_TO_HF_NAME[split], repo_type="dataset"
+            )
+        if not self.hf_cache_dir:
+            raise ValueError("Cannot download DailyOmniDataset.")
+
+        json_file = os.path.join(self.hf_cache_dir, "qa.json")
+        if not os.path.isfile(json_file):
+            raise ValueError(f"{json_file} cannot be found.")
+
+        files_folder = os.path.join(self.hf_cache_dir, "Videos")
+        if not os.path.isdir(files_folder):
+            # prepare the dataset (TODO: move untar, unzip func to utils?)
+            import tarfile
+
+            archive_filename = os.path.join(self.hf_cache_dir, "Videos.tar")
+            if not os.path.isfile(archive_filename):
+                raise ValueError(f"{archive_filename} cannot be found.")
+            try:
+                with tarfile.open(archive_filename, "r:*") as tar:
+                    # NOTE(review): extracts without a `filter=`; consider filter="data"
+                    tar.extractall(path=self.hf_cache_dir)
+            except tarfile.ReadError as e:
+                raise tarfile.ReadError(
+                    "Error: Could not read the tar file. It might be corrupted or not a tar file."
+                ) from e
+            except Exception as e:
+                raise Exception(f"An unexpected error occurred: {e}") from e
+            # checked outside the `try` so this ValueError is not re-wrapped above
+            if not os.path.isdir(files_folder):
+                raise ValueError(
+                    f"Cannot find the extracted folder {files_folder}. Extraction failed."
+                )
+            print(f"Successfully extracted '{archive_filename}' to '{files_folder}'")
+
+        self.dataset = load_dataset_from_path(json_file)
+
+        # tag every row with the task name (read back as entry["task_name"])
+        self.dataset = self.dataset.add_column(
+            "task_name", [self.task_name] * len(self.dataset)
+        )
+
+        self.preprocessor = self.format_data
+
+        # `self.val_dataset` is used (not None) only when current dataset is used for both training and validation
+        self.val_dataset = None
+        self.split_train_validation(split_validation_size, seed)
+
+    @classmethod
+    def get_prompt(cls, data: dict[str, Any]) -> str:
+        """Build the multiple-choice prompt from `data["Question"]` and `data["Choice"]`."""
+        # WARNING: model could have preference of a different prompt
+        prompt = data["Question"] + "\n" + "\n".join(data["Choice"])
+        candidate_answers = [chr(ord("A") + idx) for idx in range(len(data["Choice"]))]
+        candidate_answers_all_but_last = ",".join(candidate_answers[:-1])
+        prompt += (
+            "\n"
+            + "Your replies must contain only a single letter "
+            + f"(either {candidate_answers_all_but_last} or {candidate_answers[-1]})."
+        )
+        return prompt
+
+    def format_data(self, data: dict[str, Any]) -> dict[str, Any]:
+        """Convert one raw row into a user (video + prompt) / assistant (answer) log."""
+        user_content = [
+            {
+                "type": "video",
+                "video": os.path.join(
+                    self.hf_cache_dir,
+                    "Videos",
+                    data["video_id"],
+                    data["video_id"] + "_video.mp4",
+                ),
+            },
+            {
+                "type": "text",
+                "text": self.get_prompt(data),
+            },
+        ]
+        return {
+            "messages": [
+                {"role": "user", "content": user_content},
+                {"role": "assistant", "content": data["Answer"]},
+            ],
+            "task_name": self.task_name,
+        }
diff --git a/nemo_rl/data/datasets/response_datasets/general_conversations_dataset.py b/nemo_rl/data/datasets/response_datasets/general_conversations_dataset.py
new file mode 100644
index 0000000000..10651c8490
--- /dev/null
+++ b/nemo_rl/data/datasets/response_datasets/general_conversations_dataset.py
@@ -0,0 +1,268 @@
+# Copyright (c) 2025, NVIDIA CORPORATION.  All rights reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+import os
+import re
+import warnings
+from collections import defaultdict
+from functools import partial
+from typing import Any, Callable, Dict, Optional
+
+from nemo_rl.data import multimodal_utils
+from nemo_rl.data.datasets.raw_dataset import RawDataset
+from nemo_rl.data.datasets.utils import load_dataset_from_path
+
+# map the sender names found in raw samples to the allowed roles ("user" / "assistant")
+conversation_sender_mapping_sample_to_allowed = {
+    "human": "user",
+    "gpt": "assistant",
+    "agent": "assistant",
+}
+
+
+# normalize the media keys and media tags of a raw sample to their allowed names
+def convert_metadata(metadata: Dict[str, Any]):
+    """Copy `metadata`, mapping media keys and in-message tags to their allowed names."""
+    data = metadata.copy()
+
+    for tag in multimodal_utils.MEDIA_TAGS_TO_ALLOWED:
+        if tag in data:
+            tag_mapped = multimodal_utils.MEDIA_TAGS_TO_ALLOWED[tag]
+            if tag_mapped not in data:
+                data[tag_mapped] = data.pop(tag)
+            else:
+                warnings.warn(
+                    f"Trying to map {tag} to {tag_mapped}, but {tag_mapped} already exists in the raw data. Mapping is not carried out."
+                )
+
+    # shallow copy above: build new message dicts so the caller's nested data stays intact
+    converted = []
+    for message in data["conversations"]:
+        msg_str = message["value"]
+        for tag in multimodal_utils.MEDIA_TAGS_TO_ALLOWED:
+            tag_str = "<" + tag + ">"
+            if tag_str in msg_str:
+                allowed = multimodal_utils.MEDIA_TAGS_TO_ALLOWED[tag]
+                msg_str = msg_str.replace(tag_str, multimodal_utils.MEDIA_TAGS[allowed])
+        converted.append({**message, "value": msg_str})
+    data["conversations"] = converted
+
+    return data
+
+
+def conversation_process_message(
+    metadata: Dict[str, Any],
+    message: Dict[str, str],
+    media_index: dict,
+    raw: Optional[Dict[str, Any]] = None,
+    allow_empty_text: bool = False,
+    check_if_media_file_exist: bool = True,
+    tried_default_extensions: Optional[set] = None,
+    process_message_fragment: Callable = lambda tag, fragment: [{tag: fragment}],
+) -> list[Dict[str, Any]]:
+    """Split one conversation message string into a list of media and text fragments.
+
+    Args:
+        metadata: sample dictionary; must hold a key for every media tag in the message.
+            Non-list media entries are wrapped into a list in place.
+        message: conversation turn whose "value" string is split on the media tags.
+        media_index: per-tag count of media entries consumed so far (updated in place).
+        raw: dictionary with all webdataset compliant keys of a sample.
+            Empty for jsonl dataset, non-empty otherwise.
+    """
+    raw = {} if raw is None else raw
+    if tried_default_extensions is None:
+        tried_default_extensions = set()
+    media_tag_strings = multimodal_utils.MEDIA_TAGS.values()
+    pieces = re.split(multimodal_utils.MEDIA_TAG_PATTERN, message["value"])
+    fragments = []
+    has_text = False
+    for piece in pieces:
+        if piece not in media_tag_strings:
+            # plain text; whitespace-only pieces are skipped
+            if piece.strip():
+                fragments.extend(process_message_fragment("text", piece))
+                has_text = True
+            continue
+
+        # media tag: pair it with the next unused media entry of that modality
+        tag = multimodal_utils.MEDIA_TAGS_REVERSED[piece]
+        if tag not in metadata:
+            raise ValueError(
+                f"{piece} is found in the message, but no corresponding {tag} key can be found in {metadata}"
+            )
+        if not isinstance(metadata[tag], list):
+            metadata[tag] = [metadata[tag]]
+        media_ref = metadata[tag][media_index[tag]]
+        # try to extract the media object from the shard
+        basename = os.path.basename(media_ref)
+        ext = basename.split(".", 1)[1] if "." in basename else ""
+        if (
+            raw
+            and ext not in raw
+            and ext not in tried_default_extensions
+            and tag in multimodal_utils.DEFAULT_MEDIA_EXTENSIONS
+        ):
+            # try the default extension
+            for ext in multimodal_utils.DEFAULT_MEDIA_EXTENSIONS[tag]:
+                if ext in raw:
+                    tried_default_extensions.add(ext)
+                    break
+        if ext in raw:
+            media_file = ext
+        elif isinstance(media_ref, str) and os.path.isfile(media_ref):
+            # if cannot get it from the shard files, try to find the local file
+            media_file = media_ref
+        elif check_if_media_file_exist:
+            sample_to_print = raw if raw else metadata
+            raise ValueError(
+                f"Cannot find the media file {media_ref} from {sample_to_print} or locally."
+            )
+        else:
+            media_file = media_ref
+        media_index[tag] += 1
+        fragments.extend(process_message_fragment(tag, media_file))
+
+    if not allow_empty_text and not has_text:
+        fragments.extend(process_message_fragment("text", " "))
+
+    return fragments
+
+
+class GeneralConversationsJsonlDataset(RawDataset):
+    """Loads general conversation datasets whose json (manifest) files and media files are stored separately (jsonl datasets).
+
+    Each sample is a single- or multi-turn conversation that may mix several modalities.
+    Each modality can have one or more media objects.
+    Media tags (e.g. '<sound>') may appear anywhere in the conversations.
+
+    The structure of the jsonl files could be like this.
+
+    Example media filenames::
+
+        sample_000001.2345ew.flac
+        sample_000001.35tags.mp4
+        sample_000001.as23ds.jpg
+        sample_000001.gd1dtg.wav
+        sample_000001.gds233.jpg
+        sample_000002.asf234.wav
+        ...
+
+    Example JSON structure::
+
+        {
+          "sound": ["sample_000001.2345ew.flac", "sample_000001.gd1dtg.wav"],
+          "video": "sample_000001.35tags.mp4",
+          "image": ["sample_000001.as23ds.jpg", "sample_000001.gds233.jpg"],
+          "conversations": [
+            {
+              "from": "user",
+              "value": "<sound>"
+            },
+            {
+              "from": "assistant",
+              "value": "Automatic speech recognition is a technology that allows computers to recognize and transcribe spoken language. In the NeMo Framework, ASR is used for tasks such as speech-to-text and voice recognition."
+            },
+            {
+              "from": "user",
+              "value": "Describe what is NeMo based on the tutorial video: <video> and the information in the two images: <image> <image>. Combine that information with sound <sound>. Answer: "
+            },
+            {
+              "from": "assistant",
+              "value": "The NeMo Framework provides a range of tools and features for training and deploying ASR models, including model parallelism, data parallelism, and distributed checkpointing. This allows for faster training and inference times, as well as improved model accuracy and reliability."
+            }
+          ]
+        }
+    """
+
+    task_name = "general-conversation-jsonl"
+
+    def __init__(
+        self,
+        data_path: str,
+        media_data_dir: Optional[str] = None,
+        split_validation_size: float = 0,
+        seed: int = 42,
+        **kwargs,
+    ):
+        self.media_data_dir = media_data_dir
+        # the preprocessor looks up non-existing media paths under `media_data_dir`
+        self.preprocessor = partial(
+            self._datum_preprocessor, media_directory=media_data_dir
+        )
+
+        raw_dataset = load_dataset_from_path(data_path)
+        task_names = [self.task_name] * len(raw_dataset)
+        self.dataset = raw_dataset.add_column("task_name", task_names)
+
+        # `self.val_dataset` is used (not None) only when current dataset is used for both training and validation
+        self.val_dataset = None
+        self.split_train_validation(split_validation_size, seed)
+
+    @classmethod
+    def process_message_fragment(
+        cls, tag: str, fragment: Any, media_directory: Optional[str] = None
+    ) -> list[dict[str, Any]]:
+        """Return one `{"type": t, t: fragment}` dict per "-"-separated part of `tag`."""
+        unresolved_media = (
+            media_directory is not None
+            and tag in multimodal_utils.MEDIA_TAGS
+            and isinstance(fragment, str)
+            and not os.path.isfile(fragment)
+        )
+        if unresolved_media:
+            # use the path under `media_directory` only when that file exists
+            candidate = os.path.join(media_directory, fragment)
+            if os.path.isfile(candidate):
+                fragment = candidate
+        return [{"type": t, t: fragment} for t in tag.split("-")]
+
+    @classmethod
+    def _datum_preprocessor(
+        cls, example: dict[str, Any], media_directory: Optional[str] = None
+    ) -> dict[str, list[dict[str, Any]]]:
+        """Build an OpenAI-API-like message log from one raw jsonl sample."""
+        messages = []
+        processed_example = {"messages": messages, "task_name": cls.task_name}
+        if "conversations" not in example:
+            return processed_example
+
+        # state shared by all turns of this sample
+        media_index = defaultdict(int)
+        tried_default_extensions = set()
+        data = convert_metadata(example)
+        fragment_builder = partial(
+            cls.process_message_fragment, media_directory=media_directory
+        )
+
+        for message in data["conversations"]:
+            sender = message["from"]
+            if sender in {"user", "assistant"}:
+                role = sender
+            else:
+                role = conversation_sender_mapping_sample_to_allowed.get(sender)
+                if role is None:
+                    raise ValueError(f"Unknown conversation role: {message['from']}")
+            content = conversation_process_message(
+                data,
+                message,
+                media_index,
+                allow_empty_text=True,
+                check_if_media_file_exist=False,
+                tried_default_extensions=tried_default_extensions,
+                process_message_fragment=fragment_builder,
+            )
+            messages.append({"role": role, "content": content})
+
+        return processed_example
diff --git a/nemo_rl/data/datasets/utils.py b/nemo_rl/data/datasets/utils.py
index 3a7d269c71..5cb407dbdd 100644
--- a/nemo_rl/data/datasets/utils.py
+++ b/nemo_rl/data/datasets/utils.py
@@ -15,10 +15,12 @@
 import base64
 import io
 import os
+from pathlib import Path
 from typing import Optional, Union
 
 import torch
 from datasets import DatasetDict, load_dataset, load_from_disk
+from huggingface_hub.utils._cache_manager import _scan_cached_repo
 from PIL import Image
 from transformers import AutoProcessor, PreTrainedTokenizerBase
 
@@ -141,3 +143,34 @@ def extract_necessary_env_names(data_config: dict) -> list[str]:
         ):
             necessary_env_names.add(data_config[key]["env_name"])
     return list(necessary_env_names)
+
+
+def get_huggingface_cache_path(repo_id, branch="main", repo_type="datasets"):
+    """Return the most recently modified cached snapshot dir of `repo_id`, or None.
+
+    The hub cache dir follows huggingface_hub's precedence: `HF_HUB_CACHE`, then
+    `HUGGINGFACE_HUB_CACHE`, then `$HF_HOME/hub`, then `~/.cache/huggingface/hub`.
+    `branch=None` keeps every cached revision; failures are printed and yield None.
+    """
+    try:
+        env = os.environ
+        cache_path = env.get("HF_HUB_CACHE") or env.get("HUGGINGFACE_HUB_CACHE")
+        if not cache_path:
+            hf_home = env.get("HF_HOME") or os.path.expanduser("~/.cache/huggingface")
+            cache_path = os.path.join(hf_home, "hub")
+        if not os.path.isdir(cache_path):
+            return None
+        org, repo_name = repo_id.split("/")
+        repo_path = Path(cache_path) / f"{repo_type}--{org}--{repo_name}"
+        # a list (not a dict keyed by refs) keeps revisions that share identical refs
+        revisions = list(_scan_cached_repo(repo_path=repo_path).revisions)
+        if branch is not None:
+            revisions = [r for r in revisions if branch in r.refs]
+        if not revisions:
+            return None
+        newest = max(revisions, key=lambda r: r.last_modified)
+        return str(newest.snapshot_path)
+    except Exception as e:
+        # best-effort lookup: report the problem and signal "not cached"
+        print(f"{type(e)}: {e}")
+        return None
diff --git a/nemo_rl/data/interfaces.py b/nemo_rl/data/interfaces.py
index 257741b8ca..f3f88b3b5e 100644
--- a/nemo_rl/data/interfaces.py
+++ b/nemo_rl/data/interfaces.py
@@ -101,3 +101,10 @@ def __call__(
         idx: int,
     ) -> DatumSpec:
         raise NotImplementedError("Task data process not implemented")
+
+
+class TaskDataPreProcessFnCallable(Protocol):
+    """A callable that converts a loaded raw datum dictionary into the dictionary format required for further processing."""
+
+    def __call__(self, datum_dict: dict[str, Any]) -> dict[str, Any]:
+        raise NotImplementedError("Task data preprocess not implemented")
diff --git a/nemo_rl/data/llm_message_utils.py b/nemo_rl/data/llm_message_utils.py
index c0572ce3a1..a66403aefd 100644
--- a/nemo_rl/data/llm_message_utils.py
+++ b/nemo_rl/data/llm_message_utils.py
@@ -25,7 +25,10 @@
 )
 from nemo_rl.data.multimodal_utils import (
     PackedTensor,
+    get_dim_to_pack_along,
+    get_multimodal_default_settings_from_processor,
     get_multimodal_keys_from_processor,
+    load_media_from_message,
 )
 from nemo_rl.distributed.batched_data_dict import BatchedDataDict
 
@@ -422,24 +425,6 @@ def get_first_index_that_differs(str1: str, str2: str) -> int:
     return min(len(str1), len(str2))
 
 
-def get_images_from_message(message: dict[str, Any]) -> list[Any]:
-    """Get all images from a message log item."""
-    # Handle None or missing content (e.g., assistant messages with only tool_calls)
-    if message.get("content") is None:
-        return []
-    # Handle string content (no images)
-    if isinstance(message["content"], str):
-        return []
-    # iterate over the content list
-    images = []
-    for item in message["content"]:
-        if item["type"] == "image":
-            images.extend(list(item["image"])) if isinstance(
-                item["image"], (list, tuple)
-            ) else images.append(item["image"])
-    return images
-
-
 def get_formatted_message_log(
     message_log: LLMMessageLogType,
     tokenizer: TokenizerType,
@@ -469,6 +454,7 @@ def get_formatted_message_log(
     )  # we just use the str:str parts here
 
     multimodal_keys = get_multimodal_keys_from_processor(tokenizer)
+    multimodal_load_kwargs = get_multimodal_default_settings_from_processor(tokenizer)
 
     def _format_content_helper(
         content: Union[str, list[dict[str, Any]]],
@@ -606,28 +592,41 @@ def _format_content_helper(
                     message_chunk += tokenizer.eos_token
 
         # get images too (extend this for other modalities)
-        images_cur_message = get_images_from_message(message)
+        media_cur_message = load_media_from_message(
+            message, multimodal_load_kwargs=multimodal_load_kwargs
+        )
 
         new_message = message.copy()
         # extend this if statement to check for all(len(modality)) == 0 when adding other modalities
-        if len(images_cur_message) == 0:
+        if len(media_cur_message) == 0:
             new_message["token_ids"] = tokenizer(
                 text=message_chunk, return_tensors="pt", add_special_tokens=False
             )["input_ids"][0]
         else:
             # extend the else statement to add other modalities (in this case, tokenizer will be a processor)
+            media_kwargs = {}
+            if "image" in media_cur_message:
+                media_kwargs["images"] = media_cur_message["image"]
+            if "audio" in media_cur_message:
+                media_kwargs["audio"] = media_cur_message["audio"]
+            if "video" in media_cur_message:
+                media_kwargs["videos"] = media_cur_message["video"]
+
             processed_chunk = tokenizer(
                 text=[message_chunk],
-                images=images_cur_message,
                 return_tensors="pt",
                 add_special_tokens=False,
+                **media_kwargs,
             )
             new_message["token_ids"] = processed_chunk["input_ids"][0]
 
             # add all vlm keys to the message
             for key in multimodal_keys:
                 if key in processed_chunk:
-                    new_message[key] = PackedTensor(processed_chunk[key], dim_to_pack=0)
+                    new_message[key] = PackedTensor(
+                        processed_chunk[key],
+                        dim_to_pack=get_dim_to_pack_along(tokenizer, key),
+                    )
 
         if len(new_message["token_ids"]) == 0:
             # if there is an empty message, the empty `token_ids` tensor ends up being in fp32,
diff --git a/nemo_rl/data/multimodal_utils.py b/nemo_rl/data/multimodal_utils.py
index 918c589ad1..0513ec9760 100644
--- a/nemo_rl/data/multimodal_utils.py
+++ b/nemo_rl/data/multimodal_utils.py
@@ -13,13 +13,58 @@
 # limitations under the License.
 
 import base64
+import inspect
+import logging
+import re
+from collections import defaultdict
 from io import BytesIO
-from typing import Optional, Union
+from typing import Any, Optional, Union
 
+import decord
 import requests
 import torch
 from PIL import Image
 from transformers import PreTrainedTokenizerBase
+from transformers.audio_utils import load_audio
+from transformers.video_utils import load_video
+
+# Allowed placeholder strings for the different media types inside dataset text,
+# e.g. "This is an example of <image>"
+MEDIA_TAGS = {
+    "image": "<image>",
+    "video": "<video>",
+    "audio": "<audio>",
+    "video-audio": "<video-audio>",
+}
+MEDIA_TAGS_REVERSED = {v: k for k, v in MEDIA_TAGS.items()}
+
+# Default file extensions per media type; "video" and "video-audio" both list "mp4".
+DEFAULT_MEDIA_EXTENSIONS = {
+    "image": ["png", "jpeg", "jpg", "img"],
+    "video": ["mp4"],
+    "video-audio": ["mp4"],
+    "audio": ["wav", "flac", "mp3"],
+}
+
+# Raw datasets may use different names for a media type;
+# those names need to be mapped to the allowed ones.
+# WARNING: a value must never also appear as a key in this dict, to avoid a cyclic mapping
+MEDIA_TAGS_TO_ALLOWED = {
+    "speech": "audio",
+    "speeches": "audio",
+    "sound": "audio",
+    "audios": "audio",
+    "images": "image",
+    "videos": "video",
+}
+
+
+# One capture group alternating over every media tag: (<image>|<video>|<audio>|<video-audio>)
+MEDIA_TAG_PATTERN = re.compile(
+    r"(" + "|".join(re.escape(tag) for tag in MEDIA_TAGS.values()) + ")"
+)
+
+logger = logging.getLogger(__name__)
 
 
 class PackedTensor:
@@ -174,6 +219,51 @@ def get_multimodal_keys_from_processor(processor) -> list[str]:
     return list(all_keys)
 
 
+def get_multimodal_default_settings_from_processor(
+    processor,
+) -> dict[str, dict[str, Any]]:
+    """Derive default media-loading kwargs from a multimodal processor.
+
+    The result may hold a "video" entry (from `processor.video_processor`) and
+    an "audio" entry (from `processor.feature_extractor`); each keeps only the
+    settings whose names are parameters of `load_video` / `load_audio`.
+    A plain tokenizer yields an empty dict.
+    """
+    if isinstance(processor, PreTrainedTokenizerBase):
+        return {}
+
+    this = get_multimodal_default_settings_from_processor
+    settings = {}
+    if hasattr(processor, "video_processor"):
+        video_cfg = processor.video_processor.to_dict()
+        # fps and num_frames both present but None: fall back to max_frames if set
+        rate_unset = all(
+            key in video_cfg and video_cfg[key] is None
+            for key in ("fps", "num_frames")
+        )
+        if rate_unset and video_cfg.get("max_frames") is not None:
+            video_cfg["num_frames"] = video_cfg["max_frames"]
+        # loader parameter names are cached on the function object after first use
+        if not hasattr(this, "load_video_kwargs"):
+            this.load_video_kwargs = list(inspect.signature(load_video).parameters)
+        settings["video"] = {
+            name: video_cfg[name]
+            for name in this.load_video_kwargs
+            if name in video_cfg
+        }
+    if hasattr(processor, "feature_extractor"):
+        # same filtering, applied to the feature extractor's settings
+        if not hasattr(this, "load_audio_kwargs"):
+            this.load_audio_kwargs = list(inspect.signature(load_audio).parameters)
+        audio_cfg = processor.feature_extractor.to_dict()
+        settings["audio"] = {
+            name: audio_cfg[name]
+            for name in this.load_audio_kwargs
+            if name in audio_cfg
+        }
+    return settings
+
+
 def get_dim_to_pack_along(processor, key: str) -> int:
     """Special considerations for packing certain keys from certain processors.
 
@@ -210,3 +300,90 @@ def resolve_to_image(image_path_or_image: str | Image.Image) -> Image.Image:
     else:
         # Handle local file path
         return Image.open(image_path_or_image).convert("RGB")
+
+
+def get_media_from_message(message: dict[str, Any]) -> dict[str, list[Any]]:
+    """Collect media payloads from a message log item, grouped by content type.
+
+    Returns an empty dict when "content" is missing, None, or a plain string.
+    """
+    content = message.get("content")
+    if content is None or isinstance(content, str):
+        return {}
+    grouped = defaultdict(list)
+    for entry in content:
+        kind = entry["type"]
+        if kind not in MEDIA_TAGS:
+            continue
+        value = entry[kind]
+        # a list/tuple payload contributes each element; anything else is one item
+        grouped[kind].extend(value if isinstance(value, (list, tuple)) else [value])
+    return grouped
+
+
+def load_media_from_message(
+    message: dict[str, Any],
+    processor=None,
+    multimodal_load_kwargs: Optional[dict[str, dict[str, Any]]] = None,
+) -> dict[str, list[Any]]:
+    """Load the media referenced by a message, keyed by "image"/"audio"/"video".
+
+    Images go through `resolve_to_image`. String audio/video entries are loaded
+    with `load_audio` (decord as a fallback) and `load_video`; other audio/video
+    entries are passed through unchanged. When `multimodal_load_kwargs` is empty
+    and `processor` is given, loader kwargs are derived from the processor.
+    Media collected under "video-audio" is not loaded here.
+
+    Raises:
+        ValueError: if an audio path is given without an audio "sampling_rate".
+    """
+    loaded_media = defaultdict(list)
+    media_in_message = get_media_from_message(message)
+
+    if not multimodal_load_kwargs:
+        multimodal_load_kwargs = (
+            get_multimodal_default_settings_from_processor(processor)
+            if processor is not None
+            else {}
+        )
+
+    if "image" in media_in_message:
+        loaded_media["image"] += [
+            resolve_to_image(img) for img in media_in_message["image"]
+        ]
+    if "audio" in media_in_message:
+        audio_kwargs = multimodal_load_kwargs.get("audio", {})
+        for aud in media_in_message["audio"]:
+            if not isinstance(aud, str):
+                loaded_media["audio"].append(aud)
+                continue
+            # the decord fallback reads audio_kwargs["sampling_rate"], so require it up front
+            if "sampling_rate" not in audio_kwargs:
+                raise ValueError(
+                    "multimodal_load_kwargs must include 'audio' with a 'sampling_rate' "
+                    "key to load audio from file path."
+                )
+            try:
+                loaded_media["audio"].append(load_audio(aud, **audio_kwargs))
+            except (RuntimeError, FileNotFoundError, OSError) as e:
+                logger.warning(
+                    "Audio loading failed for %s (%s). Fall back to decord.", aud, e
+                )
+                loaded_audio = decord.AudioReader(
+                    aud, sample_rate=audio_kwargs["sampling_rate"], mono=True
+                )
+                # mono=True gives a single-channel (1, num_samples) array, so take
+                # channel 0 directly instead of indexing with a packing dimension.
+                loaded_media["audio"].append(loaded_audio[:].asnumpy()[0])
+    if "video" in media_in_message:
+        load_video_kwargs = multimodal_load_kwargs.get("video", {})
+        for vid in media_in_message["video"]:
+            if isinstance(vid, str):
+                # seems decord backend loads video faster with multithread ffmpeg and it is easier to install
+                loaded_media["video"].append(
+                    load_video(vid, backend="decord", **load_video_kwargs)[0]
+                )
+            else:
+                loaded_media["video"].append(vid)
+
+    return loaded_media
diff --git a/nemo_rl/data/utils.py b/nemo_rl/data/utils.py
index 4b11e80e7f..7fe335140e 100644
--- a/nemo_rl/data/utils.py
+++ b/nemo_rl/data/utils.py
@@ -84,6 +84,7 @@ def setup_response_data(
     print("\n▶ Setting up data...")
     # setup train dataset
     task_data_processors = {}
+    task_data_preprocessors = {}
     task_to_env = {}
     data_list = []
 
@@ -99,6 +100,8 @@ def setup_response_data(
         # bind task_name to task_data_processors and task_to_env
         task_name = data.task_name
         task_data_processors[task_name] = (data.task_spec, data.processor)
+        if hasattr(data, "preprocessor") and data.preprocessor is not None:
+            task_data_preprocessors[task_name] = data.preprocessor
         if has_envs:
             task_to_env[task_name] = envs[cfg["env_name"]]
 
@@ -108,12 +111,14 @@ def setup_response_data(
         tokenizer,
         None,
         task_data_processors,
+        task_data_preprocessors=task_data_preprocessors,
         max_seq_length=data_config["max_input_seq_length"],
     )
     print(f"  ✓ Training dataset loaded with {len(dataset)} samples.")
 
     # setup validation dataset
     val_task_data_processors = {}
+    val_task_data_preprocessors = {}
     val_task_to_env = {}
     val_data_list = []
 
@@ -124,6 +129,10 @@ def setup_response_data(
             # bind task_name to task_data_processors and task_to_env
             task_name = data.task_name
             val_task_data_processors[task_name] = task_data_processors[task_name]
+            if task_name in task_data_preprocessors:
+                val_task_data_preprocessors[task_name] = task_data_preprocessors[
+                    task_name
+                ]
             if has_envs:
                 val_task_to_env[task_name] = task_to_env[task_name]
 
@@ -144,6 +153,8 @@ def setup_response_data(
                 val_data.task_spec,
                 val_data.processor,
             )
+            if hasattr(val_data, "preprocessor") and val_data.preprocessor is not None:
+                val_task_data_preprocessors[task_name] = val_data.preprocessor
             if has_envs:
                 val_task_to_env[task_name] = envs[cfg["env_name"]]
 
@@ -155,6 +166,7 @@ def setup_response_data(
             tokenizer,
             None,
             val_task_data_processors,
+            task_data_preprocessors=val_task_data_preprocessors,
             max_seq_length=data_config["max_input_seq_length"],
         )
         print(f"  ✓ Validation dataset loaded with {len(val_dataset)} samples.")
@@ -189,12 +201,16 @@ def setup_preference_data(tokenizer: AutoTokenizer, data_config: DataConfig):
         update_single_dataset_config(data_config["train"], data_config["default"])
     data = load_preference_dataset(data_config["train"])
     task_data_processors = {data.task_name: (data.task_spec, preference_preprocessor)}
+    task_data_preprocessors = {}
+    if hasattr(data, "preprocessor") and data.preprocessor is not None:
+        task_data_preprocessors[data.task_name] = data.preprocessor
 
     dataset = AllTaskProcessedDataset(
         data.dataset,
         tokenizer,
         None,
         task_data_processors,
+        task_data_preprocessors=task_data_preprocessors,
         max_seq_length=data_config["max_input_seq_length"],
     )
     print(f"  ✓ Training dataset loaded with {len(dataset)} samples.")
@@ -202,6 +218,7 @@ def setup_preference_data(tokenizer: AutoTokenizer, data_config: DataConfig):
     # setup validation dataset
     # TODO @yukih: unify the code when support multiple datasets for preference dataset
     val_dataset = {}
+    val_task_data_preprocessors = {}
     if "val_data_paths" in data_config and data_config["val_data_paths"]:
         assert isinstance(data_config["val_data_paths"], dict), (
             f"Invalid type for val_data_paths: {type(data_config['val_data_paths'])}. val_data_paths must be a dictionary."
@@ -217,12 +234,17 @@ def setup_preference_data(tokenizer: AutoTokenizer, data_config: DataConfig):
             val_task_data_processors = {
                 val_data.task_name: (val_data.task_spec, preference_preprocessor)
             }
+            if hasattr(val_data, "preprocessor") and val_data.preprocessor is not None:
+                val_task_data_preprocessors = {
+                    val_data.task_name: val_data.preprocessor
+                }
 
             val_dataset[val_dataset_name] = AllTaskProcessedDataset(
                 val_data.dataset,
                 tokenizer,
                 None,
                 val_task_data_processors,
+                task_data_preprocessors=val_task_data_preprocessors,
                 max_seq_length=data_config["max_input_seq_length"],
             )
             print(
@@ -238,12 +260,15 @@ def setup_preference_data(tokenizer: AutoTokenizer, data_config: DataConfig):
         val_task_data_processors = {
             val_data.task_name: (val_data.task_spec, preference_preprocessor)
         }
+        if hasattr(val_data, "preprocessor") and val_data.preprocessor is not None:
+            val_task_data_preprocessors = {val_data.task_name: val_data.preprocessor}
 
         val_dataset["default"] = AllTaskProcessedDataset(
             val_data.dataset,
             tokenizer,
             None,
             val_task_data_processors,
+            task_data_preprocessors=val_task_data_preprocessors,
             max_seq_length=data_config["max_input_seq_length"],
         )
         print(
diff --git a/nemo_rl/models/policy/__init__.py b/nemo_rl/models/policy/__init__.py
index 363399cbca..15f4133b3c 100644
--- a/nemo_rl/models/policy/__init__.py
+++ b/nemo_rl/models/policy/__init__.py
@@ -203,6 +203,9 @@ class TokenizerConfig(TypedDict):
     chat_template: NotRequired[str]
     # Arguments to pass to tokenizer.apply_chat_template(...). This can be used to pass kwargs like enable_thinking=true
     chat_template_kwargs: NotRequired[dict[str, Any] | None]
+    # Multimodal configs
+    audio: NotRequired[dict[str, Any]]
+    video: NotRequired[dict[str, Any]]
 
 
 class PytorchOptimizerConfig(TypedDict):
diff --git a/pyproject.toml b/pyproject.toml
index 7b702d2662..970b6b874b 100644
--- a/pyproject.toml
+++ b/pyproject.toml
@@ -49,6 +49,7 @@ dependencies = [
   "nvidia-nvshmem-cu12; sys_platform == 'linux' and (platform_machine == 'x86_64' or platform_machine == 'aarch64')", # for deep_ep build
   "swanlab",
   "pyzmq",
+  "decord",
   "nvidia-resiliency-ext",
   "nccl4py",                                                                                                          # for non-colocated refit
   "cuda-bindings",                                                                                                    # for non-colocated refit
diff --git a/tests/functional/L1_Functional_Tests_GPU.sh b/tests/functional/L1_Functional_Tests_GPU.sh
index cf6bdca734..2a728cf1d2 100644
--- a/tests/functional/L1_Functional_Tests_GPU.sh
+++ b/tests/functional/L1_Functional_Tests_GPU.sh
@@ -54,6 +54,7 @@ time uv run --no-sync bash ./tests/functional/test_automodel_extra_installed_cor
 # time uv run --no-sync bash ./tests/functional/test_converters.sh
 time uv run --no-sync bash ./tests/functional/test_mcore_extra_installed_correctly.sh
 time uv run --no-sync bash ./tests/functional/vlm_grpo.sh
+time uv run --no-sync bash ./tests/functional/sft_avlm.sh
 
 # Research functional tests (self-discovery)
 for test_script in research/*/tests/functional/*.sh; do
diff --git a/tests/functional/sft_avlm.sh b/tests/functional/sft_avlm.sh
new file mode 100644
index 0000000000..38e6b76e7c
--- /dev/null
+++ b/tests/functional/sft_avlm.sh
@@ -0,0 +1,44 @@
+#!/bin/bash
+# Functional test: 3-step SFT run with sft_avlm.yaml on 2 GPUs, then a train/loss check.
+# Clean up the checkpoint directory on exit.
+trap "rm -rf /tmp/sft_avlm_checkpoints" EXIT
+
+SCRIPT_DIR=$( cd -- "$( dirname -- "${BASH_SOURCE[0]}" )" &> /dev/null && pwd)
+PROJECT_ROOT=$(realpath "$SCRIPT_DIR/../..")
+# Mark the current repo as safe, since wandb fetches metadata about the repo
+git config --global --add safe.directory "$PROJECT_ROOT"
+
+set -eou pipefail
+
+EXP_NAME=$(basename "$0" .sh)
+EXP_DIR="$SCRIPT_DIR/$EXP_NAME"
+LOG_DIR="$EXP_DIR/logs"
+JSON_METRICS="$EXP_DIR/metrics.json"
+RUN_LOG="$EXP_DIR/run.log"
+export PYTHONPATH=${PROJECT_ROOT}:${PYTHONPATH:-}
+
+rm -rf "$EXP_DIR" "$LOG_DIR"
+mkdir -p "$EXP_DIR" "$LOG_DIR"
+
+cd "$PROJECT_ROOT"
+uv run coverage run -a --data-file="$PROJECT_ROOT/tests/.coverage" --source="$PROJECT_ROOT/nemo_rl" \
+    "$PROJECT_ROOT/examples/run_vlm_sft.py" \
+    --config "$PROJECT_ROOT/examples/configs/sft_avlm.yaml" \
+    cluster.gpus_per_node=2 \
+    sft.max_num_steps=3 \
+    policy.train_global_batch_size=2 \
+    sft.val_period=3 \
+    logger.tensorboard_enabled=true \
+    logger.log_dir="$LOG_DIR" \
+    logger.wandb_enabled=false \
+    logger.monitor_gpus=true \
+    checkpointing.enabled=true \
+    checkpointing.checkpoint_dir=/tmp/sft_avlm_checkpoints \
+    "$@" \
+    2>&1 | tee "$RUN_LOG"
+
+uv run tests/json_dump_tb_logs.py "$LOG_DIR" --output_path "$JSON_METRICS"
+
+uv run tests/check_metrics.py "$JSON_METRICS" \
+  'data["train/loss"]["3"] < 4.0'
+
diff --git a/tests/unit/data/datasets/test_general_conversations_dataset.py b/tests/unit/data/datasets/test_general_conversations_dataset.py
new file mode 100644
index 0000000000..9e0cbef19c
--- /dev/null
+++ b/tests/unit/data/datasets/test_general_conversations_dataset.py
@@ -0,0 +1,342 @@
+# Copyright (c) 2025, NVIDIA CORPORATION.  All rights reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+import json
+import tempfile
+
+from nemo_rl.data.datasets import load_response_dataset
+
+
+def create_sample_general_conversation_jsonl_multimodal_interleaved_multiturn():
+    """Write one multi-turn sample with interleaved sound, video-audio and image tags to a temporary jsonl file and return its path."""
+    sample = [
+        {
+            "sound": ["sample_000001.2345ew.flac", "sample_000001.gd1dtg.wav"],
+            "video-audio": "sample_000001.35tags.mp4",
+            "image": ["sample_000001.as23ds.jpg", "sample_000001.gds233.jpg"],
+            "conversations": [
+                {"from": "user", "value": "<sound>"},
+                {
+                    "from": "assistant",
+                    "value": "Automatic speech recognition is a technology that allows computers to recognize and transcribe spoken language. In the NeMo Framework, ASR is used for tasks such as speech-to-text and voice recognition.",
+                },
+                {
+                    "from": "user",
+                    "value": "Describe what is NeMo based on the tutorial video: <video-audio> and the information in the two images: <image> <image>. Combine that information with sound <sound>. Answer: ",
+                },
+                {
+                    "from": "assistant",
+                    "value": "The NeMo Framework provides a range of tools and features for training and deploying ASR models, including model parallelism, data parallelism, and distributed checkpointing. This allows for faster training and inference times, as well as improved model accuracy and reliability.",
+                },
+            ],
+        }
+    ]
+    with tempfile.NamedTemporaryFile(mode="w", suffix=".jsonl", delete=False) as f:
+        for item in sample:
+            f.write(json.dumps(item) + "\n")
+        return f.name
+
+
+def test_general_conversation_jsonl_multimodal_interleaved_multiturn():
+    """Test that an interleaved multi-turn jsonl sample is converted to OpenAI-compatible message form by the preprocessor."""
+    data_path = (
+        create_sample_general_conversation_jsonl_multimodal_interleaved_multiturn()
+    )
+    try:
+        data_config = {
+            "dataset_name": "general-conversation-jsonl",
+            "data_path": data_path,
+        }
+        dataset = load_response_dataset(data_config)
+
+        assert len(dataset.dataset) == 1
+
+        # Raw first example from the jsonl
+        first_raw = dataset.dataset[0]
+        # Run the preprocessor (same as used in the pipeline)
+        formatted = dataset.preprocessor(first_raw)
+
+        # Expected OpenAI-compatible structure
+        assert "messages" in formatted
+        assert "task_name" in formatted
+        assert formatted["task_name"] == "general-conversation-jsonl"
+
+        assert len(formatted["messages"]) == 4
+
+        # First user message: "<sound>" alone becomes a single audio block
+        user_msg0 = formatted["messages"][0]
+        assert user_msg0["role"] == "user"
+        user_content0 = user_msg0["content"]
+        assert isinstance(user_content0, list)
+        assert len(user_content0) == 1
+        # the "sound" tag is converted to the "audio" tag and takes the first "sound" file
+        assert user_content0[0] == {
+            "type": "audio",
+            "audio": "sample_000001.2345ew.flac",
+        }
+
+        # Assistant message: content is list of text block(s).
+        # Multimodal tokens are also supported in a similar fashion as for the user message.
+        assistant_msg0 = formatted["messages"][1]
+        assert assistant_msg0["role"] == "assistant"
+        assistant_content0 = assistant_msg0["content"]
+        assert isinstance(assistant_content0, list)
+        assert len(assistant_content0) == 1
+        assert assistant_content0[0] == {
+            "type": "text",
+            "text": "Automatic speech recognition is a technology that allows computers to recognize and transcribe spoken language. In the NeMo Framework, ASR is used for tasks such as speech-to-text and voice recognition.",
+        }
+
+        user_msg1 = formatted["messages"][2]
+        assert user_msg1["role"] == "user"
+        user_content1 = user_msg1["content"]
+        assert isinstance(user_content1, list)
+        assert len(user_content1) == 9
+        assert user_content1[0] == {
+            "type": "text",
+            "text": "Describe what is NeMo based on the tutorial video: ",
+        }
+        # the video-audio tag is split into one video block followed by one audio block
+        # TODO: more advanced video-audio interleaving technique? Should be handled on the model level.
+        assert user_content1[1] == {
+            "type": "video",
+            "video": "sample_000001.35tags.mp4",
+        }
+        assert user_content1[2] == {
+            "type": "audio",
+            "audio": "sample_000001.35tags.mp4",
+        }
+        assert user_content1[3] == {
+            "type": "text",
+            "text": " and the information in the two images: ",
+        }
+        assert user_content1[4] == {
+            "type": "image",
+            "image": "sample_000001.as23ds.jpg",
+        }
+        assert user_content1[5] == {
+            "type": "image",
+            "image": "sample_000001.gds233.jpg",
+        }
+        assert user_content1[6] == {
+            "type": "text",
+            "text": ". Combine that information with sound ",
+        }
+        assert user_content1[7] == {
+            "type": "audio",
+            "audio": "sample_000001.gd1dtg.wav",
+        }
+        assert user_content1[8] == {"type": "text", "text": ". Answer: "}
+
+        assistant_msg1 = formatted["messages"][3]
+        assert assistant_msg1["role"] == "assistant"
+        assistant_content1 = assistant_msg1["content"]
+        assert isinstance(assistant_content1, list)
+        assert len(assistant_content1) == 1
+        assert assistant_content1[0] == {
+            "type": "text",
+            "text": "The NeMo Framework provides a range of tools and features for training and deploying ASR models, including model parallelism, data parallelism, and distributed checkpointing. This allows for faster training and inference times, as well as improved model accuracy and reliability.",
+        }
+
+    finally:
+        import os
+
+        try:
+            os.unlink(data_path)
+        except OSError:
+            pass
+
+
+def create_sample_general_conversation_jsonl_multimodal_singleturn():
+    """Create a temporary jsonl file with three single-turn samples (image, audio, video-audio) and return its path."""
+    sample = [
+        {
+            "image": "sample_000001.as23ds.jpg",
+            "conversations": [
+                {"from": "user", "value": "<image>\nPlease describe this image."},
+                {
+                    "from": "assistant",
+                    "value": "Two kids are playing ping pong in this image.",
+                },
+            ],
+        },
+        {
+            "audio": ["sample_000001.2345ew.flac"],
+            "conversations": [
+                {"from": "user", "value": "<audio>"},
+                {
+                    "from": "assistant",
+                    "value": "Automatic speech recognition is a technology that allows computers to recognize and transcribe spoken language. In the NeMo Framework, ASR is used for tasks such as speech-to-text and voice recognition.",
+                },
+            ],
+        },
+        {
+            "video-audio": "sample_000001.35tags.mp4",
+            "conversations": [
+                {
+                    "from": "user",
+                    "value": "<video-audio>\nDescribe what is NeMo based on the tutorial video. Answer: ",
+                },
+                {
+                    "from": "assistant",
+                    "value": "The NeMo Framework provides a range of tools and features for training and deploying ASR models, including model parallelism, data parallelism, and distributed checkpointing. This allows for faster training and inference times, as well as improved model accuracy and reliability.",
+                },
+            ],
+        },
+    ]
+    with tempfile.NamedTemporaryFile(mode="w", suffix=".jsonl", delete=False) as f:
+        for item in sample:
+            f.write(json.dumps(item) + "\n")
+        return f.name
+
+
+def test_general_conversation_jsonl_multimodal_singleturn():
+    """Test that single-turn image, audio and video-audio jsonl samples are converted to OpenAI-compatible message form by the preprocessor."""
+    data_path = create_sample_general_conversation_jsonl_multimodal_singleturn()
+    try:
+        data_config = {
+            "dataset_name": "general-conversation-jsonl",
+            "data_path": data_path,
+        }
+        dataset = load_response_dataset(data_config)
+
+        assert len(dataset.dataset) == 3
+
+        # Raw first example from the jsonl
+        first_raw = dataset.dataset[0]
+        # Run the preprocessor (same as used in the pipeline)
+        formatted = dataset.preprocessor(first_raw)
+
+        # Expected OpenAI-compatible structure
+        assert "messages" in formatted
+        assert "task_name" in formatted
+        assert formatted["task_name"] == "general-conversation-jsonl"
+
+        assert len(formatted["messages"]) == 2
+
+        # User message: content is list of audio/video/image block + text block
+        user_msg0 = formatted["messages"][0]
+        assert user_msg0["role"] == "user"
+        user_content0 = user_msg0["content"]
+        assert isinstance(user_content0, list)
+        assert len(user_content0) == 2
+        # the "<image>" tag becomes an image block, followed by the remaining text
+        assert user_content0[0] == {
+            "type": "image",
+            "image": "sample_000001.as23ds.jpg",
+        }
+        assert user_content0[1] == {
+            "type": "text",
+            "text": "\nPlease describe this image.",
+        }
+
+        # Assistant message: content is list of text block(s).
+        # Multimodal tokens are also supported in a similar fashion as for the user message.
+        assistant_msg0 = formatted["messages"][1]
+        assert assistant_msg0["role"] == "assistant"
+        assistant_content0 = assistant_msg0["content"]
+        assert isinstance(assistant_content0, list)
+        assert len(assistant_content0) == 1
+        assert assistant_content0[0] == {
+            "type": "text",
+            "text": "Two kids are playing ping pong in this image.",
+        }
+
+        # Raw Second example from the jsonl
+        second_raw = dataset.dataset[1]
+        # Run the preprocessor (same as used in the pipeline)
+        formatted = dataset.preprocessor(second_raw)
+
+        # Expected OpenAI-compatible structure
+        assert "messages" in formatted
+        assert "task_name" in formatted
+        assert formatted["task_name"] == "general-conversation-jsonl"
+
+        assert len(formatted["messages"]) == 2
+
+        # User message: "<audio>" alone becomes a single audio block
+        user_msg0 = formatted["messages"][0]
+        assert user_msg0["role"] == "user"
+        user_content0 = user_msg0["content"]
+        assert isinstance(user_content0, list)
+        assert len(user_content0) == 1
+        # the single-element "audio" list provides the audio file
+        assert user_content0[0] == {
+            "type": "audio",
+            "audio": "sample_000001.2345ew.flac",
+        }
+
+        # Assistant message: content is list of text block(s).
+        # Multimodal tokens are also supported in a similar fashion as for the user message.
+        assistant_msg0 = formatted["messages"][1]
+        assert assistant_msg0["role"] == "assistant"
+        assistant_content0 = assistant_msg0["content"]
+        assert isinstance(assistant_content0, list)
+        assert len(assistant_content0) == 1
+        assert assistant_content0[0] == {
+            "type": "text",
+            "text": "Automatic speech recognition is a technology that allows computers to recognize and transcribe spoken language. In the NeMo Framework, ASR is used for tasks such as speech-to-text and voice recognition.",
+        }
+
+        # Raw Third example from the jsonl
+        third_raw = dataset.dataset[2]
+        # Run the preprocessor (same as used in the pipeline)
+        formatted = dataset.preprocessor(third_raw)
+
+        # Expected OpenAI-compatible structure
+        assert "messages" in formatted
+        assert "task_name" in formatted
+        assert formatted["task_name"] == "general-conversation-jsonl"
+
+        assert len(formatted["messages"]) == 2
+
+        # User message: content is list of audio/video/image block + text block
+        user_msg0 = formatted["messages"][0]
+        assert user_msg0["role"] == "user"
+        user_content0 = user_msg0["content"]
+        assert isinstance(user_content0, list)
+        assert len(user_content0) == 3
+        # the "video-audio" tag is split into a video block followed by an audio block
+        assert user_content0[0] == {
+            "type": "video",
+            "video": "sample_000001.35tags.mp4",
+        }
+        assert user_content0[1] == {
+            "type": "audio",
+            "audio": "sample_000001.35tags.mp4",
+        }
+        assert user_content0[2] == {
+            "type": "text",
+            "text": "\nDescribe what is NeMo based on the tutorial video. Answer: ",
+        }
+
+        # Assistant message: content is list of text block(s).
+        # Multimodal tokens are also supported in a similar fashion as for the user message.
+        assistant_msg0 = formatted["messages"][1]
+        assert assistant_msg0["role"] == "assistant"
+        assistant_content0 = assistant_msg0["content"]
+        assert isinstance(assistant_content0, list)
+        assert len(assistant_content0) == 1
+        assert assistant_content0[0] == {
+            "type": "text",
+            "text": "The NeMo Framework provides a range of tools and features for training and deploying ASR models, including model parallelism, data parallelism, and distributed checkpointing. This allows for faster training and inference times, as well as improved model accuracy and reliability.",
+        }
+
+    finally:
+        import os
+
+        try:
+            os.unlink(data_path)
+        except OSError:
+            pass
diff --git a/tests/unit/data/datasets/test_response_dataset.py b/tests/unit/data/datasets/test_response_dataset.py
index fa27c74a01..d88524e27e 100644
--- a/tests/unit/data/datasets/test_response_dataset.py
+++ b/tests/unit/data/datasets/test_response_dataset.py
@@ -334,3 +334,26 @@ def test_vlm_dataset(dataset_name, format_func):
         assert first_example["messages"][1]["content"] == "3"
     elif dataset_name == "refcoco":
         assert first_example["messages"][1]["content"] == "[243, 469, 558, 746]"
+
+
+def test_dailyomni_dataset():
+    """The first daily-omni example preprocesses into a user/assistant message pair."""
+    daily_omni = load_response_dataset({"dataset_name": "daily-omni"})
+    raw_example = daily_omni.dataset[0]
+    assert hasattr(daily_omni, "preprocessor") and daily_omni.preprocessor is not None
+    example = daily_omni.preprocessor(raw_example)
+
+    # the preprocessed example carries exactly two keys: messages and task_name
+    assert len(example.keys()) == 2
+    assert "messages" in example
+    assert "task_name" in example
+
+    # user turn: a video block followed by a text block
+    user_msg = example["messages"][0]
+    assert user_msg["role"] == "user"
+    assert user_msg["content"][0]["type"] == "video"
+    assert user_msg["content"][1]["type"] == "text"
+    # assistant turn: the content is the plain string "B"
+    assistant_msg = example["messages"][1]
+    assert assistant_msg["role"] == "assistant"
+    assert assistant_msg["content"] == "B"
diff --git a/uv.lock b/uv.lock
index e0c3cda97f..ac9efcb3e8 100644
--- a/uv.lock
+++ b/uv.lock
@@ -4657,6 +4657,7 @@ dependencies = [
     { name = "cuda-bindings" },
     { name = "datasets" },
     { name = "debugpy" },
+    { name = "decord" },
     { name = "hydra-core" },
     { name = "math-verify" },
     { name = "matplotlib" },
@@ -4796,6 +4797,7 @@ requires-dist = [
     { name = "cuda-python", marker = "extra == 'vllm'" },
     { name = "datasets", specifier = ">=4.0.0" },
     { name = "debugpy" },
+    { name = "decord" },
     { name = "deep-ep", marker = "extra == 'automodel'", git = "https://github.com/deepseek-ai/DeepEP.git?rev=bfded34800dfec415b71503f8205181de90b2480" },
     { name = "deep-ep", marker = "extra == 'mcore'", git = "https://github.com/deepseek-ai/DeepEP.git?rev=bfded34800dfec415b71503f8205181de90b2480" },
     { name = "deep-ep", marker = "extra == 'vllm'", git = "https://github.com/deepseek-ai/DeepEP.git?rev=bfded34800dfec415b71503f8205181de90b2480" },