diff --git a/agentscope-core/src/main/java/io/agentscope/core/tool/multimodal/DashScopeMultiModalTool.java b/agentscope-core/src/main/java/io/agentscope/core/tool/multimodal/DashScopeMultiModalTool.java
index 45baf84e9..7c3b7b06f 100644
--- a/agentscope-core/src/main/java/io/agentscope/core/tool/multimodal/DashScopeMultiModalTool.java
+++ b/agentscope-core/src/main/java/io/agentscope/core/tool/multimodal/DashScopeMultiModalTool.java
@@ -23,6 +23,10 @@
 import com.alibaba.dashscope.aigc.multimodalconversation.MultiModalConversationOutput;
 import com.alibaba.dashscope.aigc.multimodalconversation.MultiModalConversationParam;
 import com.alibaba.dashscope.aigc.multimodalconversation.MultiModalConversationResult;
+import com.alibaba.dashscope.aigc.videosynthesis.VideoSynthesis;
+import com.alibaba.dashscope.aigc.videosynthesis.VideoSynthesisOutput;
+import com.alibaba.dashscope.aigc.videosynthesis.VideoSynthesisParam;
+import com.alibaba.dashscope.aigc.videosynthesis.VideoSynthesisResult;
 import com.alibaba.dashscope.audio.asr.recognition.Recognition;
 import com.alibaba.dashscope.audio.asr.recognition.RecognitionParam;
 import com.alibaba.dashscope.audio.asr.recognition.RecognitionResult;
@@ -41,6 +45,7 @@
 import io.agentscope.core.message.TextBlock;
 import io.agentscope.core.message.ToolResultBlock;
 import io.agentscope.core.message.URLSource;
+import io.agentscope.core.message.VideoBlock;
 import io.agentscope.core.tool.Tool;
 import io.agentscope.core.tool.ToolParam;
 import java.io.IOException;
@@ -51,6 +56,7 @@
 import java.nio.file.Paths;
 import java.util.ArrayList;
 import java.util.Base64;
+import java.util.HashMap;
 import java.util.List;
 import java.util.Map;
 import java.util.Optional;
@@ -227,7 +233,12 @@ public Mono<ToolResultBlock> dashscopeTextToImage(
     public Mono<ToolResultBlock> dashscopeImageToText(
             @ToolParam(
                             name = "image_urls",
-                            description = "The URL(s) of image(s) to be converted into text.")
+                            description =
+                                    "The URL(s), local file path(s) or Base64 data URL(s)(the"
+                                        + " format pattern is"
+                                        + " data:[MIME_type];base64,{base64_image}, e.g.,"
+                                        + " 'data:image/png;base64,iVBORw0KGgoAAAANSUhEUgAABDg...')"
+                                        + " of image(s) to be converted into text.")
                     List<String> imageUrls,
             @ToolParam(name = "prompt", description = "The text prompt.", required = false)
                     String prompt,
@@ -716,14 +727,14 @@ public void onError(Exception e) {
 
     /**
      * Send audio input stream by chunk to DashScope.
-     * marked as public for unit test.
+     * Package-private so that unit tests can call it.
      *
      * @param audioUrl   The file path or URL of audio.
      * @param recognizer DashScope Recognition instance
      * @throws IOException          if read failed
      * @throws InterruptedException if interrupted
      */
-    public void sendAudioChunk(String audioUrl, Recognition recognizer)
+    void sendAudioChunk(String audioUrl, Recognition recognizer)
             throws IOException, InterruptedException {
         // chunk size set to 1 seconds for 16KHz sample rate
         byte[] buffer = new byte[3200];
@@ -752,4 +763,658 @@ public void sendAudioChunk(String audioUrl, Recognition recognizer)
             }
         }
     }
+
+    /**
+     * Generate video based on the given prompt.
+     *
+     * @param prompt          The text prompt to generate video.
+     * @param model           The model to use, e.g., 'wan2.6-t2v', 'wan2.5-t2v-preview', etc.
+     * @param negativePrompt  The negative prompt to avoid certain elements.
+     * @param audioUrl        The URL for background audio.
+     * @param size            Size of the video, e.g., '1920*1080', '1280*720', etc.
+     * @param duration        Duration of the video in seconds, e.g., 5, 10, etc.
+     * @param shotType        Shot type of the generated video; null or blank falls back to 'single'.
+     *                        single: default value, output single shot video; multi: output multi-lens video.
+     * @param promptExtend    Whether to extend the prompt automatically (default true)
+     * @param watermark       Whether to include watermark (default false)
+     * @param seed            The seed for reproducibility
+     * @return A ToolResultBlock containing the generated video url or error message.
+     */
+    @Tool(
+            name = "dashscope_text_to_video",
+            description = "Generate video based on the given text prompt")
+    public Mono<ToolResultBlock> dashscopeTextToVideo(
+            @ToolParam(name = "prompt", description = "The text prompt to generate video")
+                    String prompt,
+            @ToolParam(
+                            name = "model",
+                            description =
+                                    "The model to use, e.g., 'wan2.6-t2v', 'wan2.5-t2v-preview',"
+                                            + " etc",
+                            required = false)
+                    String model,
+            @ToolParam(
+                            name = "negative_prompt",
+                            description = "The negative prompt to avoid certain elements",
+                            required = false)
+                    String negativePrompt,
+            @ToolParam(
+                            name = "audio_url",
+                            description = "The URL for background audio",
+                            required = false)
+                    String audioUrl,
+            @ToolParam(
+                            name = "size",
+                            description = "Size of the video, e.g., '1920*1080', '1280*720', etc",
+                            required = false)
+                    String size,
+            @ToolParam(
+                            name = "duration",
+                            description = "Duration of the video in seconds, e.g., 5, 10, etc",
+                            required = false)
+                    Integer duration,
+            @ToolParam(
+                            name = "shot_type",
+                            description =
+                                    "Specify the shot type that generates the video. single:"
+                                        + " default value, output single shot video; multi: output"
+                                        + " multi-lens video",
+                            required = false)
+                    String shotType,
+            @ToolParam(
+                            name = "prompt_extend",
+                            description =
+                                    "Whether to automatically extend the prompt (default true)",
+                            required = false)
+                    Boolean promptExtend,
+            @ToolParam(
+                            name = "watermark",
+                            description = "Whether to include watermark (default false)",
+                            required = false)
+                    Boolean watermark,
+            @ToolParam(
+                            name = "seed",
+                            description = "The seed for reproducibility",
+                            required = false)
+                    Integer seed) {
+        String finalModel =
+                Optional.ofNullable(model).filter(s -> !s.trim().isEmpty()).orElse("wan2.6-t2v");
+        String finalSize =
+                Optional.ofNullable(size).filter(s -> !s.trim().isEmpty()).orElse("1920*1080");
+        String finalShotType =
+                Optional.ofNullable(shotType).filter(s -> !s.trim().isEmpty()).orElse("single");
+        boolean finalPromptExtend = Optional.ofNullable(promptExtend).orElse(true);
+        boolean finalWatermark = Optional.ofNullable(watermark).orElse(false);
+
+        log.debug(
+                "dashscope_text_to_video called: prompt='{}', model='{}', negativePrompt='{}',"
+                    + " audioUrl='{}', size='{}', duration={}, shotType='{}', promptExtend='{}',"
+                    + " watermark='{}', seed={}",
+                prompt,
+                finalModel,
+                negativePrompt,
+                audioUrl,
+                finalSize,
+                duration,
+                finalShotType,
+                finalPromptExtend,
+                finalWatermark,
+                seed);
+
+        return Mono.fromCallable(
+                        () -> {
+                            Map<String, Object> parameters = new HashMap<>();
+                            parameters.put("size", finalSize);
+                            parameters.put("shot_type", finalShotType);
+                            parameters.put("prompt_extend", finalPromptExtend);
+                            parameters.put("watermark", finalWatermark);
+                            if (duration != null) {
+                                parameters.put("duration", duration);
+                            }
+                            if (seed != null) {
+                                parameters.put("seed", seed);
+                            }
+
+                            VideoSynthesisParam param =
+                                    VideoSynthesisParam.builder()
+                                            .apiKey(this.apiKey)
+                                            .model(finalModel)
+                                            .prompt(prompt)
+                                            .negativePrompt(negativePrompt)
+                                            .audioUrl(audioUrl)
+                                            .parameters(parameters)
+                                            .header("user-agent", Version.getUserAgent())
+                                            .build();
+
+                            VideoSynthesis videoSynthesis = new VideoSynthesis();
+
+                            // The video call method blocks until the video is generated or fails
+                            log.info(
+                                    "Starting text to video generation task, please wait for a"
+                                            + " while...");
+                            VideoSynthesisResult response = videoSynthesis.call(param);
+
+                            // Extract video URL from response
+                            String videoUrl =
+                                    Optional.ofNullable(response)
+                                            .map(VideoSynthesisResult::getOutput)
+                                            .map(VideoSynthesisOutput::getVideoUrl)
+                                            .orElse(null);
+
+                            if (videoUrl == null || videoUrl.trim().isEmpty()) {
+                                log.error("No video url returned. Response: {}", response);
+                                return ToolResultBlock.error("Failed to generate video.");
+                            }
+
+                            log.info("Text to video generated successfully, videoUrl:{}", videoUrl);
+
+                            VideoBlock vb =
+                                    VideoBlock.builder()
+                                            .source(URLSource.builder().url(videoUrl).build())
+                                            .build();
+
+                            return ToolResultBlock.of(vb);
+                        })
+                .onErrorResume(
+                        e -> {
+                            log.error("Failed to generate video '{}'", e.getMessage(), e);
+                            return Mono.just(ToolResultBlock.error(e.getMessage()));
+                        });
+    }
+
+    /**
+     * Generate a video based on a single input image (first frame) and an optional text prompt.
+     *
+     * @param prompt          The text prompt describing the video content and motion.
+     * @param model           The model to use, e.g., 'wan2.6-i2v-flash', 'wan2.1-i2v-turbo'.
+     * @param imageUrl        The URL, local file path, or Base64 data of the input image (first frame).
+     * @param audioUrl        URL of the audio file that the model will use to generate video.
+     * @param negativePrompt  The negative prompt to avoid certain elements.
+     * @param template        Name of video effect template, e.g., 'squish', 'rotation', etc
+     * @param resolution      Resolution of the video, e.g., '720P', '1080P'.
+     * @param duration        Duration of the video in seconds (e.g., 5, 10, 15). Default depends on model.
+     * @param shotType        Shot type of the generated video; null or blank falls back to 'single'.
+     *                        single: default value, output single shot video; multi: output multi-lens video.
+     * @param audio           Whether to generate a video with audio (default true).
+     * @param promptExtend    Whether to automatically extend the prompt (default true).
+     * @param watermark       Whether to include watermark (default false).
+     * @param seed            Optional seed for reproducibility.
+     * @return A ToolResultBlock containing the generated video url or error message.
+     */
+    @Tool(
+            name = "dashscope_image_to_video",
+            description =
+                    "Generate a video from a single input image and an optional text prompt."
+                            + " Supports optional audio guidance and duration control.")
+    public Mono<ToolResultBlock> dashscopeImageToVideo(
+            @ToolParam(
+                            name = "prompt",
+                            description = "Text prompt describing the video content and motion",
+                            required = false)
+                    String prompt,
+            @ToolParam(
+                            name = "model",
+                            description =
+                                    "Model to use, e.g., 'wan2.6-i2v-flash', 'wan2.6-i2v', etc",
+                            required = false)
+                    String model,
+            @ToolParam(
+                            name = "image_url",
+                            description =
+                                    "URL, local file path or Base64 data URL(the format pattern is"
+                                        + " data:[MIME_type];base64,{base64_image}, e.g.,"
+                                        + " 'data:image/png;base64,iVBORw0KGgoAAAANSUhEUgAABDg...')"
+                                        + " of the first frame image")
+                    String imageUrl,
+            @ToolParam(
+                            name = "audio_url",
+                            description =
+                                    "URL of the audio file that the model will use to generate"
+                                            + " video",
+                            required = false)
+                    String audioUrl,
+            @ToolParam(
+                            name = "negative_prompt",
+                            description = "The negative prompt to avoid certain elements",
+                            required = false)
+                    String negativePrompt,
+            @ToolParam(
+                            name = "template",
+                            description =
+                                    "Name of video effect template, e.g., 'squish', 'rotation',"
+                                            + " etc",
+                            required = false)
+                    String template,
+            @ToolParam(
+                            name = "resolution",
+                            description = "Video resolution, e.g., '720P', '1080P'",
+                            required = false)
+                    String resolution,
+            @ToolParam(
+                            name = "duration",
+                            description = "Duration of the video in seconds, e.g., 5, 10, 15",
+                            required = false)
+                    Integer duration,
+            @ToolParam(
+                            name = "shot_type",
+                            description =
+                                    "Specify the shot type that generates the video. single:"
+                                        + " default value, output single shot video; multi: output"
+                                        + " multi-lens video",
+                            required = false)
+                    String shotType,
+            @ToolParam(
+                            name = "audio",
+                            description = "Whether to generate audio video (default true)",
+                            required = false)
+                    Boolean audio,
+            @ToolParam(
+                            name = "prompt_extend",
+                            description =
+                                    "Whether to automatically extend the prompt (default true)",
+                            required = false)
+                    Boolean promptExtend,
+            @ToolParam(
+                            name = "watermark",
+                            description = "Whether to include watermark (default false)",
+                            required = false)
+                    Boolean watermark,
+            @ToolParam(
+                            name = "seed",
+                            description = "The seed for reproducibility",
+                            required = false)
+                    Integer seed) {
+        String finalModel =
+                Optional.ofNullable(model)
+                        .filter(s -> !s.trim().isEmpty())
+                        .orElse("wan2.6-i2v-flash");
+        String finalResolution =
+                Optional.ofNullable(resolution).filter(s -> !s.trim().isEmpty()).orElse("720P");
+        String finalShotType =
+                Optional.ofNullable(shotType).filter(s -> !s.trim().isEmpty()).orElse("single");
+        boolean finalAudio = Optional.ofNullable(audio).orElse(true);
+        boolean finalPromptExtend = Optional.ofNullable(promptExtend).orElse(true);
+        boolean finalWatermark = Optional.ofNullable(watermark).orElse(false);
+
+        log.debug(
+                "dashscope_image_to_video called: prompt='{}', model='{}', imageUrl='{}',"
+                        + " audioUrl='{}', negativePrompt='{}', template='{}', resolution='{}',"
+                        + " duration={}, shotType='{}', audio={}, promptExtend={}, watermark={},"
+                        + " seed={}",
+                prompt,
+                finalModel,
+                imageUrl,
+                audioUrl,
+                negativePrompt,
+                template,
+                finalResolution,
+                duration,
+                finalShotType,
+                finalAudio,
+                finalPromptExtend,
+                finalWatermark,
+                seed);
+
+        return Mono.fromCallable(
+                        () -> {
+                            Map<String, Object> parameters = new HashMap<>();
+                            parameters.put("resolution", finalResolution);
+                            parameters.put("shot_type", finalShotType);
+                            parameters.put("audio", finalAudio);
+                            parameters.put("prompt_extend", finalPromptExtend);
+                            parameters.put("watermark", finalWatermark);
+                            if (duration != null) {
+                                parameters.put("duration", duration);
+                            }
+                            if (seed != null) {
+                                parameters.put("seed", seed);
+                            }
+
+                            VideoSynthesisParam param =
+                                    VideoSynthesisParam.builder()
+                                            .apiKey(this.apiKey)
+                                            .prompt(prompt)
+                                            .model(finalModel)
+                                            .imgUrl(MediaUtils.urlToProtocolUrl(imageUrl))
+                                            .audioUrl(audioUrl)
+                                            .negativePrompt(negativePrompt)
+                                            .template(template)
+                                            .parameters(parameters)
+                                            .header("user-agent", Version.getUserAgent())
+                                            .build();
+
+                            VideoSynthesis videoSynthesis = new VideoSynthesis();
+
+                            log.info(
+                                    "Starting image to video generation task, please wait for a"
+                                            + " while...");
+                            VideoSynthesisResult response = videoSynthesis.call(param);
+
+                            // Extract video URL from response
+                            String videoUrl =
+                                    Optional.ofNullable(response)
+                                            .map(VideoSynthesisResult::getOutput)
+                                            .map(VideoSynthesisOutput::getVideoUrl)
+                                            .orElse(null);
+
+                            if (videoUrl == null || videoUrl.trim().isEmpty()) {
+                                log.error("Failed to generate video. No video url returned.");
+                                return ToolResultBlock.error("Failed to generate video.");
+                            }
+
+                            log.info(
+                                    "Image to video generated successfully, videoUrl:{}", videoUrl);
+
+                            VideoBlock vb =
+                                    VideoBlock.builder()
+                                            .source(URLSource.builder().url(videoUrl).build())
+                                            .build();
+
+                            return ToolResultBlock.of(vb);
+                        })
+                .onErrorResume(
+                        e -> {
+                            log.error("Failed to generate video: '{}'", e.getMessage(), e);
+                            return Mono.just(ToolResultBlock.error(e.getMessage()));
+                        });
+    }
+
+    /**
+     * Generate video transitioning from a first frame to a last frame and an optional text prompt.
+     *
+     * @param prompt          The text prompt describing the video content and camera movement.
+     * @param model           The model to use, e.g., 'wan2.2-kf2v-flash', 'wanx2.1-kf2v-plus'.
+     * @param firstFrameUrl   The URL, local file path, or Base64 data URL of the first frame image.
+     * @param lastFrameUrl    The URL, local file path, or Base64 data URL of the last frame image.
+     * @param negativePrompt  The negative prompt to avoid certain elements.
+     * @param template        Name of video effect template, e.g., 'hanfu-1', 'solaron', etc.
+     * @param resolution      Resolution of the video, e.g., '480P', '720P', '1080P'.
+     * @param promptExtend    Whether to automatically extend the prompt (default true).
+     * @param watermark       Whether to include watermark (default false).
+     * @param seed            Optional seed for reproducibility.
+     * @return A ToolResultBlock containing the generated video url or error message.
+     */
+    @Tool(
+            name = "dashscope_first_and_last_frame_image_to_video",
+            description =
+                    "Generate video transitioning from a first frame to a last frame and an"
+                            + " optional text prompt")
+    public Mono<ToolResultBlock> dashscopeFirstAndLastFrameImageToVideo(
+            @ToolParam(
+                            name = "prompt",
+                            description = "Text prompt describing the video content and motion",
+                            required = false)
+                    String prompt,
+            @ToolParam(
+                            name = "model",
+                            description =
+                                    "Model to use, e.g., 'wan2.2-kf2v-flash', 'wanx2.1-kf2v-plus'",
+                            required = false)
+                    String model,
+            @ToolParam(
+                            name = "first_frame_url",
+                            description =
+                                    "URL, local file path or Base64 data URL(the format pattern is"
+                                        + " data:[MIME_type];base64,{base64_image}, e.g.,"
+                                        + " 'data:image/png;base64,iVBORw0KGgoAAAANSUhEUgAABDg...')"
+                                        + " of the first frame image")
+                    String firstFrameUrl,
+            @ToolParam(
+                            name = "last_frame_url",
+                            description =
+                                    "URL, local file path or Base64 data URL(the format pattern is"
+                                        + " data:[MIME_type];base64,{base64_image}, e.g.,"
+                                        + " 'data:image/png;base64,iVBORw0KGgoAAAANSUhEUgAABDg...')"
+                                        + " of the last frame image",
+                            required = false)
+                    String lastFrameUrl,
+            @ToolParam(
+                            name = "negative_prompt",
+                            description = "The negative prompt to avoid certain elements",
+                            required = false)
+                    String negativePrompt,
+            @ToolParam(
+                            name = "template",
+                            description =
+                                    "Name of video effect template, e.g., 'hanfu-1', 'solaron',"
+                                            + " etc",
+                            required = false)
+                    String template,
+            @ToolParam(
+                            name = "resolution",
+                            description = "Video resolution, e.g., '720P', '1080P'",
+                            required = false)
+                    String resolution,
+            @ToolParam(
+                            name = "prompt_extend",
+                            description =
+                                    "Whether to automatically extend the prompt (default true)",
+                            required = false)
+                    Boolean promptExtend,
+            @ToolParam(
+                            name = "watermark",
+                            description = "Whether to include watermark (default false)",
+                            required = false)
+                    Boolean watermark,
+            @ToolParam(
+                            name = "seed",
+                            description = "The seed for reproducibility",
+                            required = false)
+                    Integer seed) {
+
+        String finalModel =
+                Optional.ofNullable(model)
+                        .filter(s -> !s.trim().isEmpty())
+                        .orElse("wan2.2-kf2v-flash");
+        String finalResolution =
+                Optional.ofNullable(resolution).filter(s -> !s.trim().isEmpty()).orElse("720P");
+        boolean finalPromptExtend = Optional.ofNullable(promptExtend).orElse(true);
+        boolean finalWatermark = Optional.ofNullable(watermark).orElse(false);
+
+        log.debug(
+                "dashscope_first_and_last_frame_image_to_video called: prompt='{}', model='{}',"
+                        + " firstFrameUrl='{}', lastFrameUrl='{}', negativePrompt='{}',"
+                        + " template='{}', resolution='{}', promptExtend={}, watermark={}, seed={}",
+                prompt,
+                finalModel,
+                firstFrameUrl,
+                lastFrameUrl,
+                negativePrompt,
+                template,
+                finalResolution,
+                finalPromptExtend,
+                finalWatermark,
+                seed);
+
+        return Mono.fromCallable(
+                        () -> {
+                            Map<String, Object> parameters = new HashMap<>();
+                            parameters.put("resolution", finalResolution);
+                            parameters.put("prompt_extend", finalPromptExtend);
+                            parameters.put("watermark", finalWatermark);
+                            if (seed != null) {
+                                parameters.put("seed", seed);
+                            }
+
+                            VideoSynthesisParam param =
+                                    VideoSynthesisParam.builder()
+                                            .apiKey(this.apiKey)
+                                            .model(finalModel)
+                                            .prompt(prompt)
+                                            .firstFrameUrl(
+                                                    MediaUtils.urlToProtocolUrl(firstFrameUrl))
+                                            .lastFrameUrl(MediaUtils.urlToProtocolUrl(lastFrameUrl))
+                                            .negativePrompt(negativePrompt)
+                                            .template(template)
+                                            .parameters(parameters)
+                                            .header("user-agent", Version.getUserAgent())
+                                            .build();
+
+                            VideoSynthesis videoSynthesis = new VideoSynthesis();
+
+                            log.info(
+                                    "Starting first and last frame image to video generation task,"
+                                            + " please wait for a while...");
+                            // The video call method blocks until the video is generated or fails
+                            VideoSynthesisResult response = videoSynthesis.call(param);
+
+                            // Extract video URL
+                            String videoUrl =
+                                    Optional.ofNullable(response)
+                                            .map(VideoSynthesisResult::getOutput)
+                                            .map(VideoSynthesisOutput::getVideoUrl)
+                                            .orElse(null);
+
+                            if (videoUrl == null || videoUrl.trim().isEmpty()) {
+                                log.error("Failed to generate video. No URL returned.");
+                                return ToolResultBlock.error("Failed to generate video.");
+                            }
+
+                            log.info(
+                                    "First and last frame image to video generated"
+                                            + " successfully, videoUrl:{}",
+                                    videoUrl);
+
+                            VideoBlock vb =
+                                    VideoBlock.builder()
+                                            .source(URLSource.builder().url(videoUrl).build())
+                                            .build();
+
+                            return ToolResultBlock.of(vb);
+                        })
+                .onErrorResume(
+                        e -> {
+                            log.error("Failed to generate key-frame video '{}'", e.getMessage(), e);
+                            return Mono.just(ToolResultBlock.error(e.getMessage()));
+                        });
+    }
+
+    /**
+     * Analyze video and generate a text description or answer questions based on the video content.
+     *
+     * @param videoUrl      The URL, local file path or base64 data URL of the video to analyze.
+     * @param prompt        The text prompt or question regarding the video content. Defaults to 'Describe the video' when null or blank.
+     * @param model         The vision model to use, e.g., 'qwen3.5-plus', 'qwen3.5-flash', 'qwen3-vl-plus', 'qwen3-vl-flash', etc. Defaults to 'qwen3.5-plus' when null or blank.
+     * @param fps           Frames per second to sample from the video (e.g., 1, 2, 4). Defaults to 2.0 when null or not positive.
+     * @return A ToolResultBlock containing the generated text analysis or error message.
+     */
+    @Tool(
+            name = "dashscope_video_to_text",
+            description =
+                    "Analyze video and generate a text description or answer questions based on the"
+                            + " video content. Supports controlling the frame sampling rate (fps).")
+    public Mono<ToolResultBlock> dashscopeVideoToText(
+            @ToolParam(
+                            name = "video_url",
+                            description =
+                                    "The URL, local file path or base64 data URL (the format"
+                                        + " pattern is data:[MIME_type];base64,{base64_video}, e.g.,"
+                                        + " 'data:video/mp4;base64,AAAAIGZ0eXBpc29t...')"
+                                        + " of the video to analyze.")
+                    String videoUrl,
+            @ToolParam(
+                            name = "prompt",
+                            description = "The question or instruction regarding the video content",
+                            required = false)
+                    String prompt,
+            @ToolParam(
+                            name = "model",
+                            description =
+                                    "The vision model to use, e.g., 'qwen3.5-plus',"
+                                            + " 'qwen3.5-flash', 'qwen3-vl-plus', 'qwen3-vl-flash',"
+                                            + " etc",
+                            required = false)
+                    String model,
+            @ToolParam(
+                            name = "fps",
+                            description =
+                                    "Frames per second to sample from the video for analysis"
+                                            + " (default 2.0)",
+                            required = false)
+                    Double fps) {
+        String finalModel =
+                Optional.ofNullable(model).filter(s -> !s.trim().isEmpty()).orElse("qwen3.5-plus");
+        String finalPrompt =
+                Optional.ofNullable(prompt)
+                        .filter(s -> !s.trim().isEmpty())
+                        .orElse("Describe the video");
+        double finalFps = Optional.ofNullable(fps).filter(v -> v > 0).orElse(2.0);
+
+        // Base64 data URLs can be very large, so log a placeholder instead of the payload.
+        log.debug(
+                "dashscope_video_to_text called: videoUrl:'{}', prompt='{}', model='{}', fps={}",
+                videoUrl != null && videoUrl.startsWith("data:") ? "<base64 data url>" : videoUrl,
+                finalPrompt,
+                finalModel,
+                finalFps);
+
+        return Mono.fromCallable(
+                        () -> {
+                            Map<String, Object> videoParams = new HashMap<>();
+                            videoParams.put("video", MediaUtils.urlToProtocolUrl(videoUrl));
+                            videoParams.put("fps", finalFps);
+                            List<Map<String, Object>> content = new ArrayList<>();
+                            content.add(Map.of("text", finalPrompt));
+                            content.add(videoParams);
+                            MultiModalMessage userMessage =
+                                    MultiModalMessage.builder()
+                                            .role(Role.USER.getValue())
+                                            .content(content)
+                                            .build();
+
+                            MultiModalMessage systemMessage =
+                                    MultiModalMessage.builder()
+                                            .role(Role.SYSTEM.getValue())
+                                            .content(
+                                                    List.of(
+                                                            Map.of(
+                                                                    "text",
+                                                                    "You are a helpful"
+                                                                            + " assistant.")))
+                                            .build();
+
+                            List<MultiModalMessage> multiModalMessages = new ArrayList<>();
+                            multiModalMessages.add(systemMessage);
+                            multiModalMessages.add(userMessage);
+
+                            MultiModalConversationParam param =
+                                    MultiModalConversationParam.builder()
+                                            .apiKey(this.apiKey)
+                                            .model(finalModel)
+                                            .messages(multiModalMessages)
+                                            .header("user-agent", Version.getUserAgent())
+                                            .build();
+
+                            MultiModalConversation conv = new MultiModalConversation();
+                            MultiModalConversationResult result = conv.call(param);
+
+                            // Extract text from the result
+                            String text =
+                                    Optional.ofNullable(result)
+                                            .map(MultiModalConversationResult::getOutput)
+                                            .map(MultiModalConversationOutput::getChoices)
+                                            .flatMap(choices -> choices.stream().findFirst())
+                                            .map(MultiModalConversationOutput.Choice::getMessage)
+                                            .map(MultiModalMessage::getContent)
+                                            .flatMap(contents -> contents.stream().findFirst())
+                                            .map(contentMap -> contentMap.get("text"))
+                                            .map(Object::toString)
+                                            .orElse(null);
+
+                            if (text == null || text.trim().isEmpty()) {
+                                log.error("Failed to analyze video. No text response returned.");
+                                return ToolResultBlock.error("Failed to analyze video.");
+                            }
+
+                            log.info("Video analysis completed successfully.");
+
+                            TextBlock tb = TextBlock.builder().text(text).build();
+                            return ToolResultBlock.of(tb);
+                        })
+                .onErrorResume(
+                        e -> {
+                            log.error("Failed to analyze video '{}'", e.getMessage(), e);
+                            return Mono.just(ToolResultBlock.error(e.getMessage()));
+                        });
+    }
 }
diff --git a/agentscope-core/src/test/java/io/agentscope/core/model/ChatModelNonStreamingBlockingBehaviorTest.java b/agentscope-core/src/test/java/io/agentscope/core/model/ChatModelNonStreamingBlockingBehaviorTest.java
index 51f76ce03..ed862375b 100644
--- a/agentscope-core/src/test/java/io/agentscope/core/model/ChatModelNonStreamingBlockingBehaviorTest.java
+++ b/agentscope-core/src/test/java/io/agentscope/core/model/ChatModelNonStreamingBlockingBehaviorTest.java
@@ -152,7 +152,7 @@ void testOpenAIChatModelNonBlocking() throws Exception {
                         },
                         error -> latch.countDown());
 
-        latch.await(3, TimeUnit.SECONDS);
+        latch.await(5, TimeUnit.SECONDS);
         assertNotNull(streamThreadName.get());
         assertNotEquals(
                 currentThreadName,
diff --git a/agentscope-core/src/test/java/io/agentscope/core/tool/multimodal/DashScopeMultiModalToolE2ETest.java b/agentscope-core/src/test/java/io/agentscope/core/tool/multimodal/DashScopeMultiModalToolE2ETest.java
index 164f19be8..9b6bcb26d 100644
--- a/agentscope-core/src/test/java/io/agentscope/core/tool/multimodal/DashScopeMultiModalToolE2ETest.java
+++ b/agentscope-core/src/test/java/io/agentscope/core/tool/multimodal/DashScopeMultiModalToolE2ETest.java
@@ -16,16 +16,20 @@
 package io.agentscope.core.tool.multimodal;
 
 import static org.junit.jupiter.api.Assertions.assertEquals;
+import static org.junit.jupiter.api.Assertions.assertInstanceOf;
 import static org.junit.jupiter.api.Assertions.assertNotNull;
-import static org.junit.jupiter.api.Assertions.assertTrue;
 
 import io.agentscope.core.e2e.E2ETestCondition;
+import io.agentscope.core.formatter.MediaUtils;
 import io.agentscope.core.message.AudioBlock;
 import io.agentscope.core.message.Base64Source;
 import io.agentscope.core.message.ImageBlock;
 import io.agentscope.core.message.TextBlock;
 import io.agentscope.core.message.ToolResultBlock;
 import io.agentscope.core.message.URLSource;
+import io.agentscope.core.message.VideoBlock;
+import java.io.IOException;
+import java.nio.file.Paths;
 import java.util.List;
 import org.junit.jupiter.api.BeforeEach;
 import org.junit.jupiter.api.DisplayName;
@@ -50,13 +54,28 @@ class DashScopeMultiModalToolE2ETest {
 
     private static final String TEXT_TO_IMAGE_PROMPT = "A small dog.";
     private static final String IMAGE_TO_TEXT_PROMPT = "Describe the image.";
+    private static final String TEXT_TO_VIDEO_PROMPT = "A small dog is running in moonlight.";
+    private static final String IMAGE_TO_VIDEO_PROMPT = "A tiger is running in moonlight.";
+    private static final String FIRST_AND_LAST_FRAME_IMAGE_TO_VIDEO_PROMPT =
+            "A black kitten looks curiously into the sky.";
+    private static final String VIDEO_TO_TEXT_PROMPT = "Describe the video.";
     private static final String TEST_IMAGE_URL =
             "https://dashscope.oss-cn-beijing.aliyuncs.com/images/tiger.png";
-    private static final String TEST_IMAGE_PATH = "src/test/resources/dog.png";
+    private static final String TEST_IMAGE_PATH =
+            Paths.get("src", "test", "resources", "dog.png").toString();
     private static final String TEST_AUDIO_URL =
             "https://dashscope.oss-cn-beijing.aliyuncs.com/samples/audio/paraformer/hello_world_male2.wav";
     private static final String TEST_AUDIO_PATH =
-            "src/test/resources/hello_world_male_16k_16bit_mono.wav";
+            Paths.get("src", "test", "resources", "hello_world_male_16k_16bit_mono.wav").toString();
+    private static final String TEST_VIDEO_URL =
+            "https://help-static-aliyun-doc.aliyuncs.com/file-manage-files/zh-CN/20241115/cqqkru/1.mp4";
+    private static final String TEST_VIDEO_PATH =
+            Paths.get("src", "test", "resources", "test_video.mp4").toString();
+    private static final String TEST_FIRST_FRAME_URL =
+            "https://wanx.alicdn.com/material/20250318/first_frame.png";
+    private static final String TEST_LAST_FRAME_URL =
+            "https://wanx.alicdn.com/material/20250318/last_frame.png";
+
     private DashScopeMultiModalTool multiModalTool;
 
     @BeforeEach
@@ -76,9 +95,9 @@ void testTextToImageUrlMode() {
                         toolResultBlock -> {
                             assertNotNull(toolResultBlock);
                             assertEquals(1, toolResultBlock.getOutput().size());
-                            assertTrue(toolResultBlock.getOutput().get(0) instanceof ImageBlock);
+                            assertInstanceOf(ImageBlock.class, toolResultBlock.getOutput().get(0));
                             ImageBlock imageBlock = (ImageBlock) toolResultBlock.getOutput().get(0);
-                            assertTrue(imageBlock.getSource() instanceof URLSource);
+                            assertInstanceOf(URLSource.class, imageBlock.getSource());
                             assertNotNull(((URLSource) imageBlock.getSource()).getUrl());
                         })
                 .verifyComplete();
@@ -96,9 +115,9 @@ void testTextToImageBase64Mode() {
                         toolResultBlock -> {
                             assertNotNull(toolResultBlock);
                             assertEquals(1, toolResultBlock.getOutput().size());
-                            assertTrue(toolResultBlock.getOutput().get(0) instanceof ImageBlock);
+                            assertInstanceOf(ImageBlock.class, toolResultBlock.getOutput().get(0));
                             ImageBlock imageBlock = (ImageBlock) toolResultBlock.getOutput().get(0);
-                            assertTrue(imageBlock.getSource() instanceof Base64Source);
+                            assertInstanceOf(Base64Source.class, imageBlock.getSource());
                             assertNotNull(((Base64Source) imageBlock.getSource()).getMediaType());
                             assertNotNull(((Base64Source) imageBlock.getSource()).getData());
                         })
@@ -117,15 +136,15 @@ void testTextToImageResponseMultiUrls() {
                         toolResultBlock -> {
                             assertNotNull(toolResultBlock);
                             assertEquals(2, toolResultBlock.getOutput().size());
-                            assertTrue(toolResultBlock.getOutput().get(0) instanceof ImageBlock);
-                            assertTrue(toolResultBlock.getOutput().get(1) instanceof ImageBlock);
+                            assertInstanceOf(ImageBlock.class, toolResultBlock.getOutput().get(0));
+                            assertInstanceOf(ImageBlock.class, toolResultBlock.getOutput().get(1));
                             ImageBlock image0Block =
                                     (ImageBlock) toolResultBlock.getOutput().get(0);
-                            assertTrue(image0Block.getSource() instanceof URLSource);
+                            assertInstanceOf(URLSource.class, image0Block.getSource());
                             assertNotNull(((URLSource) image0Block.getSource()).getUrl());
                             ImageBlock image1Block =
                                     (ImageBlock) toolResultBlock.getOutput().get(1);
-                            assertTrue(image1Block.getSource() instanceof URLSource);
+                            assertInstanceOf(URLSource.class, image1Block.getSource());
                             assertNotNull(((URLSource) image1Block.getSource()).getUrl());
                         })
                 .verifyComplete();
@@ -143,7 +162,7 @@ void testImageToTextWithUrl() {
                         toolResultBlock -> {
                             assertNotNull(toolResultBlock);
                             assertEquals(1, toolResultBlock.getOutput().size());
-                            assertTrue(toolResultBlock.getOutput().get(0) instanceof TextBlock);
+                            assertInstanceOf(TextBlock.class, toolResultBlock.getOutput().get(0));
                             assertNotNull(
                                     ((TextBlock) toolResultBlock.getOutput().get(0)).getText());
                         })
@@ -162,7 +181,28 @@ void testImageToTextWithFile() {
                         toolResultBlock -> {
                             assertNotNull(toolResultBlock);
                             assertEquals(1, toolResultBlock.getOutput().size());
-                            assertTrue(toolResultBlock.getOutput().get(0) instanceof TextBlock);
+                            assertInstanceOf(TextBlock.class, toolResultBlock.getOutput().get(0));
+                            assertNotNull(
+                                    ((TextBlock) toolResultBlock.getOutput().get(0)).getText());
+                        })
+                .verifyComplete();
+    }
+
+    @Test
+    @DisplayName("Image to text with base64 data url")
+    void testImageToTextWithBase64DataUrl() throws IOException {
+        Mono<ToolResultBlock> result =
+                multiModalTool.dashscopeImageToText(
+                        List.of(MediaUtils.urlToBase64DataUrl(TEST_IMAGE_URL)),
+                        IMAGE_TO_TEXT_PROMPT,
+                        "qwen3-vl-plus");
+
+        StepVerifier.create(result)
+                .assertNext(
+                        toolResultBlock -> {
+                            assertNotNull(toolResultBlock);
+                            assertEquals(1, toolResultBlock.getOutput().size());
+                            assertInstanceOf(TextBlock.class, toolResultBlock.getOutput().get(0));
                             assertNotNull(
                                     ((TextBlock) toolResultBlock.getOutput().get(0)).getText());
                         })
@@ -183,7 +223,7 @@ void testImageToTextWithUrlAndFile() {
                         toolResultBlock -> {
                             assertNotNull(toolResultBlock);
                             assertEquals(1, toolResultBlock.getOutput().size());
-                            assertTrue(toolResultBlock.getOutput().get(0) instanceof TextBlock);
+                            assertInstanceOf(TextBlock.class, toolResultBlock.getOutput().get(0));
                             assertNotNull(
                                     ((TextBlock) toolResultBlock.getOutput().get(0)).getText());
                         })
@@ -202,9 +242,9 @@ void testTextToAudio() {
                         toolResultBlock -> {
                             assertNotNull(toolResultBlock);
                             assertEquals(1, toolResultBlock.getOutput().size());
-                            assertTrue(toolResultBlock.getOutput().get(0) instanceof AudioBlock);
+                            assertInstanceOf(AudioBlock.class, toolResultBlock.getOutput().get(0));
                             AudioBlock audioBlock = (AudioBlock) toolResultBlock.getOutput().get(0);
-                            assertTrue(audioBlock.getSource() instanceof Base64Source);
+                            assertInstanceOf(Base64Source.class, audioBlock.getSource());
                             assertNotNull(((Base64Source) audioBlock.getSource()).getData());
                         })
                 .verifyComplete();
@@ -222,7 +262,7 @@ void testAudioToTextWithUrl() {
                         toolResultBlock -> {
                             assertNotNull(toolResultBlock);
                             assertEquals(1, toolResultBlock.getOutput().size());
-                            assertTrue(toolResultBlock.getOutput().get(0) instanceof TextBlock);
+                            assertInstanceOf(TextBlock.class, toolResultBlock.getOutput().get(0));
                             assertNotNull(
                                     ((TextBlock) toolResultBlock.getOutput().get(0)).getText());
                         })
@@ -241,7 +281,279 @@ void testAudioToTextWithFile() {
                         toolResultBlock -> {
                             assertNotNull(toolResultBlock);
                             assertEquals(1, toolResultBlock.getOutput().size());
-                            assertTrue(toolResultBlock.getOutput().get(0) instanceof TextBlock);
+                            assertInstanceOf(TextBlock.class, toolResultBlock.getOutput().get(0));
+                            assertNotNull(
+                                    ((TextBlock) toolResultBlock.getOutput().get(0)).getText());
+                        })
+                .verifyComplete();
+    }
+
+    @Test
+    @DisplayName("Text to video response url")
+    void testTextToVideo() {
+        Mono<ToolResultBlock> videoMono =
+                multiModalTool.dashscopeTextToVideo(
+                        TEXT_TO_VIDEO_PROMPT,
+                        "wan2.6-t2v",
+                        "low quality",
+                        TEST_AUDIO_URL,
+                        "1920*1080",
+                        5,
+                        "single",
+                        true,
+                        false,
+                        0);
+        // The single output must be a VideoBlock backed by a URLSource with a non-null URL.
+        StepVerifier.create(videoMono)
+                .assertNext(
+                        block -> {
+                            assertNotNull(block);
+                            assertEquals(1, block.getOutput().size());
+                            Object first = block.getOutput().get(0);
+                            VideoBlock video = assertInstanceOf(VideoBlock.class, first);
+                            URLSource src = assertInstanceOf(URLSource.class, video.getSource());
+                            assertNotNull(src.getUrl());
+                        })
+                .verifyComplete();
+    }
+
+    @Test
+    @DisplayName("Image to video with image url")
+    void testImageToVideoWithUrl() {
+        Mono<ToolResultBlock> videoMono =
+                multiModalTool.dashscopeImageToVideo(
+                        IMAGE_TO_VIDEO_PROMPT,
+                        "wan2.6-i2v-flash",
+                        TEST_IMAGE_URL,
+                        TEST_AUDIO_URL,
+                        "low quality",
+                        "hanfu-1",
+                        "480P",
+                        10,
+                        "single",
+                        true,
+                        true,
+                        false,
+                        0);
+        // The single output must be a VideoBlock backed by a URLSource with a non-null URL.
+        StepVerifier.create(videoMono)
+                .assertNext(
+                        block -> {
+                            assertNotNull(block);
+                            assertEquals(1, block.getOutput().size());
+                            Object first = block.getOutput().get(0);
+                            VideoBlock video = assertInstanceOf(VideoBlock.class, first);
+                            URLSource src = assertInstanceOf(URLSource.class, video.getSource());
+                            assertNotNull(src.getUrl());
+                        })
+                .verifyComplete();
+    }
+
+    @Test
+    @DisplayName("Image to video with local image file")
+    void testImageToVideoWithFile() {
+        Mono<ToolResultBlock> videoMono =
+                multiModalTool.dashscopeImageToVideo(
+                        TEXT_TO_VIDEO_PROMPT,
+                        "wan2.6-i2v-flash",
+                        TEST_IMAGE_PATH,
+                        TEST_AUDIO_URL,
+                        "low quality",
+                        "hanfu-1",
+                        "480P",
+                        10,
+                        "single",
+                        true,
+                        true,
+                        false,
+                        0);
+        // The single output must be a VideoBlock backed by a URLSource with a non-null URL.
+        StepVerifier.create(videoMono)
+                .assertNext(
+                        block -> {
+                            assertNotNull(block);
+                            assertEquals(1, block.getOutput().size());
+                            Object first = block.getOutput().get(0);
+                            VideoBlock video = assertInstanceOf(VideoBlock.class, first);
+                            URLSource src = assertInstanceOf(URLSource.class, video.getSource());
+                            assertNotNull(src.getUrl());
+                        })
+                .verifyComplete();
+    }
+
+    @Test
+    @DisplayName("Image to video with base64 data url")
+    void testImageToVideoWithBase64DataUrl() throws IOException {
+        Mono<ToolResultBlock> videoMono =
+                multiModalTool.dashscopeImageToVideo(
+                        TEXT_TO_VIDEO_PROMPT,
+                        "wan2.6-i2v-flash",
+                        MediaUtils.urlToBase64DataUrl(TEST_IMAGE_PATH),
+                        TEST_AUDIO_URL,
+                        "low quality",
+                        "hanfu-1",
+                        "480P",
+                        10,
+                        "single",
+                        true,
+                        true,
+                        false,
+                        0);
+        // The image argument (third) carries the base64 data URL; the prompt stays plain text.
+        StepVerifier.create(videoMono)
+                .assertNext(
+                        block -> {
+                            assertNotNull(block);
+                            assertEquals(1, block.getOutput().size());
+                            Object first = block.getOutput().get(0);
+                            VideoBlock video = assertInstanceOf(VideoBlock.class, first);
+                            URLSource src = assertInstanceOf(URLSource.class, video.getSource());
+                            assertNotNull(src.getUrl());
+                        })
+                .verifyComplete();
+    }
+
+    @Test
+    @DisplayName("First and last frame image to video with image url")
+    void testFirstAndLastFrameImageToVideoWithUrl() {
+        Mono<ToolResultBlock> videoMono =
+                multiModalTool.dashscopeFirstAndLastFrameImageToVideo(
+                        FIRST_AND_LAST_FRAME_IMAGE_TO_VIDEO_PROMPT,
+                        "wan2.2-kf2v-flash",
+                        TEST_FIRST_FRAME_URL,
+                        TEST_LAST_FRAME_URL,
+                        "",
+                        "hanfu-1",
+                        "480P",
+                        true,
+                        false,
+                        0);
+        // The single output must be a VideoBlock backed by a URLSource with a non-null URL.
+        StepVerifier.create(videoMono)
+                .assertNext(
+                        block -> {
+                            assertNotNull(block);
+                            assertEquals(1, block.getOutput().size());
+                            Object first = block.getOutput().get(0);
+                            VideoBlock video = assertInstanceOf(VideoBlock.class, first);
+                            URLSource src = assertInstanceOf(URLSource.class, video.getSource());
+                            assertNotNull(src.getUrl());
+                        })
+                .verifyComplete();
+    }
+
+    @Test
+    @DisplayName("First and last frame image to video with local image file")
+    void testFirstAndLastFrameImageToVideoWithFile() {
+        Mono<ToolResultBlock> videoMono =
+                multiModalTool.dashscopeFirstAndLastFrameImageToVideo(
+                        TEXT_TO_VIDEO_PROMPT,
+                        "wan2.2-kf2v-flash",
+                        TEST_IMAGE_PATH,
+                        null,
+                        "",
+                        "hanfu-1",
+                        "480P",
+                        true,
+                        false,
+                        0);
+        // Only a local first-frame path is supplied here; the fourth argument is passed as null.
+        StepVerifier.create(videoMono)
+                .assertNext(
+                        block -> {
+                            assertNotNull(block);
+                            assertEquals(1, block.getOutput().size());
+                            Object first = block.getOutput().get(0);
+                            VideoBlock video = assertInstanceOf(VideoBlock.class, first);
+                            URLSource src = assertInstanceOf(URLSource.class, video.getSource());
+                            assertNotNull(src.getUrl());
+                        })
+                .verifyComplete();
+    }
+
+    @Test
+    @DisplayName("First and last frame image to video with base64 data url")
+    void testFirstAndLastFrameImageToVideoWithBase64DataUrl() throws IOException {
+        Mono<ToolResultBlock> videoMono =
+                multiModalTool.dashscopeFirstAndLastFrameImageToVideo(
+                        FIRST_AND_LAST_FRAME_IMAGE_TO_VIDEO_PROMPT,
+                        "wan2.2-kf2v-flash",
+                        MediaUtils.urlToBase64DataUrl(TEST_FIRST_FRAME_URL),
+                        MediaUtils.urlToBase64DataUrl(TEST_LAST_FRAME_URL),
+                        "",
+                        "hanfu-1",
+                        "480P",
+                        true,
+                        false,
+                        0);
+        // Same frames and prompt as the URL variant, but both frames go in as base64 data URLs.
+        StepVerifier.create(videoMono)
+                .assertNext(
+                        block -> {
+                            assertNotNull(block);
+                            assertEquals(1, block.getOutput().size());
+                            Object first = block.getOutput().get(0);
+                            VideoBlock video = assertInstanceOf(VideoBlock.class, first);
+                            URLSource src = assertInstanceOf(URLSource.class, video.getSource());
+                            assertNotNull(src.getUrl());
+                        })
+                .verifyComplete();
+    }
+
+    @Test
+    @DisplayName("Video to text with video url")
+    void testVideoToTextWithUrl() {
+        Mono<ToolResultBlock> analysis =
+                multiModalTool.dashscopeVideoToText(
+                        TEST_VIDEO_URL, VIDEO_TO_TEXT_PROMPT, "qwen3.5-plus", 2.0);
+        // The single output must be a TextBlock carrying non-null text.
+        StepVerifier.create(analysis)
+                .assertNext(
+                        block -> {
+                            assertNotNull(block);
+                            assertEquals(1, block.getOutput().size());
+                            Object first = block.getOutput().get(0);
+                            TextBlock textBlock = assertInstanceOf(TextBlock.class, first);
+                            assertNotNull(textBlock.getText());
+                        })
+                .verifyComplete();
+    }
+
+    @Test
+    @DisplayName("Video to text with local video file")
+    void testVideoToTextWithFile() {
+        Mono<ToolResultBlock> analysis =
+                multiModalTool.dashscopeVideoToText(
+                        TEST_VIDEO_PATH, VIDEO_TO_TEXT_PROMPT, "qwen3.5-plus", 2.0);
+        // The single output must be a TextBlock carrying non-null text.
+        StepVerifier.create(analysis)
+                .assertNext(
+                        block -> {
+                            assertNotNull(block);
+                            assertEquals(1, block.getOutput().size());
+                            Object first = block.getOutput().get(0);
+                            TextBlock textBlock = assertInstanceOf(TextBlock.class, first);
+                            assertNotNull(textBlock.getText());
+                        })
+                .verifyComplete();
+    }
+
+    @Test
+    @DisplayName("Video to text with base64 data url")
+    void testVideoToTextWithBase64DataUrl() throws IOException {
+        Mono<ToolResultBlock> result =
+                multiModalTool.dashscopeVideoToText(
+                        MediaUtils.urlToBase64DataUrl(TEST_VIDEO_URL),
+                        VIDEO_TO_TEXT_PROMPT,
+                        "qwen3.5-plus",
+                        2.0);
+
+        StepVerifier.create(result)
+                .assertNext(
+                        toolResultBlock -> {
+                            assertNotNull(toolResultBlock);
+                            assertEquals(1, toolResultBlock.getOutput().size());
+                            assertInstanceOf(TextBlock.class, toolResultBlock.getOutput().get(0));
                             assertNotNull(
                                     ((TextBlock) toolResultBlock.getOutput().get(0)).getText());
                         })
diff --git a/agentscope-core/src/test/java/io/agentscope/core/tool/multimodal/DashScopeMultiModalToolTest.java b/agentscope-core/src/test/java/io/agentscope/core/tool/multimodal/DashScopeMultiModalToolTest.java
index 2e99db059..3f62ad5ee 100644
--- a/agentscope-core/src/test/java/io/agentscope/core/tool/multimodal/DashScopeMultiModalToolTest.java
+++ b/agentscope-core/src/test/java/io/agentscope/core/tool/multimodal/DashScopeMultiModalToolTest.java
@@ -17,6 +17,7 @@
 
 import static org.junit.jupiter.api.Assertions.assertDoesNotThrow;
 import static org.junit.jupiter.api.Assertions.assertEquals;
+import static org.junit.jupiter.api.Assertions.assertInstanceOf;
 import static org.junit.jupiter.api.Assertions.assertNotNull;
 import static org.junit.jupiter.api.Assertions.assertTrue;
 import static org.mockito.ArgumentMatchers.any;
@@ -41,6 +42,10 @@
 import com.alibaba.dashscope.aigc.multimodalconversation.MultiModalConversationOutput;
 import com.alibaba.dashscope.aigc.multimodalconversation.MultiModalConversationParam;
 import com.alibaba.dashscope.aigc.multimodalconversation.MultiModalConversationResult;
+import com.alibaba.dashscope.aigc.videosynthesis.VideoSynthesis;
+import com.alibaba.dashscope.aigc.videosynthesis.VideoSynthesisOutput;
+import com.alibaba.dashscope.aigc.videosynthesis.VideoSynthesisParam;
+import com.alibaba.dashscope.aigc.videosynthesis.VideoSynthesisResult;
 import com.alibaba.dashscope.api.SynchronizeFullDuplexApi;
 import com.alibaba.dashscope.audio.asr.recognition.Recognition;
 import com.alibaba.dashscope.audio.asr.recognition.RecognitionParam;
@@ -57,6 +62,7 @@
 import io.agentscope.core.message.TextBlock;
 import io.agentscope.core.message.ToolResultBlock;
 import io.agentscope.core.message.URLSource;
+import io.agentscope.core.message.VideoBlock;
 import java.io.ByteArrayInputStream;
 import java.io.IOException;
 import java.io.OutputStream;
@@ -89,12 +95,20 @@ class DashScopeMultiModalToolTest {
     private static final String TEST_API_KEY = "test_api_key";
     private static final String TEXT_TO_IMAGE_PROMPT = "A small dog.";
     private static final String IMAGE_TO_TEXT_PROMPT = "Describe the image.";
+    private static final String TEXT_TO_VIDEO_PROMPT = "A smart cat is running in the moonlight.";
+    private static final String VIDEO_TO_TEXT_PROMPT = "Describe the video.";
     private static final String TEST_IMAGE0_URL = "https://example.com/image0.png";
     private static final String TEST_IMAGE1_URL = "https://example.com/image1.png";
     private static final String TEST_IMAGE_PATH = "/path/image.png";
+    private static final String TEST_IMAGE_BASE64_DATA_URL =
+            "data:image/png;base64,iVBORw0KGgoAAAANSUhEUgAABDg...";
     private static final String TEST_AUDIO_URL = "https://example.com/audio.wav";
     private static final String TEST_AUDIO_PATH = "/path/audio.wav";
-    private static final String TEST_AUDIO_TEXT = "text audio text";
+    private static final String TEST_AUDIO_TEXT = "test audio text";
+    private static final String TEST_VIDEO_URL = "https://example.com/video.mp4";
+    private static final String TEST_VIDEO_PATH = "/path/video.mp4";
+    private static final String TEST_VIDEO_BASE64_DATA_URL =
+            "data:video/mp4;base64,/9j/4AAQSkZJRgABAQAAAQABAAD/2wBDAA...";
     // base64 of "hello"
     private static final String TEST_BASE64_DATA = "aGVsbG8=";
     private static final String TEST_MULTI_MODAL_CONTENT = "This is a small dog.";
@@ -131,9 +145,9 @@ void testTextToImageUrlMode() {
                         toolResultBlock -> {
                             assertNotNull(toolResultBlock);
                             assertEquals(1, toolResultBlock.getOutput().size());
-                            assertTrue(toolResultBlock.getOutput().get(0) instanceof ImageBlock);
+                            assertInstanceOf(ImageBlock.class, toolResultBlock.getOutput().get(0));
                             ImageBlock imageBlock = (ImageBlock) toolResultBlock.getOutput().get(0);
-                            assertTrue(imageBlock.getSource() instanceof URLSource);
+                            assertInstanceOf(URLSource.class, imageBlock.getSource());
                             assertEquals(
                                     TEST_IMAGE0_URL, ((URLSource) imageBlock.getSource()).getUrl());
                         })
@@ -171,9 +185,9 @@ void testTextToImageBase64Mode() throws IOException {
                         toolResultBlock -> {
                             assertNotNull(toolResultBlock);
                             assertEquals(1, toolResultBlock.getOutput().size());
-                            assertTrue(toolResultBlock.getOutput().get(0) instanceof ImageBlock);
+                            assertInstanceOf(ImageBlock.class, toolResultBlock.getOutput().get(0));
                             ImageBlock imageBlock = (ImageBlock) toolResultBlock.getOutput().get(0);
-                            assertTrue(imageBlock.getSource() instanceof Base64Source);
+                            assertInstanceOf(Base64Source.class, imageBlock.getSource());
                             assertEquals(
                                     "image/png",
                                     ((Base64Source) imageBlock.getSource()).getMediaType());
@@ -215,17 +229,17 @@ void testTextToImageResponseMultiUrl() {
                         toolResultBlock -> {
                             assertNotNull(toolResultBlock);
                             assertEquals(2, toolResultBlock.getOutput().size());
-                            assertTrue(toolResultBlock.getOutput().get(0) instanceof ImageBlock);
-                            assertTrue(toolResultBlock.getOutput().get(1) instanceof ImageBlock);
+                            assertInstanceOf(ImageBlock.class, toolResultBlock.getOutput().get(0));
+                            assertInstanceOf(ImageBlock.class, toolResultBlock.getOutput().get(1));
                             ImageBlock image0Block =
                                     (ImageBlock) toolResultBlock.getOutput().get(0);
-                            assertTrue(image0Block.getSource() instanceof URLSource);
+                            assertInstanceOf(URLSource.class, image0Block.getSource());
                             assertEquals(
                                     TEST_IMAGE0_URL,
                                     ((URLSource) image0Block.getSource()).getUrl());
                             ImageBlock image1Block =
                                     (ImageBlock) toolResultBlock.getOutput().get(1);
-                            assertTrue(image1Block.getSource() instanceof URLSource);
+                            assertInstanceOf(URLSource.class, image1Block.getSource());
                             assertEquals(
                                     TEST_IMAGE1_URL,
                                     ((URLSource) image1Block.getSource()).getUrl());
@@ -259,7 +273,7 @@ void testTextToImageResponseEmpty() {
                         toolResultBlock -> {
                             assertNotNull(toolResultBlock);
                             assertEquals(1, toolResultBlock.getOutput().size());
-                            assertTrue(toolResultBlock.getOutput().get(0) instanceof TextBlock);
+                            assertInstanceOf(TextBlock.class, toolResultBlock.getOutput().get(0));
                             assertEquals(
                                     String.format("Error: %s", "Failed to generate images."),
                                     ((TextBlock) toolResultBlock.getOutput().get(0)).getText());
@@ -293,7 +307,7 @@ void testTextToImageResponseNull() {
                         toolResultBlock -> {
                             assertNotNull(toolResultBlock);
                             assertEquals(1, toolResultBlock.getOutput().size());
-                            assertTrue(toolResultBlock.getOutput().get(0) instanceof TextBlock);
+                            assertInstanceOf(TextBlock.class, toolResultBlock.getOutput().get(0));
                             assertEquals(
                                     String.format("Error: %s", "Failed to generate images."),
                                     ((TextBlock) toolResultBlock.getOutput().get(0)).getText());
@@ -309,9 +323,9 @@ void testTextToImageError() {
         MockedConstruction<ImageSynthesis> mockCtor =
                 mockConstruction(
                         ImageSynthesis.class,
-                        (mock, context) -> {
-                            when(mock.call(any(ImageSynthesisParam.class))).thenThrow(TEST_ERROR);
-                        });
+                        (mock, context) ->
+                                when(mock.call(any(ImageSynthesisParam.class)))
+                                        .thenThrow(TEST_ERROR));
 
         Mono<ToolResultBlock> result =
                 multiModalTool.dashscopeTextToImage(
@@ -322,7 +336,7 @@ void testTextToImageError() {
                         toolResultBlock -> {
                             assertNotNull(toolResultBlock);
                             assertEquals(1, toolResultBlock.getOutput().size());
-                            assertTrue(toolResultBlock.getOutput().get(0) instanceof TextBlock);
+                            assertInstanceOf(TextBlock.class, toolResultBlock.getOutput().get(0));
                             assertEquals(
                                     String.format("Error: %s", TEST_ERROR.getMessage()),
                                     ((TextBlock) toolResultBlock.getOutput().get(0)).getText());
@@ -370,7 +384,7 @@ void testImageToTextWithUrl() {
                         toolResultBlock -> {
                             assertNotNull(toolResultBlock);
                             assertEquals(1, toolResultBlock.getOutput().size());
-                            assertTrue(toolResultBlock.getOutput().get(0) instanceof TextBlock);
+                            assertInstanceOf(TextBlock.class, toolResultBlock.getOutput().get(0));
                             assertEquals(
                                     TEST_MULTI_MODAL_CONTENT,
                                     ((TextBlock) toolResultBlock.getOutput().get(0)).getText());
@@ -421,7 +435,7 @@ void testImageToTextWithFile() throws IOException {
                         toolResultBlock -> {
                             assertNotNull(toolResultBlock);
                             assertEquals(1, toolResultBlock.getOutput().size());
-                            assertTrue(toolResultBlock.getOutput().get(0) instanceof TextBlock);
+                            assertInstanceOf(TextBlock.class, toolResultBlock.getOutput().get(0));
                             assertEquals(
                                     TEST_MULTI_MODAL_CONTENT,
                                     ((TextBlock) toolResultBlock.getOutput().get(0)).getText());
@@ -432,6 +446,54 @@ void testImageToTextWithFile() throws IOException {
         mockedConv.close();
     }
 
+    @Test
+    @DisplayName("Image to text with base64 data url")
+    void testImageToTextWithBase64DataUrl() {
+        // The construction mock is scoped with try-with-resources so it is always released,
+        // even when an assertion fails; a leaked mock would break later tests on this thread.
+        try (MockedConstruction<MultiModalConversation> ignored =
+                mockConstruction(
+                        MultiModalConversation.class,
+                        (mock, context) -> {
+                            var mockResult = mock(MultiModalConversationResult.class);
+                            var mockOutput = mock(MultiModalConversationOutput.class);
+
+                            when(mockResult.getOutput()).thenReturn(mockOutput);
+                            var choice = new MultiModalConversationOutput.Choice();
+                            choice.setMessage(
+                                    MultiModalMessage.builder()
+                                            .content(
+                                                    List.of(
+                                                            Map.of(
+                                                                    "text",
+                                                                    TEST_MULTI_MODAL_CONTENT)))
+                                            .build());
+                            choice.setFinishReason("stop");
+                            when(mockOutput.getChoices()).thenReturn(List.of(choice));
+                            when(mock.call(any(MultiModalConversationParam.class)))
+                                    .thenReturn(mockResult);
+                        })) {
+            Mono<ToolResultBlock> result =
+                    multiModalTool.dashscopeImageToText(
+                            List.of(TEST_IMAGE_BASE64_DATA_URL),
+                            IMAGE_TO_TEXT_PROMPT,
+                            "qwen3-vl-plus");
+
+            StepVerifier.create(result)
+                    .assertNext(
+                            toolResultBlock -> {
+                                assertNotNull(toolResultBlock);
+                                assertEquals(1, toolResultBlock.getOutput().size());
+                                assertInstanceOf(
+                                        TextBlock.class, toolResultBlock.getOutput().get(0));
+                                assertEquals(
+                                        TEST_MULTI_MODAL_CONTENT,
+                                        ((TextBlock) toolResultBlock.getOutput().get(0)).getText());
+                            })
+                    .verifyComplete();
+        }
+    }
+
     @Test
     @DisplayName("Image to text with local file and web url")
     void testImageToTextWithFileAndUrl() throws IOException {
@@ -475,7 +537,7 @@ void testImageToTextWithFileAndUrl() throws IOException {
                         toolResultBlock -> {
                             assertNotNull(toolResultBlock);
                             assertEquals(1, toolResultBlock.getOutput().size());
-                            assertTrue(toolResultBlock.getOutput().get(0) instanceof TextBlock);
+                            assertInstanceOf(TextBlock.class, toolResultBlock.getOutput().get(0));
                         })
                 .verifyComplete();
 
@@ -515,7 +577,7 @@ void testImageToTextResponseEmpty() {
                         toolResultBlock -> {
                             assertNotNull(toolResultBlock);
                             assertEquals(1, toolResultBlock.getOutput().size());
-                            assertTrue(toolResultBlock.getOutput().get(0) instanceof TextBlock);
+                            assertInstanceOf(TextBlock.class, toolResultBlock.getOutput().get(0));
                             assertEquals(
                                     String.format("Error: %s", "Failed to generate text."),
                                     ((TextBlock) toolResultBlock.getOutput().get(0)).getText());
@@ -552,7 +614,7 @@ void testImageToTextResponseNull() {
                         toolResultBlock -> {
                             assertNotNull(toolResultBlock);
                             assertEquals(1, toolResultBlock.getOutput().size());
-                            assertTrue(toolResultBlock.getOutput().get(0) instanceof TextBlock);
+                            assertInstanceOf(TextBlock.class, toolResultBlock.getOutput().get(0));
                             assertEquals(
                                     String.format("Error: %s", "Failed to generate text."),
                                     ((TextBlock) toolResultBlock.getOutput().get(0)).getText());
@@ -568,10 +630,9 @@ void testImageToTextError() {
         MockedConstruction<MultiModalConversation> mockConv =
                 mockConstruction(
                         MultiModalConversation.class,
-                        (mock, context) -> {
-                            when(mock.call(any(MultiModalConversationParam.class)))
-                                    .thenThrow(TEST_ERROR);
-                        });
+                        (mock, context) ->
+                                when(mock.call(any(MultiModalConversationParam.class)))
+                                        .thenThrow(TEST_ERROR));
 
         Mono<ToolResultBlock> result =
                 multiModalTool.dashscopeImageToText(
@@ -582,7 +643,7 @@ void testImageToTextError() {
                         toolResultBlock -> {
                             assertNotNull(toolResultBlock);
                             assertEquals(1, toolResultBlock.getOutput().size());
-                            assertTrue(toolResultBlock.getOutput().get(0) instanceof TextBlock);
+                            assertInstanceOf(TextBlock.class, toolResultBlock.getOutput().get(0));
                             assertEquals(
                                     String.format("Error: %s", TEST_ERROR.getMessage()),
                                     ((TextBlock) toolResultBlock.getOutput().get(0)).getText());
@@ -612,9 +673,9 @@ void testTextToAudioWithSambertSuccess() {
                         toolResultBlock -> {
                             assertNotNull(toolResultBlock);
                             assertEquals(1, toolResultBlock.getOutput().size());
-                            assertTrue(toolResultBlock.getOutput().get(0) instanceof AudioBlock);
+                            assertInstanceOf(AudioBlock.class, toolResultBlock.getOutput().get(0));
                             AudioBlock audioBlock = (AudioBlock) toolResultBlock.getOutput().get(0);
-                            assertTrue(audioBlock.getSource() instanceof Base64Source);
+                            assertInstanceOf(Base64Source.class, audioBlock.getSource());
                             assertEquals(
                                     TEST_BASE64_DATA,
                                     ((Base64Source) audioBlock.getSource()).getData());
@@ -643,15 +704,17 @@ private ToolResultBlock invokeParseQwenTTSResponse(String responseBody) throws E
         @DisplayName("Parse Qwen TTS response with URL")
         void testParseQwenTTSResponseWithUrl() throws Exception {
             String responseJson =
-                    "{\"output\":{\"audio\":{\"url\":\"https://example.com/audio.wav\"}},\"request_id\":\"test-request-id\"}";
+                    """
+                    {"output":{"audio":{"url":"https://example.com/audio.wav"}},"request_id":"test-request-id"}
+                    """;
 
             ToolResultBlock result = invokeParseQwenTTSResponse(responseJson);
 
             assertNotNull(result);
             assertEquals(1, result.getOutput().size());
-            assertTrue(result.getOutput().get(0) instanceof AudioBlock);
+            assertInstanceOf(AudioBlock.class, result.getOutput().get(0));
             AudioBlock audioBlock = (AudioBlock) result.getOutput().get(0);
-            assertTrue(audioBlock.getSource() instanceof URLSource);
+            assertInstanceOf(URLSource.class, audioBlock.getSource());
             assertEquals(
                     "https://example.com/audio.wav", ((URLSource) audioBlock.getSource()).getUrl());
         }
@@ -669,9 +732,9 @@ void testParseQwenTTSResponseWithBase64() throws Exception {
 
             assertNotNull(result);
             assertEquals(1, result.getOutput().size());
-            assertTrue(result.getOutput().get(0) instanceof AudioBlock);
+            assertInstanceOf(AudioBlock.class, result.getOutput().get(0));
             AudioBlock audioBlock = (AudioBlock) result.getOutput().get(0);
-            assertTrue(audioBlock.getSource() instanceof Base64Source);
+            assertInstanceOf(Base64Source.class, audioBlock.getSource());
             assertEquals(testBase64, ((Base64Source) audioBlock.getSource()).getData());
             assertEquals("audio/wav", ((Base64Source) audioBlock.getSource()).getMediaType());
         }
@@ -685,7 +748,7 @@ void testParseQwenTTSResponseWithError() throws Exception {
 
             assertNotNull(result);
             assertEquals(1, result.getOutput().size());
-            assertTrue(result.getOutput().get(0) instanceof TextBlock);
+            assertInstanceOf(TextBlock.class, result.getOutput().get(0));
             assertTrue(
                     ((TextBlock) result.getOutput().get(0)).getText().contains("Invalid request"));
         }
@@ -699,7 +762,7 @@ void testParseQwenTTSResponseMissingOutput() throws Exception {
 
             assertNotNull(result);
             assertEquals(1, result.getOutput().size());
-            assertTrue(result.getOutput().get(0) instanceof TextBlock);
+            assertInstanceOf(TextBlock.class, result.getOutput().get(0));
             assertTrue(
                     ((TextBlock) result.getOutput().get(0))
                             .getText()
@@ -715,7 +778,7 @@ void testParseQwenTTSResponseMissingAudio() throws Exception {
 
             assertNotNull(result);
             assertEquals(1, result.getOutput().size());
-            assertTrue(result.getOutput().get(0) instanceof TextBlock);
+            assertInstanceOf(TextBlock.class, result.getOutput().get(0));
             assertTrue(
                     ((TextBlock) result.getOutput().get(0))
                             .getText()
@@ -731,7 +794,7 @@ void testParseQwenTTSResponseNoAudioData() throws Exception {
 
             assertNotNull(result);
             assertEquals(1, result.getOutput().size());
-            assertTrue(result.getOutput().get(0) instanceof TextBlock);
+            assertInstanceOf(TextBlock.class, result.getOutput().get(0));
             assertTrue(
                     ((TextBlock) result.getOutput().get(0))
                             .getText()
@@ -747,7 +810,7 @@ void testParseQwenTTSResponseInvalidJson() throws Exception {
 
             assertNotNull(result);
             assertEquals(1, result.getOutput().size());
-            assertTrue(result.getOutput().get(0) instanceof TextBlock);
+            assertInstanceOf(TextBlock.class, result.getOutput().get(0));
             assertTrue(
                     ((TextBlock) result.getOutput().get(0))
                             .getText()
@@ -763,7 +826,7 @@ void testParseQwenTTSResponseErrorNoMessage() throws Exception {
 
             assertNotNull(result);
             assertEquals(1, result.getOutput().size());
-            assertTrue(result.getOutput().get(0) instanceof TextBlock);
+            assertInstanceOf(TextBlock.class, result.getOutput().get(0));
             assertTrue(((TextBlock) result.getOutput().get(0)).getText().contains("Unknown error"));
         }
     }
@@ -796,10 +859,9 @@ void testTextToAudioResponseEmpty() {
         MockedConstruction<SpeechSynthesizer> mockCtor =
                 Mockito.mockConstruction(
                         SpeechSynthesizer.class,
-                        (mock, context) -> {
-                            when(mock.call(any(SpeechSynthesisParam.class)))
-                                    .thenReturn(ByteBuffer.allocate(0));
-                        });
+                        (mock, context) ->
+                                when(mock.call(any(SpeechSynthesisParam.class)))
+                                        .thenReturn(ByteBuffer.allocate(0)));
 
         Mono<ToolResultBlock> result =
                 multiModalTool.dashscopeTextToAudio(
@@ -810,7 +872,7 @@ void testTextToAudioResponseEmpty() {
                         toolResultBlock -> {
                             assertNotNull(toolResultBlock);
                             assertEquals(1, toolResultBlock.getOutput().size());
-                            assertTrue(toolResultBlock.getOutput().get(0) instanceof TextBlock);
+                            assertInstanceOf(TextBlock.class, toolResultBlock.getOutput().get(0));
                             assertEquals(
                                     String.format("Error: %s", "Failed to generate audio."),
                                     ((TextBlock) toolResultBlock.getOutput().get(0)).getText());
@@ -826,9 +888,8 @@ void testTextToAudioResponseNull() {
         MockedConstruction<SpeechSynthesizer> mockCtor =
                 Mockito.mockConstruction(
                         SpeechSynthesizer.class,
-                        (mock, context) -> {
-                            when(mock.call(any(SpeechSynthesisParam.class))).thenReturn(null);
-                        });
+                        (mock, context) ->
+                                when(mock.call(any(SpeechSynthesisParam.class))).thenReturn(null));
 
         Mono<ToolResultBlock> result =
                 multiModalTool.dashscopeTextToAudio(
@@ -839,7 +900,7 @@ void testTextToAudioResponseNull() {
                         toolResultBlock -> {
                             assertNotNull(toolResultBlock);
                             assertEquals(1, toolResultBlock.getOutput().size());
-                            assertTrue(toolResultBlock.getOutput().get(0) instanceof TextBlock);
+                            assertInstanceOf(TextBlock.class, toolResultBlock.getOutput().get(0));
                             assertEquals(
                                     String.format("Error: %s", "Failed to generate audio."),
                                     ((TextBlock) toolResultBlock.getOutput().get(0)).getText());
@@ -855,9 +916,9 @@ void testTextToAudioError() {
         MockedConstruction<SpeechSynthesizer> mockCtor =
                 Mockito.mockConstruction(
                         SpeechSynthesizer.class,
-                        (mock, context) -> {
-                            when(mock.call(any(SpeechSynthesisParam.class))).thenThrow(TEST_ERROR);
-                        });
+                        (mock, context) ->
+                                when(mock.call(any(SpeechSynthesisParam.class)))
+                                        .thenThrow(TEST_ERROR));
 
         Mono<ToolResultBlock> result =
                 multiModalTool.dashscopeTextToAudio(
@@ -868,7 +929,7 @@ void testTextToAudioError() {
                         toolResultBlock -> {
                             assertNotNull(toolResultBlock);
                             assertEquals(1, toolResultBlock.getOutput().size());
-                            assertTrue(toolResultBlock.getOutput().get(0) instanceof TextBlock);
+                            assertInstanceOf(TextBlock.class, toolResultBlock.getOutput().get(0));
                             assertEquals(
                                     String.format("Error: %s", TEST_ERROR.getMessage()),
                                     ((TextBlock) toolResultBlock.getOutput().get(0)).getText());
@@ -920,7 +981,7 @@ void testAudioToTextWithUrl() throws Exception {
                         toolResultBlock -> {
                             assertNotNull(toolResultBlock);
                             assertEquals(1, toolResultBlock.getOutput().size());
-                            assertTrue(toolResultBlock.getOutput().get(0) instanceof TextBlock);
+                            assertInstanceOf(TextBlock.class, toolResultBlock.getOutput().get(0));
                             assertEquals(
                                     TEST_AUDIO_TEXT,
                                     ((TextBlock) toolResultBlock.getOutput().get(0)).getText());
@@ -972,7 +1033,7 @@ void testAudioToTextWithFile() throws Exception {
                         toolResultBlock -> {
                             assertNotNull(toolResultBlock);
                             assertEquals(1, toolResultBlock.getOutput().size());
-                            assertTrue(toolResultBlock.getOutput().get(0) instanceof TextBlock);
+                            assertInstanceOf(TextBlock.class, toolResultBlock.getOutput().get(0));
                             assertEquals(
                                     TEST_AUDIO_TEXT,
                                     ((TextBlock) toolResultBlock.getOutput().get(0)).getText());
@@ -1010,7 +1071,7 @@ void testAudioToTextError() throws Exception {
                         toolResultBlock -> {
                             assertNotNull(toolResultBlock);
                             assertEquals(1, toolResultBlock.getOutput().size());
-                            assertTrue(toolResultBlock.getOutput().get(0) instanceof TextBlock);
+                            assertInstanceOf(TextBlock.class, toolResultBlock.getOutput().get(0));
                             assertEquals(
                                     String.format("Error: %s", TEST_ERROR.getMessage()),
                                     ((TextBlock) toolResultBlock.getOutput().get(0)).getText());
@@ -1055,4 +1116,831 @@ void testSendChunkAudioWithFile() throws Exception {
 
         Files.deleteIfExists(tempAudioFile);
     }
+
+    @Test
+    @DisplayName("Should return a video url when text to video invoked success")
+    void testTextToVideoUrl() {
+        // try-with-resources releases the construction mock even if an assertion below fails.
+        try (MockedConstruction<VideoSynthesis> mockCtor =
+                mockConstruction(
+                        VideoSynthesis.class,
+                        (mock, context) -> {
+                            VideoSynthesisResult mockResult = mock(VideoSynthesisResult.class);
+                            VideoSynthesisOutput mockOutput = mock(VideoSynthesisOutput.class);
+
+                            when(mock.call(any(VideoSynthesisParam.class))).thenReturn(mockResult);
+                            when(mockResult.getOutput()).thenReturn(mockOutput);
+                            when(mockOutput.getVideoUrl()).thenReturn(TEST_VIDEO_URL);
+                        })) {
+            Mono<ToolResultBlock> result =
+                    multiModalTool.dashscopeTextToVideo(
+                            TEXT_TO_VIDEO_PROMPT,
+                            "wan2.6-t2v",
+                            "low quality",
+                            TEST_AUDIO_URL,
+                            "1920*1080",
+                            5,
+                            "single",
+                            true,
+                            false,
+                            0);
+
+            StepVerifier.create(result)
+                    .assertNext(
+                            toolResultBlock -> {
+                                assertNotNull(toolResultBlock);
+                                assertEquals(1, toolResultBlock.getOutput().size());
+                                assertInstanceOf(
+                                        VideoBlock.class, toolResultBlock.getOutput().get(0));
+                                VideoBlock vb = (VideoBlock) toolResultBlock.getOutput().get(0);
+                                assertInstanceOf(URLSource.class, vb.getSource());
+                                assertEquals(TEST_VIDEO_URL, ((URLSource) vb.getSource()).getUrl());
+                            })
+                    .verifyComplete();
+        }
+    }
+
+    @Test
+    @DisplayName("Should return error TextBlock when call text to video response null")
+    void testTextToVideoResponseNull() {
+        // try-with-resources releases the construction mock even if an assertion below fails.
+        try (MockedConstruction<VideoSynthesis> mockCtor =
+                mockConstruction(
+                        VideoSynthesis.class,
+                        (mock, context) -> {
+                            VideoSynthesisResult mockResult = mock(VideoSynthesisResult.class);
+                            VideoSynthesisOutput mockOutput = mock(VideoSynthesisOutput.class);
+
+                            when(mock.call(any(VideoSynthesisParam.class))).thenReturn(mockResult);
+                            when(mockResult.getOutput()).thenReturn(mockOutput);
+                            when(mockOutput.getVideoUrl()).thenReturn(null);
+                        })) {
+            Mono<ToolResultBlock> result =
+                    multiModalTool.dashscopeTextToVideo(
+                            TEXT_TO_VIDEO_PROMPT,
+                            "wan2.6-t2v",
+                            "low quality",
+                            TEST_AUDIO_URL,
+                            "1920*1080",
+                            5,
+                            "single",
+                            true,
+                            false,
+                            0);
+
+            StepVerifier.create(result)
+                    .assertNext(
+                            toolResultBlock -> {
+                                assertNotNull(toolResultBlock);
+                                assertEquals(1, toolResultBlock.getOutput().size());
+                                assertInstanceOf(
+                                        TextBlock.class, toolResultBlock.getOutput().get(0));
+                                assertEquals(
+                                        String.format("Error: %s", "Failed to generate video."),
+                                        ((TextBlock) toolResultBlock.getOutput().get(0)).getText());
+                            })
+                    .verifyComplete();
+        }
+    }
+
+    @Test
+    @DisplayName("Should return error TextBlock when call text to video occurs error")
+    void testTextToVideoError() {
+        // try-with-resources releases the construction mock even if an assertion below fails.
+        try (MockedConstruction<VideoSynthesis> mockCtor =
+                mockConstruction(
+                        VideoSynthesis.class,
+                        (mock, context) ->
+                                when(mock.call(any(VideoSynthesisParam.class)))
+                                        .thenThrow(TEST_ERROR))) {
+            Mono<ToolResultBlock> result =
+                    multiModalTool.dashscopeTextToVideo(
+                            TEXT_TO_VIDEO_PROMPT,
+                            "wan2.6-t2v",
+                            "low quality",
+                            TEST_AUDIO_URL,
+                            "1920*1080",
+                            5,
+                            "single",
+                            true,
+                            false,
+                            0);
+
+            StepVerifier.create(result)
+                    .assertNext(
+                            toolResultBlock -> {
+                                assertNotNull(toolResultBlock);
+                                assertEquals(1, toolResultBlock.getOutput().size());
+                                assertInstanceOf(
+                                        TextBlock.class, toolResultBlock.getOutput().get(0));
+                                assertEquals(
+                                        String.format("Error: %s", TEST_ERROR.getMessage()),
+                                        ((TextBlock) toolResultBlock.getOutput().get(0)).getText());
+                            })
+                    .verifyComplete();
+        }
+    }
+
+    @Test
+    @DisplayName("Image to video with image url")
+    void testImageToVideoWithUrl() {
+        // try-with-resources releases the construction mock even if an assertion below fails.
+        try (MockedConstruction<VideoSynthesis> mockCtor =
+                mockConstruction(
+                        VideoSynthesis.class,
+                        (mock, context) -> {
+                            VideoSynthesisResult mockResult = mock(VideoSynthesisResult.class);
+                            VideoSynthesisOutput mockOutput = mock(VideoSynthesisOutput.class);
+
+                            when(mock.call(any(VideoSynthesisParam.class))).thenReturn(mockResult);
+                            when(mockResult.getOutput()).thenReturn(mockOutput);
+                            when(mockOutput.getVideoUrl()).thenReturn(TEST_VIDEO_URL);
+                        })) {
+            Mono<ToolResultBlock> result =
+                    multiModalTool.dashscopeImageToVideo(
+                            TEXT_TO_VIDEO_PROMPT,
+                            "wan2.6-i2v-flash",
+                            TEST_IMAGE0_URL,
+                            TEST_AUDIO_URL,
+                            "",
+                            "hanfu-1",
+                            "480P",
+                            10,
+                            "single",
+                            true,
+                            true,
+                            false,
+                            0);
+
+            StepVerifier.create(result)
+                    .assertNext(
+                            toolResultBlock -> {
+                                assertNotNull(toolResultBlock);
+                                assertEquals(1, toolResultBlock.getOutput().size());
+                                assertInstanceOf(
+                                        VideoBlock.class, toolResultBlock.getOutput().get(0));
+                                VideoBlock vb = (VideoBlock) toolResultBlock.getOutput().get(0);
+                                assertInstanceOf(URLSource.class, vb.getSource());
+                                assertEquals(TEST_VIDEO_URL, ((URLSource) vb.getSource()).getUrl());
+                            })
+                    .verifyComplete();
+        }
+    }
+
+    @Test
+    @DisplayName("Image to video with local image file")
+    void testImageToVideoWithFile() throws IOException {
+        MockedConstruction.MockInitializer<VideoSynthesis> stubVideoUrl =
+                (mock, context) -> {
+                    VideoSynthesisResult mockResult = mock(VideoSynthesisResult.class);
+                    VideoSynthesisOutput mockOutput = mock(VideoSynthesisOutput.class);
+                    when(mock.call(any(VideoSynthesisParam.class))).thenReturn(mockResult);
+                    when(mockResult.getOutput()).thenReturn(mockOutput);
+                    when(mockOutput.getVideoUrl()).thenReturn(TEST_VIDEO_URL);
+                };
+
+        // Both mocks live in try-with-resources so they are released even if an assertion fails.
+        try (MockedStatic<MediaUtils> mockMediaUtils = mockStatic(MediaUtils.class);
+                MockedConstruction<VideoSynthesis> mockCtor =
+                        mockConstruction(VideoSynthesis.class, stubVideoUrl)) {
+            when(MediaUtils.urlToProtocolUrl(TEST_IMAGE_PATH))
+                    .thenReturn("file://" + TEST_IMAGE_PATH);
+
+            Mono<ToolResultBlock> result =
+                    multiModalTool.dashscopeImageToVideo(
+                            TEXT_TO_VIDEO_PROMPT,
+                            "wan2.6-i2v-flash",
+                            TEST_IMAGE_PATH,
+                            TEST_AUDIO_URL,
+                            "",
+                            "hanfu-1",
+                            "480P",
+                            10,
+                            "single",
+                            true,
+                            true,
+                            false,
+                            0);
+
+            StepVerifier.create(result)
+                    .assertNext(
+                            toolResultBlock -> {
+                                assertNotNull(toolResultBlock);
+                                assertEquals(1, toolResultBlock.getOutput().size());
+                                assertInstanceOf(
+                                        VideoBlock.class, toolResultBlock.getOutput().get(0));
+                                VideoBlock vb = (VideoBlock) toolResultBlock.getOutput().get(0);
+                                assertInstanceOf(URLSource.class, vb.getSource());
+                                assertEquals(TEST_VIDEO_URL, ((URLSource) vb.getSource()).getUrl());
+                            })
+                    .verifyComplete();
+        }
+    }
+
+    @Test
+    @DisplayName("Image to video with base64 data url")
+    void testImageToVideoWithBase64DataUrl() {
+        // try-with-resources releases the construction mock even if an assertion below fails.
+        try (MockedConstruction<VideoSynthesis> mockCtor =
+                mockConstruction(
+                        VideoSynthesis.class,
+                        (mock, context) -> {
+                            VideoSynthesisResult mockResult = mock(VideoSynthesisResult.class);
+                            VideoSynthesisOutput mockOutput = mock(VideoSynthesisOutput.class);
+
+                            when(mock.call(any(VideoSynthesisParam.class))).thenReturn(mockResult);
+                            when(mockResult.getOutput()).thenReturn(mockOutput);
+                            when(mockOutput.getVideoUrl()).thenReturn(TEST_VIDEO_URL);
+                        })) {
+            Mono<ToolResultBlock> result =
+                    multiModalTool.dashscopeImageToVideo(
+                            TEXT_TO_VIDEO_PROMPT,
+                            "wan2.6-i2v-flash",
+                            TEST_IMAGE_BASE64_DATA_URL,
+                            TEST_AUDIO_URL,
+                            "",
+                            "hanfu-1",
+                            "480P",
+                            10,
+                            "single",
+                            true,
+                            true,
+                            false,
+                            0);
+
+            StepVerifier.create(result)
+                    .assertNext(
+                            toolResultBlock -> {
+                                assertNotNull(toolResultBlock);
+                                assertEquals(1, toolResultBlock.getOutput().size());
+                                assertInstanceOf(
+                                        VideoBlock.class, toolResultBlock.getOutput().get(0));
+                                VideoBlock vb = (VideoBlock) toolResultBlock.getOutput().get(0);
+                                assertInstanceOf(URLSource.class, vb.getSource());
+                                assertEquals(TEST_VIDEO_URL, ((URLSource) vb.getSource()).getUrl());
+                            })
+                    .verifyComplete();
+        }
+    }
+
+    @Test
+    @DisplayName("Should return error TextBlock when call image to video response" + " null")
+    void testImageToVideoResponseNull() {
+        // try-with-resources releases the construction mock even if an assertion below fails.
+        try (MockedConstruction<VideoSynthesis> mockCtor =
+                mockConstruction(
+                        VideoSynthesis.class,
+                        (mock, context) -> {
+                            VideoSynthesisResult mockResult = mock(VideoSynthesisResult.class);
+                            VideoSynthesisOutput mockOutput = mock(VideoSynthesisOutput.class);
+
+                            when(mock.call(any(VideoSynthesisParam.class))).thenReturn(mockResult);
+                            when(mockResult.getOutput()).thenReturn(mockOutput);
+                            when(mockOutput.getVideoUrl()).thenReturn(null);
+                        })) {
+            Mono<ToolResultBlock> result =
+                    multiModalTool.dashscopeImageToVideo(
+                            TEXT_TO_VIDEO_PROMPT,
+                            "wan2.6-i2v-flash",
+                            TEST_IMAGE0_URL,
+                            TEST_AUDIO_URL,
+                            "",
+                            "hanfu-1",
+                            "480P",
+                            10,
+                            "single",
+                            true,
+                            true,
+                            false,
+                            0);
+
+            StepVerifier.create(result)
+                    .assertNext(
+                            toolResultBlock -> {
+                                assertNotNull(toolResultBlock);
+                                assertEquals(1, toolResultBlock.getOutput().size());
+                                assertInstanceOf(
+                                        TextBlock.class, toolResultBlock.getOutput().get(0));
+                                assertEquals(
+                                        String.format("Error: %s", "Failed to generate video."),
+                                        ((TextBlock) toolResultBlock.getOutput().get(0)).getText());
+                            })
+                    .verifyComplete();
+        }
+    }
+
+    @Test
+    @DisplayName("Should return error TextBlock when call image to video occurs" + " error")
+    void testImageToVideoError() {
+        // try-with-resources releases the construction mock even if an assertion below fails.
+        try (MockedConstruction<VideoSynthesis> mockCtor =
+                mockConstruction(
+                        VideoSynthesis.class,
+                        (mock, context) ->
+                                when(mock.call(any(VideoSynthesisParam.class)))
+                                        .thenThrow(TEST_ERROR))) {
+            Mono<ToolResultBlock> result =
+                    multiModalTool.dashscopeImageToVideo(
+                            TEXT_TO_VIDEO_PROMPT,
+                            "wan2.6-i2v-flash",
+                            TEST_IMAGE0_URL,
+                            TEST_AUDIO_URL,
+                            "",
+                            "hanfu-1",
+                            "480P",
+                            10,
+                            "single",
+                            true,
+                            true,
+                            false,
+                            0);
+
+            StepVerifier.create(result)
+                    .assertNext(
+                            toolResultBlock -> {
+                                assertNotNull(toolResultBlock);
+                                assertEquals(1, toolResultBlock.getOutput().size());
+                                assertInstanceOf(
+                                        TextBlock.class, toolResultBlock.getOutput().get(0));
+                                assertEquals(
+                                        String.format("Error: %s", TEST_ERROR.getMessage()),
+                                        ((TextBlock) toolResultBlock.getOutput().get(0)).getText());
+                            })
+                    .verifyComplete();
+        }
+    }
+
+    @Test
+    @DisplayName("First and last frame image to video with image url")
+    void testFirstAndLastFrameImageToVideoWithUrl() {
+        // try-with-resources releases the construction mock even if an assertion below fails.
+        try (MockedConstruction<VideoSynthesis> mockCtor =
+                mockConstruction(
+                        VideoSynthesis.class,
+                        (mock, context) -> {
+                            VideoSynthesisResult mockResult = mock(VideoSynthesisResult.class);
+                            VideoSynthesisOutput mockOutput = mock(VideoSynthesisOutput.class);
+
+                            when(mock.call(any(VideoSynthesisParam.class))).thenReturn(mockResult);
+                            when(mockResult.getOutput()).thenReturn(mockOutput);
+                            when(mockOutput.getVideoUrl()).thenReturn(TEST_VIDEO_URL);
+                        })) {
+            Mono<ToolResultBlock> result =
+                    multiModalTool.dashscopeFirstAndLastFrameImageToVideo(
+                            TEXT_TO_VIDEO_PROMPT,
+                            "wan2.2-kf2v-flash",
+                            TEST_IMAGE0_URL,
+                            TEST_IMAGE1_URL,
+                            "",
+                            "hanfu-1",
+                            "480P",
+                            true,
+                            false,
+                            0);
+
+            StepVerifier.create(result)
+                    .assertNext(
+                            toolResultBlock -> {
+                                assertNotNull(toolResultBlock);
+                                assertEquals(1, toolResultBlock.getOutput().size());
+                                assertInstanceOf(
+                                        VideoBlock.class, toolResultBlock.getOutput().get(0));
+                                VideoBlock vb = (VideoBlock) toolResultBlock.getOutput().get(0);
+                                assertInstanceOf(URLSource.class, vb.getSource());
+                                assertEquals(TEST_VIDEO_URL, ((URLSource) vb.getSource()).getUrl());
+                            })
+                    .verifyComplete();
+        }
+    }
+
+    @Test
+    @DisplayName("First and last frame image to video with local image file")
+    void testFirstAndLastFrameImageToVideoWithFile() throws IOException {
+        MockedConstruction.MockInitializer<VideoSynthesis> stubVideoUrl =
+                (mock, context) -> {
+                    VideoSynthesisResult mockResult = mock(VideoSynthesisResult.class);
+                    VideoSynthesisOutput mockOutput = mock(VideoSynthesisOutput.class);
+                    when(mock.call(any(VideoSynthesisParam.class))).thenReturn(mockResult);
+                    when(mockResult.getOutput()).thenReturn(mockOutput);
+                    when(mockOutput.getVideoUrl()).thenReturn(TEST_VIDEO_URL);
+                };
+
+        // Both mocks live in try-with-resources so they are released even if an assertion fails.
+        try (MockedStatic<MediaUtils> mockMediaUtils = mockStatic(MediaUtils.class);
+                MockedConstruction<VideoSynthesis> mockCtor =
+                        mockConstruction(VideoSynthesis.class, stubVideoUrl)) {
+            when(MediaUtils.urlToProtocolUrl(TEST_IMAGE_PATH))
+                    .thenReturn("file://" + TEST_IMAGE_PATH);
+
+            Mono<ToolResultBlock> result =
+                    multiModalTool.dashscopeFirstAndLastFrameImageToVideo(
+                            TEXT_TO_VIDEO_PROMPT,
+                            "wan2.2-kf2v-flash",
+                            TEST_IMAGE_PATH,
+                            null,
+                            "",
+                            "hanfu-1",
+                            "480P",
+                            true,
+                            false,
+                            0);
+
+            StepVerifier.create(result)
+                    .assertNext(
+                            toolResultBlock -> {
+                                assertNotNull(toolResultBlock);
+                                assertEquals(1, toolResultBlock.getOutput().size());
+                                assertInstanceOf(
+                                        VideoBlock.class, toolResultBlock.getOutput().get(0));
+                                VideoBlock vb = (VideoBlock) toolResultBlock.getOutput().get(0);
+                                assertInstanceOf(URLSource.class, vb.getSource());
+                                assertEquals(TEST_VIDEO_URL, ((URLSource) vb.getSource()).getUrl());
+                            })
+                    .verifyComplete();
+        }
+    }
+
+    @Test
+    @DisplayName("First and last frame image to video with base64 data url")
+    void testFirstAndLastFrameImageToVideoWithBase64DataUrl() {
+        // try-with-resources releases the construction mock even if an assertion below fails.
+        try (MockedConstruction<VideoSynthesis> mockCtor =
+                mockConstruction(
+                        VideoSynthesis.class,
+                        (mock, context) -> {
+                            VideoSynthesisResult mockResult = mock(VideoSynthesisResult.class);
+                            VideoSynthesisOutput mockOutput = mock(VideoSynthesisOutput.class);
+
+                            when(mock.call(any(VideoSynthesisParam.class))).thenReturn(mockResult);
+                            when(mockResult.getOutput()).thenReturn(mockOutput);
+                            when(mockOutput.getVideoUrl()).thenReturn(TEST_VIDEO_URL);
+                        })) {
+            Mono<ToolResultBlock> result =
+                    multiModalTool.dashscopeFirstAndLastFrameImageToVideo(
+                            TEXT_TO_VIDEO_PROMPT,
+                            "wan2.2-kf2v-flash",
+                            TEST_IMAGE_BASE64_DATA_URL,
+                            null,
+                            "",
+                            "hanfu-1",
+                            "480P",
+                            true,
+                            false,
+                            0);
+
+            StepVerifier.create(result)
+                    .assertNext(
+                            toolResultBlock -> {
+                                assertNotNull(toolResultBlock);
+                                assertEquals(1, toolResultBlock.getOutput().size());
+                                assertInstanceOf(
+                                        VideoBlock.class, toolResultBlock.getOutput().get(0));
+                                VideoBlock vb = (VideoBlock) toolResultBlock.getOutput().get(0);
+                                assertInstanceOf(URLSource.class, vb.getSource());
+                                assertEquals(TEST_VIDEO_URL, ((URLSource) vb.getSource()).getUrl());
+                            })
+                    .verifyComplete();
+        }
+    }
+
+    @Test
+    @DisplayName(
+            "Should return error TextBlock when call first and last frame image to video response"
+                    + " null")
+    void testFirstAndLastFrameImageToVideoResponseNull() {
+        // try-with-resources releases the construction mock even if an assertion below fails.
+        try (MockedConstruction<VideoSynthesis> mockCtor =
+                mockConstruction(
+                        VideoSynthesis.class,
+                        (mock, context) -> {
+                            VideoSynthesisResult mockResult = mock(VideoSynthesisResult.class);
+                            VideoSynthesisOutput mockOutput = mock(VideoSynthesisOutput.class);
+
+                            when(mock.call(any(VideoSynthesisParam.class))).thenReturn(mockResult);
+                            when(mockResult.getOutput()).thenReturn(mockOutput);
+                            when(mockOutput.getVideoUrl()).thenReturn(null);
+                        })) {
+            Mono<ToolResultBlock> result =
+                    multiModalTool.dashscopeFirstAndLastFrameImageToVideo(
+                            TEXT_TO_VIDEO_PROMPT,
+                            "wan2.2-kf2v-flash",
+                            TEST_IMAGE_BASE64_DATA_URL,
+                            null,
+                            "",
+                            "hanfu-1",
+                            "480P",
+                            true,
+                            false,
+                            0);
+
+            StepVerifier.create(result)
+                    .assertNext(
+                            toolResultBlock -> {
+                                assertNotNull(toolResultBlock);
+                                assertEquals(1, toolResultBlock.getOutput().size());
+                                assertInstanceOf(
+                                        TextBlock.class, toolResultBlock.getOutput().get(0));
+                                assertEquals(
+                                        String.format("Error: %s", "Failed to generate video."),
+                                        ((TextBlock) toolResultBlock.getOutput().get(0)).getText());
+                            })
+                    .verifyComplete();
+        }
+    }
+
+    @Test
+    @DisplayName(
+            "Should return error TextBlock when call first and last frame image to video occurs"
+                    + " error")
+    void testFirstAndLastFrameImageToVideoError() {
+        // try-with-resources releases the construction mock even if an assertion below fails.
+        try (MockedConstruction<VideoSynthesis> mockCtor =
+                mockConstruction(
+                        VideoSynthesis.class,
+                        (mock, context) ->
+                                when(mock.call(any(VideoSynthesisParam.class)))
+                                        .thenThrow(TEST_ERROR))) {
+            Mono<ToolResultBlock> result =
+                    multiModalTool.dashscopeFirstAndLastFrameImageToVideo(
+                            TEXT_TO_VIDEO_PROMPT,
+                            "wan2.2-kf2v-flash",
+                            TEST_IMAGE_BASE64_DATA_URL,
+                            null,
+                            "",
+                            "hanfu-1",
+                            "480P",
+                            true,
+                            false,
+                            0);
+
+            StepVerifier.create(result)
+                    .assertNext(
+                            toolResultBlock -> {
+                                assertNotNull(toolResultBlock);
+                                assertEquals(1, toolResultBlock.getOutput().size());
+                                assertInstanceOf(
+                                        TextBlock.class, toolResultBlock.getOutput().get(0));
+                                assertEquals(
+                                        String.format("Error: %s", TEST_ERROR.getMessage()),
+                                        ((TextBlock) toolResultBlock.getOutput().get(0)).getText());
+                            })
+                    .verifyComplete();
+        }
+    }
+
+    @Test
+    @DisplayName("Video to text with video url")
+    void testVideoToTextWithUrl() {
+        // try-with-resources releases the construction mock even if an assertion below fails.
+        try (MockedConstruction<MultiModalConversation> mockConv =
+                mockConstruction(
+                        MultiModalConversation.class,
+                        (mock, context) -> {
+                            MultiModalConversationResult mockResult =
+                                    mock(MultiModalConversationResult.class);
+                            MultiModalConversationOutput mockOutput =
+                                    mock(MultiModalConversationOutput.class);
+
+                            when(mockResult.getOutput()).thenReturn(mockOutput);
+                            MultiModalConversationOutput.Choice choice =
+                                    new MultiModalConversationOutput.Choice();
+                            choice.setMessage(
+                                    MultiModalMessage.builder()
+                                            .content(
+                                                    List.of(
+                                                            Map.of(
+                                                                    "text",
+                                                                    TEST_MULTI_MODAL_CONTENT)))
+                                            .build());
+                            choice.setFinishReason("stop");
+                            when(mockOutput.getChoices()).thenReturn(List.of(choice));
+                            when(mock.call(any(MultiModalConversationParam.class)))
+                                    .thenReturn(mockResult);
+                        })) {
+            Mono<ToolResultBlock> result =
+                    multiModalTool.dashscopeVideoToText(
+                            TEST_VIDEO_URL, VIDEO_TO_TEXT_PROMPT, "qwen3.5-plus", 2.0);
+
+            StepVerifier.create(result)
+                    .assertNext(
+                            toolResultBlock -> {
+                                assertNotNull(toolResultBlock);
+                                assertEquals(1, toolResultBlock.getOutput().size());
+                                assertInstanceOf(
+                                        TextBlock.class, toolResultBlock.getOutput().get(0));
+                                assertEquals(
+                                        TEST_MULTI_MODAL_CONTENT,
+                                        ((TextBlock) toolResultBlock.getOutput().get(0)).getText());
+                            })
+                    .verifyComplete();
+        }
+    }
+
+    @Test
+    @DisplayName("Video to text with local video file")
+    void testVideoToTextWithFile() throws IOException {
+        // Stubs each MultiModalConversation constructed while the mock is active so that call()
+        // yields one "stop" choice whose message text part is TEST_MULTI_MODAL_CONTENT.
+        MockedConstruction.MockInitializer<MultiModalConversation> stubTextReply =
+                (mock, context) -> {
+                    MultiModalConversationResult mockResult =
+                            mock(MultiModalConversationResult.class);
+                    MultiModalConversationOutput mockOutput =
+                            mock(MultiModalConversationOutput.class);
+
+                    when(mockResult.getOutput()).thenReturn(mockOutput);
+                    MultiModalConversationOutput.Choice choice =
+                            new MultiModalConversationOutput.Choice();
+                    choice.setMessage(
+                            MultiModalMessage.builder()
+                                    .content(List.of(Map.of("text", TEST_MULTI_MODAL_CONTENT)))
+                                    .build());
+                    choice.setFinishReason("stop");
+                    when(mockOutput.getChoices()).thenReturn(List.of(choice));
+                    when(mock.call(any(MultiModalConversationParam.class))).thenReturn(mockResult);
+                };
+
+        // Both mocks are scoped with try-with-resources so they are released even if the
+        // stubbing or an assertion fails. The static MediaUtils stub is installed before the
+        // tool is invoked and maps the local path to a "file://" URL.
+        try (MockedStatic<MediaUtils> mockMediaUtils = mockStatic(MediaUtils.class);
+                MockedConstruction<MultiModalConversation> mockConv =
+                        mockConstruction(MultiModalConversation.class, stubTextReply)) {
+            when(MediaUtils.urlToProtocolUrl(TEST_VIDEO_PATH))
+                    .thenReturn("file://" + TEST_VIDEO_PATH);
+
+            Mono<ToolResultBlock> result =
+                    multiModalTool.dashscopeVideoToText(
+                            TEST_VIDEO_PATH, VIDEO_TO_TEXT_PROMPT, "qwen3.5-plus", 2.0);
+
+            StepVerifier.create(result)
+                    .assertNext(
+                            toolResultBlock -> {
+                                assertNotNull(toolResultBlock);
+                                assertEquals(1, toolResultBlock.getOutput().size());
+                                assertInstanceOf(
+                                        TextBlock.class, toolResultBlock.getOutput().get(0));
+                                assertEquals(
+                                        TEST_MULTI_MODAL_CONTENT,
+                                        ((TextBlock) toolResultBlock.getOutput().get(0)).getText());
+                            })
+                    .verifyComplete();
+        }
+    }
+
+    @Test
+    @DisplayName("Video to text with base64 data url")
+    void testVideoToTextWithBase64DataUrl() {
+        // try-with-resources releases the construction mock even if an assertion below fails.
+        try (MockedConstruction<MultiModalConversation> mockConv =
+                mockConstruction(
+                        MultiModalConversation.class,
+                        (mock, context) -> {
+                            MultiModalConversationResult mockResult =
+                                    mock(MultiModalConversationResult.class);
+                            MultiModalConversationOutput mockOutput =
+                                    mock(MultiModalConversationOutput.class);
+
+                            when(mockResult.getOutput()).thenReturn(mockOutput);
+                            MultiModalConversationOutput.Choice choice =
+                                    new MultiModalConversationOutput.Choice();
+                            choice.setMessage(
+                                    MultiModalMessage.builder()
+                                            .content(
+                                                    List.of(
+                                                            Map.of(
+                                                                    "text",
+                                                                    TEST_MULTI_MODAL_CONTENT)))
+                                            .build());
+                            choice.setFinishReason("stop");
+                            when(mockOutput.getChoices()).thenReturn(List.of(choice));
+                            when(mock.call(any(MultiModalConversationParam.class)))
+                                    .thenReturn(mockResult);
+                        })) {
+            Mono<ToolResultBlock> result =
+                    multiModalTool.dashscopeVideoToText(
+                            TEST_VIDEO_BASE64_DATA_URL, VIDEO_TO_TEXT_PROMPT, "qwen3.5-plus", 2.0);
+
+            StepVerifier.create(result)
+                    .assertNext(
+                            toolResultBlock -> {
+                                assertNotNull(toolResultBlock);
+                                assertEquals(1, toolResultBlock.getOutput().size());
+                                assertInstanceOf(
+                                        TextBlock.class, toolResultBlock.getOutput().get(0));
+                                assertEquals(
+                                        TEST_MULTI_MODAL_CONTENT,
+                                        ((TextBlock) toolResultBlock.getOutput().get(0)).getText());
+                            })
+                    .verifyComplete();
+        }
+    }
+
+    @Test
+    @DisplayName("Should return error TextBlock when call video to text response empty")
+    void testVideoToTextResponseEmpty() {
+        // try-with-resources deregisters the construction mock even if an assertion fails.
+        try (MockedConstruction<MultiModalConversation> ignored =
+                mockConstruction(
+                        MultiModalConversation.class,
+                        (mock, context) -> {
+                            MultiModalConversationResult mockResult =
+                                    mock(MultiModalConversationResult.class);
+                            MultiModalConversationOutput mockOutput =
+                                    mock(MultiModalConversationOutput.class);
+
+                            when(mockResult.getOutput()).thenReturn(mockOutput);
+                            MultiModalConversationOutput.Choice choice =
+                                    new MultiModalConversationOutput.Choice();
+                            choice.setMessage(
+                                    MultiModalMessage.builder().content(List.of()).build());
+                            choice.setFinishReason("stop");
+                            when(mockOutput.getChoices()).thenReturn(List.of(choice));
+                            when(mock.call(any(MultiModalConversationParam.class)))
+                                    .thenReturn(mockResult);
+                        })) {
+            Mono<ToolResultBlock> result =
+                    multiModalTool.dashscopeVideoToText(
+                            TEST_VIDEO_URL, VIDEO_TO_TEXT_PROMPT, "qwen3.5-plus", 2.0);
+
+            StepVerifier.create(result)
+                    .assertNext(
+                            toolResultBlock -> {
+                                assertNotNull(toolResultBlock);
+                                assertEquals(1, toolResultBlock.getOutput().size());
+                                assertInstanceOf(
+                                        TextBlock.class, toolResultBlock.getOutput().get(0));
+                                assertEquals(
+                                        String.format("Error: %s", "Failed to analyze video."),
+                                        ((TextBlock) toolResultBlock.getOutput().get(0)).getText());
+                            })
+                    .verifyComplete();
+        }
+    }
+
+    @Test
+    @DisplayName("Should return error TextBlock when call video to text response null")
+    void testVideoToTextResponseNull() {
+        // try-with-resources deregisters the construction mock even if an assertion fails.
+        try (MockedConstruction<MultiModalConversation> ignored =
+                mockConstruction(
+                        MultiModalConversation.class,
+                        (mock, context) -> {
+                            MultiModalConversationResult mockResult =
+                                    mock(MultiModalConversationResult.class);
+                            MultiModalConversationOutput mockOutput =
+                                    mock(MultiModalConversationOutput.class);
+
+                            when(mockResult.getOutput()).thenReturn(mockOutput);
+                            when(mockOutput.getChoices()).thenReturn(null);
+                            when(mock.call(any(MultiModalConversationParam.class)))
+                                    .thenReturn(mockResult);
+                        })) {
+            Mono<ToolResultBlock> result =
+                    multiModalTool.dashscopeVideoToText(
+                            TEST_VIDEO_URL, VIDEO_TO_TEXT_PROMPT, "qwen3.5-plus", 2.0);
+
+            StepVerifier.create(result)
+                    .assertNext(
+                            toolResultBlock -> {
+                                assertNotNull(toolResultBlock);
+                                assertEquals(1, toolResultBlock.getOutput().size());
+                                assertInstanceOf(
+                                        TextBlock.class, toolResultBlock.getOutput().get(0));
+                                assertEquals(
+                                        String.format("Error: %s", "Failed to analyze video."),
+                                        ((TextBlock) toolResultBlock.getOutput().get(0)).getText());
+                            })
+                    .verifyComplete();
+        }
+    }
+
+    @Test
+    @DisplayName("Should return error TextBlock when call video to text occurs error")
+    void testVideoToTextError() {
+        MockedConstruction<MultiModalConversation> mockConv =
+                mockConstruction(
+                        MultiModalConversation.class,
+                        (mock, context) ->
+                                when(mock.call(any(MultiModalConversationParam.class)))
+                                        .thenThrow(TEST_ERROR));
+
+        Mono<ToolResultBlock> result =
+                multiModalTool.dashscopeVideoToText(
+                        TEST_VIDEO_URL, VIDEO_TO_TEXT_PROMPT, "qwen3.5-plus", 2.0);
+
+        StepVerifier.create(result)
+                .assertNext(
+                        toolResultBlock -> {
+                            assertNotNull(toolResultBlock);
+                            assertEquals(1, toolResultBlock.getOutput().size());
+                            assertInstanceOf(TextBlock.class, toolResultBlock.getOutput().get(0));
+                            assertEquals(
+                                    String.format("Error: %s", TEST_ERROR.getMessage()),
+                                    ((TextBlock) toolResultBlock.getOutput().get(0)).getText());
+                        })
+                .verifyComplete();
+
+        mockConv.close();
+    }
 }
diff --git a/agentscope-core/src/test/resources/test_video.mp4 b/agentscope-core/src/test/resources/test_video.mp4
new file mode 100644
index 000000000..19fc876e5
Binary files /dev/null and b/agentscope-core/src/test/resources/test_video.mp4 differ
diff --git a/agentscope-examples/quickstart/src/main/java/io/agentscope/examples/quickstart/MultiModalToolExample.java b/agentscope-examples/quickstart/src/main/java/io/agentscope/examples/quickstart/MultiModalToolExample.java
new file mode 100644
index 000000000..e3ead77be
--- /dev/null
+++ b/agentscope-examples/quickstart/src/main/java/io/agentscope/examples/quickstart/MultiModalToolExample.java
@@ -0,0 +1,197 @@
+/*
+ * Copyright 2024-2026 the original author or authors.
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *      http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package io.agentscope.examples.quickstart;
+
+import io.agentscope.core.ReActAgent;
+import io.agentscope.core.formatter.dashscope.DashScopeChatFormatter;
+import io.agentscope.core.hook.Hook;
+import io.agentscope.core.hook.HookEvent;
+import io.agentscope.core.hook.PostActingEvent;
+import io.agentscope.core.hook.PreActingEvent;
+import io.agentscope.core.memory.InMemoryMemory;
+import io.agentscope.core.message.AudioBlock;
+import io.agentscope.core.message.Base64Source;
+import io.agentscope.core.message.ContentBlock;
+import io.agentscope.core.message.ImageBlock;
+import io.agentscope.core.message.Source;
+import io.agentscope.core.message.TextBlock;
+import io.agentscope.core.message.ToolResultBlock;
+import io.agentscope.core.message.URLSource;
+import io.agentscope.core.message.VideoBlock;
+import io.agentscope.core.model.DashScopeChatModel;
+import io.agentscope.core.tool.Toolkit;
+import io.agentscope.core.tool.multimodal.DashScopeMultiModalTool;
+import java.util.List;
+import reactor.core.publisher.Mono;
+
+/**
+ * MultiModalToolExample - Demonstrates how to equip an Agent with multimodal tools.
+ *
+ * <p>Registers {@link DashScopeMultiModalTool} in a {@link Toolkit}, builds a {@link ReActAgent}
+ * around it, prints the registered tools and sample prompts, and passes the agent to
+ * {@code ExampleUtils.startChat}. {@link ToolCallLoggingHook} prints each tool call and result.
+ */
+public class MultiModalToolExample {
+
+    /**
+     * Builds the tool-equipped agent and starts the chat session.
+     *
+     * @param args command-line arguments (not used)
+     * @throws Exception if any step of the setup or the chat session fails
+     */
+    public static void main(String[] args) throws Exception {
+        // Print welcome message
+        ExampleUtils.printWelcome(
+                "MultiModal Tool Calling Example",
+                "This example demonstrates how to equip an Agent with multimodal tools.\n"
+                        + "The agent has image, audio and video multimodal tools.");
+
+        // Get API key
+        String apiKey = ExampleUtils.getDashScopeApiKey();
+
+        // Create and register tools
+        Toolkit toolkit = new Toolkit();
+        toolkit.registerTool(new DashScopeMultiModalTool(apiKey));
+        printRegisterTools();
+
+        // Create Agent with tools
+        ReActAgent agent =
+                ReActAgent.builder()
+                        .name("MultiModalToolAgent")
+                        .sysPrompt(
+                                "You are a helpful assistant with access to multimodal"
+                                        + " tools. Use tools when needed to answer questions"
+                                        + " accurately. Always explain what you're doing when using"
+                                        + " tools.")
+                        .model(
+                                DashScopeChatModel.builder()
+                                        .apiKey(apiKey)
+                                        .modelName("qwen-plus")
+                                        .stream(true)
+                                        .enableThinking(false)
+                                        .formatter(new DashScopeChatFormatter())
+                                        .build())
+                        .hook(new ToolCallLoggingHook())
+                        .toolkit(toolkit)
+                        .memory(new InMemoryMemory())
+                        .build();
+
+        printExamplePrompts();
+
+        ExampleUtils.startChat(agent);
+    }
+
+    /** Prints the name and a one-line description of each registered multimodal tool. */
+    private static void printRegisterTools() {
+        String registeredTools =
+                """
+                Registered tools:
+                - dashscope_text_to_image: Generate image(s) based on the given text.
+                - dashscope_image_to_text: Generate text based on the given images.
+                - dashscope_text_to_audio: Convert the given text to audio.
+                - dashscope_audio_to_text: Convert the given audio to text.
+                - dashscope_text_to_video: Generate video based on the given text prompt.
+                - dashscope_image_to_video: Generate a video from a single input image and an optional text prompt.
+                - dashscope_first_and_last_frame_image_to_video: Generate video transitioning from a first frame to a last frame and an optional text prompt.
+                - dashscope_video_to_text: Analyze video and generate a text description or answer questions based on the video content.
+                """;
+
+        System.out.println(registeredTools);
+        System.out.println("\n");
+    }
+
+    /** Prints one sample user prompt for each registered multimodal tool. */
+    private static void printExamplePrompts() {
+        String examplePrompts =
+                """
+                Example Prompts:
+                [dashscope_text_to_image]:
+                Generate a black dog image url.
+                [dashscope_image_to_text]:
+                Describe the image url of 'https://dashscope.oss-cn-beijing.aliyuncs.com/images/tiger.png'.
+                [dashscope_text_to_audio]:
+                Convert the texts of 'hello, qwen!' to audio url.
+                [dashscope_audio_to_text]:
+                Convert the audio url of 'https://dashscope.oss-cn-beijing.aliyuncs.com/samples/audio/paraformer/hello_world_male2.wav' to text.
+                [dashscope_text_to_video]:
+                Generate a smart cat is running in the moonlight video.
+                [dashscope_image_to_video]:
+                Generate a video that a tiger is running in moonlight based on the image url of 'https://dashscope.oss-cn-beijing.aliyuncs.com/images/tiger.png'.
+                [dashscope_first_and_last_frame_image_to_video]:
+                Generate a video that a black kitten curiously looking at the sky based on the first frame image url of 'https://wanx.alicdn.com/material/20250318/first_frame.png' and the last frame image url of 'https://wanx.alicdn.com/material/20250318/last_frame.png'.
+                [dashscope_video_to_text]:
+                Describe the video url of 'https://help-static-aliyun-doc.aliyuncs.com/file-manage-files/zh-CN/20241115/cqqkru/1.mp4'.
+                """;
+        System.out.println(examplePrompts);
+        System.out.println("\n");
+    }
+
+    /** Hook that prints each tool invocation and the content blocks of its result. */
+    static class ToolCallLoggingHook implements Hook {
+
+        // Shared prefix of every tool-result message printed by this hook.
+        private static final String RESULT_PREFIX = "\n[HOOK] PostActingEvent - Tool Result: \n";
+
+        @Override
+        public <T extends HookEvent> Mono<T> onEvent(T event) {
+            if (event instanceof PreActingEvent preActing) {
+                System.out.println(
+                        "\n[HOOK] PreActingEvent - Tool: "
+                                + preActing.getToolUse().getName()
+                                + ", Input: "
+                                + preActing.getToolUse().getInput());
+            } else if (event instanceof PostActingEvent postActing) {
+                logToolResult(postActing.getToolResult());
+            }
+            return Mono.just(event);
+        }
+
+        /** Prints each content block of a non-empty tool result, then a trailing separator. */
+        private static void logToolResult(ToolResultBlock toolResult) {
+            List<ContentBlock> contentBlocks = toolResult.getOutput();
+            if (contentBlocks == null || contentBlocks.isEmpty()) {
+                return;
+            }
+            for (ContentBlock block : contentBlocks) {
+                if (block instanceof ImageBlock image) {
+                    logMediaSource("Image", image.getSource());
+                } else if (block instanceof AudioBlock audio) {
+                    logMediaSource("Audio", audio.getSource());
+                } else if (block instanceof VideoBlock video) {
+                    logMediaSource("Video", video.getSource());
+                } else if (block instanceof TextBlock text) {
+                    System.out.println(RESULT_PREFIX + "Text: " + text.getText());
+                }
+            }
+            System.out.println("\n");
+        }
+
+        /**
+         * Prints the URL or Base64 payload of a media source; other source kinds are ignored.
+         *
+         * @param mediaLabel media kind used in the printed label, e.g. {@code "Image"}
+         * @param source the source taken from the media block
+         */
+        private static void logMediaSource(String mediaLabel, Source source) {
+            if (source instanceof URLSource urlSource) {
+                System.out.println(RESULT_PREFIX + mediaLabel + " URL: " + urlSource.getUrl());
+            } else if (source instanceof Base64Source base64Source) {
+                System.out.println(
+                        RESULT_PREFIX + mediaLabel + " Base64 data: " + base64Source.getData());
+            }
+        }
+    }
+}
diff --git a/docs/en/task/tool.md b/docs/en/task/tool.md
index 213914490..70e2c2848 100644
--- a/docs/en/task/tool.md
+++ b/docs/en/task/tool.md
@@ -370,3 +370,9 @@ boolean isExternal = toolkit.isExternalTool("query_database");  // true
 ```
 
 The call flow is the same as Tool Suspend: LLM calls → returns `TOOL_SUSPENDED` → external execution → provide result to resume.
+
+## Complete Examples
+
+- **Tool Call Example**: [ToolCallingExample.java](https://github.com/agentscope-ai/agentscope-java/blob/main/agentscope-examples/quickstart/src/main/java/io/agentscope/examples/quickstart/ToolCallingExample.java)
+- **Tool Group Example**: [ToolGroupExample.java](https://github.com/agentscope-ai/agentscope-java/blob/main/agentscope-examples/quickstart/src/main/java/io/agentscope/examples/quickstart/ToolGroupExample.java)
+- **MultiModal Tool Example**: [MultiModalToolExample.java](https://github.com/agentscope-ai/agentscope-java/blob/main/agentscope-examples/quickstart/src/main/java/io/agentscope/examples/quickstart/MultiModalToolExample.java)
\ No newline at end of file
diff --git a/docs/zh/task/tool.md b/docs/zh/task/tool.md
index 828abd370..fa3e57c5a 100644
--- a/docs/zh/task/tool.md
+++ b/docs/zh/task/tool.md
@@ -370,3 +370,9 @@ boolean isExternal = toolkit.isExternalTool("query_database");  // true
 ```
 
 调用流程与工具挂起相同：LLM 调用 → 返回 `TOOL_SUSPENDED` → 外部执行 → 提供结果恢复。
+
+## 完整示例
+
+- **工具调用示例**: [ToolCallingExample.java](https://github.com/agentscope-ai/agentscope-java/blob/main/agentscope-examples/quickstart/src/main/java/io/agentscope/examples/quickstart/ToolCallingExample.java)
+- **工具组示例**: [ToolGroupExample.java](https://github.com/agentscope-ai/agentscope-java/blob/main/agentscope-examples/quickstart/src/main/java/io/agentscope/examples/quickstart/ToolGroupExample.java)
+- **多模态工具示例**: [MultiModalToolExample.java](https://github.com/agentscope-ai/agentscope-java/blob/main/agentscope-examples/quickstart/src/main/java/io/agentscope/examples/quickstart/MultiModalToolExample.java)
\ No newline at end of file