diff --git a/agentscope-core/src/main/java/io/agentscope/core/tool/multimodal/DashScopeMultiModalTool.java b/agentscope-core/src/main/java/io/agentscope/core/tool/multimodal/DashScopeMultiModalTool.java index 45baf84e9..7c3b7b06f 100644 --- a/agentscope-core/src/main/java/io/agentscope/core/tool/multimodal/DashScopeMultiModalTool.java +++ b/agentscope-core/src/main/java/io/agentscope/core/tool/multimodal/DashScopeMultiModalTool.java @@ -23,6 +23,10 @@ import com.alibaba.dashscope.aigc.multimodalconversation.MultiModalConversationOutput; import com.alibaba.dashscope.aigc.multimodalconversation.MultiModalConversationParam; import com.alibaba.dashscope.aigc.multimodalconversation.MultiModalConversationResult; +import com.alibaba.dashscope.aigc.videosynthesis.VideoSynthesis; +import com.alibaba.dashscope.aigc.videosynthesis.VideoSynthesisOutput; +import com.alibaba.dashscope.aigc.videosynthesis.VideoSynthesisParam; +import com.alibaba.dashscope.aigc.videosynthesis.VideoSynthesisResult; import com.alibaba.dashscope.audio.asr.recognition.Recognition; import com.alibaba.dashscope.audio.asr.recognition.RecognitionParam; import com.alibaba.dashscope.audio.asr.recognition.RecognitionResult; @@ -41,6 +45,7 @@ import io.agentscope.core.message.TextBlock; import io.agentscope.core.message.ToolResultBlock; import io.agentscope.core.message.URLSource; +import io.agentscope.core.message.VideoBlock; import io.agentscope.core.tool.Tool; import io.agentscope.core.tool.ToolParam; import java.io.IOException; @@ -51,6 +56,7 @@ import java.nio.file.Paths; import java.util.ArrayList; import java.util.Base64; +import java.util.HashMap; import java.util.List; import java.util.Map; import java.util.Optional; @@ -227,7 +233,12 @@ public Mono dashscopeTextToImage( public Mono dashscopeImageToText( @ToolParam( name = "image_urls", - description = "The URL(s) of image(s) to be converted into text.") + description = + "The URL(s), local file path(s) or Base64 data URL(s)(the" + + " format pattern 
is" + + " data:[MIME_type];base64,{base64_image}, e.g.," + + " 'data:image/png;base64,iVBORw0KGgoAAAANSUhEUgAABDg...')" + + " of image(s) to be converted into text.") List imageUrls, @ToolParam(name = "prompt", description = "The text prompt.", required = false) String prompt, @@ -716,14 +727,14 @@ public void onError(Exception e) { /** * Send audio input stream by chunk to DashScope. - * marked as public for unit test. + * marked as package-private for unit test. * * @param audioUrl The file path or URL of audio. * @param recognizer DashScope Recognition instance * @throws IOException if read failed * @throws InterruptedException if interrupted */ - public void sendAudioChunk(String audioUrl, Recognition recognizer) + void sendAudioChunk(String audioUrl, Recognition recognizer) throws IOException, InterruptedException { // chunk size set to 1 seconds for 16KHz sample rate byte[] buffer = new byte[3200]; @@ -752,4 +763,658 @@ public void sendAudioChunk(String audioUrl, Recognition recognizer) } } } + + /** + * Generate video based on the given prompt. + * + * @param prompt The text prompt to generate video. + * @param model The model to use, e.g., 'wan2.6-t2v', 'wan2.5-t2v-preview', etc. + * @param negativePrompt The negative prompt to avoid certain elements. + * @param audioUrl The URL for background audio. + * @param size Size of the video, e.g., '1920*1080', '1280*720', etc. + * @param duration Duration of the video in seconds, e.g., '5', '10', etc. + * @param shotType Specify the shot type that generates the video. + * single: default value, output single shot video; multi: output multi-lens video. + * @param promptExtend Whether to extend the prompt automatically (default true) + * @param watermark Whether to include watermark (default false) + * @param seed The seed for reproducibility + * @return A ToolResultBlock containing the generated video url or error message. 
+ */ + @Tool( + name = "dashscope_text_to_video", + description = "Generate video based on the given text prompt") + public Mono dashscopeTextToVideo( + @ToolParam(name = "prompt", description = "The text prompt to generate video") + String prompt, + @ToolParam( + name = "model", + description = + "The model to use, e.g., 'wan2.6-t2v', 'wan2.5-t2v-preview'," + + " etc", + required = false) + String model, + @ToolParam( + name = "negative_prompt", + description = "The negative prompt to avoid certain elements", + required = false) + String negativePrompt, + @ToolParam( + name = "audio_url", + description = "The URL for background audio", + required = false) + String audioUrl, + @ToolParam( + name = "size", + description = "Size of the video, e.g., '1920*1080', '1280*720', etc", + required = false) + String size, + @ToolParam( + name = "duration", + description = "Duration of the video in seconds, e.g., '5', '10', etc", + required = false) + Integer duration, + @ToolParam( + name = "shot_type", + description = + "Specify the shot type that generates the video. 
single:" + + " default value, output single shot video; multi: output" + + " multi-lens video", + required = false) + String shotType, + @ToolParam( + name = "prompt_extend", + description = + "Whether to automatically extend the prompt (default true)", + required = false) + Boolean promptExtend, + @ToolParam( + name = "watermark", + description = "Whether to include watermark (default false)", + required = false) + Boolean watermark, + @ToolParam( + name = "seed", + description = "The seed for reproducibility", + required = false) + Integer seed) { + + String finalModel = + Optional.ofNullable(model).filter(s -> !s.trim().isEmpty()).orElse("wan2.6-t2v"); + String finalSize = + Optional.ofNullable(size).filter(s -> !s.trim().isEmpty()).orElse("1920*1080"); + String finalShotType = Optional.ofNullable(shotType).orElse("single"); + boolean finalPromptExtend = Optional.ofNullable(promptExtend).orElse(true); + boolean finalWatermark = Optional.ofNullable(watermark).orElse(false); + + log.debug( + "dashscope_text_to_video called: prompt='{}', model='{}', negativePrompt='{}'," + + " audioUrl='{}', size='{}', duration={}, shotType='{}', promptExtend='{}'," + + " watermark='{}', seed={}", + prompt, + finalModel, + negativePrompt, + audioUrl, + finalSize, + duration, + finalShotType, + finalPromptExtend, + finalWatermark, + seed); + + return Mono.fromCallable( + () -> { + Map parameters = new HashMap<>(); + parameters.put("size", finalSize); + parameters.put("shot_type", finalShotType); + parameters.put("prompt_extend", finalPromptExtend); + parameters.put("watermark", finalWatermark); + if (duration != null) { + parameters.put("duration", duration); + } + if (seed != null) { + parameters.put("seed", seed); + } + + VideoSynthesisParam param = + VideoSynthesisParam.builder() + .apiKey(this.apiKey) + .model(finalModel) + .prompt(prompt) + .negativePrompt(negativePrompt) + .audioUrl(audioUrl) + .parameters(parameters) + .header("user-agent", Version.getUserAgent()) + .build(); 
+ + VideoSynthesis videoSynthesis = new VideoSynthesis(); + + // The video call method blocks until the video is generated or fails + log.info( + "Starting text to video generation task, please wait for a" + + " while..."); + VideoSynthesisResult response = videoSynthesis.call(param); + + // Extract video URL from response + String videoUrl = + Optional.ofNullable(response) + .map(VideoSynthesisResult::getOutput) + .map(VideoSynthesisOutput::getVideoUrl) + .orElse(null); + + if (videoUrl == null || videoUrl.trim().isEmpty()) { + log.error("No video url returned. Response: {}", response); + return ToolResultBlock.error("Failed to generate video."); + } + + log.info("Text to video generated successfully, videoUrl:{}", videoUrl); + + VideoBlock vb = + VideoBlock.builder() + .source(URLSource.builder().url(videoUrl).build()) + .build(); + + return ToolResultBlock.of(vb); + }) + .onErrorResume( + e -> { + log.error("Failed to generate video '{}'", e.getMessage(), e); + return Mono.just(ToolResultBlock.error(e.getMessage())); + }); + } + + /** + * Generate a video based on a single input image (first frame) and an optional text prompt. + * + * @param prompt The text prompt describing the video content and motion. + * @param model The model to use, e.g., 'wan2.6-i2v-flash', 'wan2.1-i2v-turbo'. + * @param imageUrl The URL, local file path, or Base64 data of the input image (first frame). + * @param audioUrl URL of the audio file that the model will use to generate video. + * @param negativePrompt The negative prompt to avoid certain elements. + * @param template Name of video effect template, e.g., 'squish', 'rotation', etc + * @param resolution Resolution of the video, e.g., '720P', '1080P'. + * @param duration Duration of the video in seconds (e.g., 5, 10, 15). Default depends on model. + * @param shotType Specify the shot type that generates the video. + * single: default value, output single shot video; multi: output multi-lens video. 
+ * @param audio Whether to generate audio video (default true). + * @param promptExtend Whether to automatically extend the prompt (default true). + * @param watermark Whether to include watermark (default false). + * @param seed Optional seed for reproducibility. + * @return A ToolResultBlock containing the generated video url or error message. + */ + @Tool( + name = "dashscope_image_to_video", + description = + "Generate a video from a single input image and an optional text prompt." + + " Supports optional audio guidance and duration control.") + public Mono dashscopeImageToVideo( + @ToolParam( + name = "prompt", + description = "Text prompt describing the video content and motion", + required = false) + String prompt, + @ToolParam( + name = "model", + description = + "Model to use, e.g., 'wan2.6-i2v-flash', 'wan2.6-i2v', etc", + required = false) + String model, + @ToolParam( + name = "image_url", + description = + "URL, local file path or Base64 data URL(the format pattern is" + + " data:[MIME_type];base64,{base64_image}, e.g.," + + " 'data:image/png;base64,iVBORw0KGgoAAAANSUhEUgAABDg...')" + + " of the first frame image") + String imageUrl, + @ToolParam( + name = "audio_url", + description = + "URL of the audio file that the model will use to generate" + + " video", + required = false) + String audioUrl, + @ToolParam( + name = "negative_prompt", + description = "The negative prompt to avoid certain elements", + required = false) + String negativePrompt, + @ToolParam( + name = "template", + description = + "Name of video effect template, e.g., 'squish', 'rotation'," + + " etc", + required = false) + String template, + @ToolParam( + name = "resolution", + description = "Video resolution, e.g., '720P', '1080P'", + required = false) + String resolution, + @ToolParam( + name = "duration", + description = "Duration of the video in seconds, e.g., 5, 10, 15", + required = false) + Integer duration, + @ToolParam( + name = "shot_type", + description = + "Specify the
shot type that generates the video. single:" + + " default value, output single shot video; multi: output" + + " multi-lens video", + required = false) + String shotType, + @ToolParam( + name = "audio", + description = "Whether to generate audio video (default true)", + required = false) + Boolean audio, + @ToolParam( + name = "prompt_extend", + description = + "Whether to automatically extend the prompt (default true)", + required = false) + Boolean promptExtend, + @ToolParam( + name = "watermark", + description = "Whether to include watermark (default false)", + required = false) + Boolean watermark, + @ToolParam( + name = "seed", + description = "The seed for reproducibility", + required = false) + Integer seed) { + + String finalModel = + Optional.ofNullable(model) + .filter(s -> !s.trim().isEmpty()) + .orElse("wan2.6-i2v-flash"); + String finalResolution = + Optional.ofNullable(resolution).filter(s -> !s.trim().isEmpty()).orElse("720P"); + String finalShotType = Optional.ofNullable(shotType).orElse("single"); + boolean finalAudio = Optional.ofNullable(audio).orElse(true); + boolean finalPromptExtend = Optional.ofNullable(promptExtend).orElse(true); + boolean finalWatermark = Optional.ofNullable(watermark).orElse(false); + + log.debug( + "dashscope_image_to_video called: prompt='{}', model='{}', imageUrl='{}'," + + " audioUrl='{}', negativePrompt='{}', template='{}', resolution='{}'," + + " duration={}, shotType='{}', audio={}, promptExtend={}, watermark={}," + + " seed={}", + prompt, + finalModel, + imageUrl, + audioUrl, + negativePrompt, + template, + finalResolution, + duration, + finalShotType, + finalAudio, + finalPromptExtend, + finalWatermark, + seed); + + return Mono.fromCallable( + () -> { + Map parameters = new HashMap<>(); + parameters.put("resolution", finalResolution); + parameters.put("shot_type", finalShotType); + parameters.put("audio", finalAudio); + parameters.put("prompt_extend", finalPromptExtend); + parameters.put("watermark", 
finalWatermark); + if (duration != null) { + parameters.put("duration", duration); + } + if (seed != null) { + parameters.put("seed", seed); + } + + VideoSynthesisParam param = + VideoSynthesisParam.builder() + .apiKey(this.apiKey) + .prompt(prompt) + .model(finalModel) + .imgUrl(MediaUtils.urlToProtocolUrl(imageUrl)) + .audioUrl(audioUrl) + .negativePrompt(negativePrompt) + .template(template) + .parameters(parameters) + .header("user-agent", Version.getUserAgent()) + .build(); + + VideoSynthesis videoSynthesis = new VideoSynthesis(); + + log.info( + "Starting image to video generation task, please wait for a" + + " while..."); + VideoSynthesisResult response = videoSynthesis.call(param); + + // Extract video URL from response + String videoUrl = + Optional.ofNullable(response) + .map(VideoSynthesisResult::getOutput) + .map(VideoSynthesisOutput::getVideoUrl) + .orElse(null); + + if (videoUrl == null || videoUrl.trim().isEmpty()) { + log.error("Failed to generate video. No video url returned."); + return ToolResultBlock.error("Failed to generate video."); + } + + log.info( + "Image to video generated successfully, videoUrl:{}", videoUrl); + + VideoBlock vb = + VideoBlock.builder() + .source(URLSource.builder().url(videoUrl).build()) + .build(); + + return ToolResultBlock.of(vb); + }) + .onErrorResume( + e -> { + log.error("Failed to generate video: '{}'", e.getMessage(), e); + return Mono.just(ToolResultBlock.error(e.getMessage())); + }); + } + + /** + * Generate video transitioning from a first frame to a last frame and an optional text prompt. + * + * @param prompt The text prompt describing the video content and camera movement. + * @param model The model to use, e.g., 'wan2.2-kf2v-flash', 'wanx2.1-kf2v-plus'. + * @param firstFrameUrl The URL or Base64 data of the first frame image. + * @param lastFrameUrl The URL or Base64 data of the last frame image. + * @param negativePrompt The negative prompt to avoid certain elements. 
+ * @param template Name of video effect template, e.g., 'hanfu-1', 'solaron', etc. + * @param resolution Resolution of the video, e.g., '480P', '720P', '1080P'. + * @param promptExtend Whether to automatically extend the prompt (default true). + * @param watermark Whether to include watermark (default false). + * @param seed Optional seed for reproducibility. + * @return A ToolResultBlock containing the generated video url or error message. + */ + @Tool( + name = "dashscope_first_and_last_frame_image_to_video", + description = + "Generate video transitioning from a first frame to a last frame and an" + + " optional text prompt") + public Mono dashscopeFirstAndLastFrameImageToVideo( + @ToolParam( + name = "prompt", + description = "Text prompt describing the video content and motion", + required = false) + String prompt, + @ToolParam( + name = "model", + description = + "Model to use, e.g., 'wan2.2-kf2v-flash', 'wanx2.1-kf2v-plus'", + required = false) + String model, + @ToolParam( + name = "first_frame_url", + description = + "URL, local file path or Base64 data URL(the format pattern is" + + " data:[MIME_type];base64,{base64_image}, e.g.," + + " 'data:image/png;base64,iVBORw0KGgoAAAANSUhEUgAABDg...')" + + " of the first frame image") + String firstFrameUrl, + @ToolParam( + name = "last_frame_url", + description = + "URL, local file path or Base64 data URL(the format pattern is" + + " data:[MIME_type];base64,{base64_image}, e.g.," + + " 'data:image/png;base64,iVBORw0KGgoAAAANSUhEUgAABDg...')" + + " of the last frame image", + required = false) + String lastFrameUrl, + @ToolParam( + name = "negative_prompt", + description = "The negative prompt to avoid certain elements", + required = false) + String negativePrompt, + @ToolParam( + name = "template", + description = + "Name of video effect template, e.g., 'hanfu-1', 'solaron'," + + " etc", + required = false) + String template, + @ToolParam( + name = "resolution", + description = "Video resolution, e.g., '720P', 
'1080P'", + required = false) + String resolution, + @ToolParam( + name = "prompt_extend", + description = + "Whether to automatically extend the prompt (default true)", + required = false) + Boolean promptExtend, + @ToolParam( + name = "watermark", + description = "Whether to include watermark (default false)", + required = false) + Boolean watermark, + @ToolParam( + name = "seed", + description = "The seed for reproducibility", + required = false) + Integer seed) { + + String finalModel = + Optional.ofNullable(model) + .filter(s -> !s.trim().isEmpty()) + .orElse("wan2.2-kf2v-flash"); + String finalResolution = + Optional.ofNullable(resolution).filter(s -> !s.trim().isEmpty()).orElse("720P"); + boolean finalPromptExtend = Optional.ofNullable(promptExtend).orElse(true); + boolean finalWatermark = Optional.ofNullable(watermark).orElse(false); + + log.debug( + "dashscope_first_and_last_frame_image_to_video called: prompt='{}', model='{}', firstFrameUrl='{}'," + + " lastFrameUrl='{}', negativePrompt='{}', template='{}', resolution='{}'," + + " promptExtend={}, watermark={}, seed={}", + prompt, + finalModel, + firstFrameUrl, + lastFrameUrl, + negativePrompt, + template, + finalResolution, + finalPromptExtend, + finalWatermark, + seed); + + return Mono.fromCallable( + () -> { + Map parameters = new HashMap<>(); + parameters.put("resolution", finalResolution); + parameters.put("prompt_extend", finalPromptExtend); + parameters.put("watermark", finalWatermark); + if (seed != null) { + parameters.put("seed", seed); + } + + VideoSynthesisParam param = + VideoSynthesisParam.builder() + .apiKey(this.apiKey) + .model(finalModel) + .prompt(prompt) + .firstFrameUrl( + MediaUtils.urlToProtocolUrl(firstFrameUrl)) + .lastFrameUrl(MediaUtils.urlToProtocolUrl(lastFrameUrl)) + .negativePrompt(negativePrompt) + .template(template) + .parameters(parameters) + .header("user-agent", Version.getUserAgent()) + .build(); + + VideoSynthesis videoSynthesis = new VideoSynthesis(); + + log.info( + "Starting first and
last frame image to video generation task," + + " please wait for a while..."); + // The video call method blocks until the video is generated or fails + VideoSynthesisResult response = videoSynthesis.call(param); + + // Extract video URL + String videoUrl = + Optional.ofNullable(response) + .map(VideoSynthesisResult::getOutput) + .map(VideoSynthesisOutput::getVideoUrl) + .orElse(null); + + if (videoUrl == null || videoUrl.trim().isEmpty()) { + log.error("Failed to generate video. No URL returned."); + return ToolResultBlock.error("Failed to generate video."); + } + + log.info( + "First and last frame image to video generated" + + " successfully, videoUrl:{}", + videoUrl); + + VideoBlock vb = + VideoBlock.builder() + .source(URLSource.builder().url(videoUrl).build()) + .build(); + + return ToolResultBlock.of(vb); + }) + .onErrorResume( + e -> { + log.error("Failed to generate key-frame video '{}'", e.getMessage(), e); + return Mono.just(ToolResultBlock.error(e.getMessage())); + }); + } + + /** + * Analyze video and generate a text description or answer questions based on the video content. + * + * @param videoUrl The URL, local file path, or Base64 data URL of the video to analyze. + * @param prompt The text prompt or question regarding the video content. + * @param model The vision model to use, e.g., 'qwen3.5-plus', 'qwen3.5-flash', 'qwen3-vl-plus', 'qwen3-vl-flash', etc. + * @param fps Frames per second to sample from the video (e.g., 1, 2, 4). Default is 2. + * @return A ToolResultBlock containing the generated text analysis or error message.
+ */ + @Tool( + name = "dashscope_video_to_text", + description = + "Analyze video and generate a text description or answer questions based on the" + + " video content. Supports controlling the frame sampling rate (fps).") + public Mono dashscopeVideoToText( + @ToolParam( + name = "video_url", + description = + "The URL, local file path or base64 data URL(the format pattern" + + " is data:[MIME_type];base64,{base64_video}, e.g.," + + " 'data:video/mp4;base64,/9j/4AAQSkZJRgABAQAAAQABAAD/2wBDAA...')" + + " of the video to analyze.") + String videoUrl, + @ToolParam( + name = "prompt", + description = "The question or instruction regarding the video content", + required = false) + String prompt, + @ToolParam( + name = "model", + description = + "The vision model to use, e.g., 'qwen3.5-plus'," + + " 'qwen3.5-flash', 'qwen3-vl-plus', 'qwen3-vl-flash'," + + " etc", + required = false) + String model, + @ToolParam( + name = "fps", + description = + "Frames per second to sample from the video for analysis" + + " (default 2.0)", + required = false) + Double fps) { + + String finalModel = + Optional.ofNullable(model).filter(s -> !s.trim().isEmpty()).orElse("qwen3.5-plus"); + String finalPrompt = + Optional.ofNullable(prompt) + .filter(s -> !s.trim().isEmpty()) + .orElse("Describe the video"); + double finalFps = Optional.ofNullable(fps).orElse(2.0); + + log.debug( + "dashscope_video_to_text called: videoUrl:'{}', prompt='{}', model='{}', fps={}", + videoUrl, + prompt, + finalModel, + finalFps); + + return Mono.fromCallable( + () -> { + Map videoParams = new HashMap<>(); + videoParams.put("video", MediaUtils.urlToProtocolUrl(videoUrl)); + videoParams.put("fps", finalFps); + List> content = new ArrayList<>(); + content.add(Map.of("text", finalPrompt)); + content.add(videoParams); + MultiModalMessage userMessage = + MultiModalMessage.builder() + .role(Role.USER.getValue()) + .content(content) + .build(); + + MultiModalMessage systemMessage = + MultiModalMessage.builder() +
.role(Role.SYSTEM.getValue()) + .content( + List.of( + Map.of( + "text", + "You are a helpful" + + " assistant."))) + .build(); + + List multiModalMessages = new ArrayList<>(); + multiModalMessages.add(systemMessage); + multiModalMessages.add(userMessage); + + MultiModalConversationParam param = + MultiModalConversationParam.builder() + .apiKey(this.apiKey) + .model(finalModel) + .messages(multiModalMessages) + .header("user-agent", Version.getUserAgent()) + .build(); + + MultiModalConversation conv = new MultiModalConversation(); + MultiModalConversationResult result = conv.call(param); + + // Extract text from the result + String text = + Optional.ofNullable(result) + .map(MultiModalConversationResult::getOutput) + .map(MultiModalConversationOutput::getChoices) + .flatMap(choices -> choices.stream().findFirst()) + .map(MultiModalConversationOutput.Choice::getMessage) + .map(MultiModalMessage::getContent) + .flatMap(contents -> contents.stream().findFirst()) + .map(contentMap -> contentMap.get("text")) + .map(Object::toString) + .orElse(null); + + if (text == null || text.trim().isEmpty()) { + log.error("Failed to analyze video. 
No text response returned."); + return ToolResultBlock.error("Failed to analyze video."); + } + + log.info("Video analysis completed successfully."); + + TextBlock tb = TextBlock.builder().text(text).build(); + return ToolResultBlock.of(tb); + }) + .onErrorResume( + e -> { + log.error("Failed to analyze video '{}'", e.getMessage(), e); + return Mono.just(ToolResultBlock.error(e.getMessage())); + }); + } } diff --git a/agentscope-core/src/test/java/io/agentscope/core/model/ChatModelNonStreamingBlockingBehaviorTest.java b/agentscope-core/src/test/java/io/agentscope/core/model/ChatModelNonStreamingBlockingBehaviorTest.java index 51f76ce03..ed862375b 100644 --- a/agentscope-core/src/test/java/io/agentscope/core/model/ChatModelNonStreamingBlockingBehaviorTest.java +++ b/agentscope-core/src/test/java/io/agentscope/core/model/ChatModelNonStreamingBlockingBehaviorTest.java @@ -152,7 +152,7 @@ void testOpenAIChatModelNonBlocking() throws Exception { }, error -> latch.countDown()); - latch.await(3, TimeUnit.SECONDS); + latch.await(5, TimeUnit.SECONDS); assertNotNull(streamThreadName.get()); assertNotEquals( currentThreadName, diff --git a/agentscope-core/src/test/java/io/agentscope/core/tool/multimodal/DashScopeMultiModalToolE2ETest.java b/agentscope-core/src/test/java/io/agentscope/core/tool/multimodal/DashScopeMultiModalToolE2ETest.java index 164f19be8..9b6bcb26d 100644 --- a/agentscope-core/src/test/java/io/agentscope/core/tool/multimodal/DashScopeMultiModalToolE2ETest.java +++ b/agentscope-core/src/test/java/io/agentscope/core/tool/multimodal/DashScopeMultiModalToolE2ETest.java @@ -16,16 +16,20 @@ package io.agentscope.core.tool.multimodal; import static org.junit.jupiter.api.Assertions.assertEquals; +import static org.junit.jupiter.api.Assertions.assertInstanceOf; import static org.junit.jupiter.api.Assertions.assertNotNull; -import static org.junit.jupiter.api.Assertions.assertTrue; import io.agentscope.core.e2e.E2ETestCondition; +import 
io.agentscope.core.formatter.MediaUtils; import io.agentscope.core.message.AudioBlock; import io.agentscope.core.message.Base64Source; import io.agentscope.core.message.ImageBlock; import io.agentscope.core.message.TextBlock; import io.agentscope.core.message.ToolResultBlock; import io.agentscope.core.message.URLSource; +import io.agentscope.core.message.VideoBlock; +import java.io.IOException; +import java.nio.file.Paths; import java.util.List; import org.junit.jupiter.api.BeforeEach; import org.junit.jupiter.api.DisplayName; @@ -50,13 +54,28 @@ class DashScopeMultiModalToolE2ETest { private static final String TEXT_TO_IMAGE_PROMPT = "A small dog."; private static final String IMAGE_TO_TEXT_PROMPT = "Describe the image."; + private static final String TEXT_TO_VIDEO_PROMPT = "A small dog is running in moonlight."; + private static final String IMAGE_TO_VIDEO_PROMPT = "A tiger is running in moonlight."; + private static final String FIRST_AND_LAST_FRAME_IMAGE_TO_VIDEO_PROMPT = + "A black kitten looks curiously into the sky."; + private static final String VIDEO_TO_TEXT_PROMPT = "Describe the video."; private static final String TEST_IMAGE_URL = "https://dashscope.oss-cn-beijing.aliyuncs.com/images/tiger.png"; - private static final String TEST_IMAGE_PATH = "src/test/resources/dog.png"; + private static final String TEST_IMAGE_PATH = + Paths.get("src", "test", "resources", "dog.png").toString(); private static final String TEST_AUDIO_URL = "https://dashscope.oss-cn-beijing.aliyuncs.com/samples/audio/paraformer/hello_world_male2.wav"; private static final String TEST_AUDIO_PATH = - "src/test/resources/hello_world_male_16k_16bit_mono.wav"; + Paths.get("src", "test", "resources", "hello_world_male_16k_16bit_mono.wav").toString(); + private static final String TEST_VIDEO_URL = + "https://help-static-aliyun-doc.aliyuncs.com/file-manage-files/zh-CN/20241115/cqqkru/1.mp4"; + private static final String TEST_VIDEO_PATH = + Paths.get("src", "test", "resources", 
"test_video.mp4").toString(); + private static final String TEST_FIRST_FRAME_URL = + "https://wanx.alicdn.com/material/20250318/first_frame.png"; + private static final String TEST_LAST_FRAME_URL = + "https://wanx.alicdn.com/material/20250318/last_frame.png"; + private DashScopeMultiModalTool multiModalTool; @BeforeEach @@ -76,9 +95,9 @@ void testTextToImageUrlMode() { toolResultBlock -> { assertNotNull(toolResultBlock); assertEquals(1, toolResultBlock.getOutput().size()); - assertTrue(toolResultBlock.getOutput().get(0) instanceof ImageBlock); + assertInstanceOf(ImageBlock.class, toolResultBlock.getOutput().get(0)); ImageBlock imageBlock = (ImageBlock) toolResultBlock.getOutput().get(0); - assertTrue(imageBlock.getSource() instanceof URLSource); + assertInstanceOf(URLSource.class, imageBlock.getSource()); assertNotNull(((URLSource) imageBlock.getSource()).getUrl()); }) .verifyComplete(); @@ -96,9 +115,9 @@ void testTextToImageBase64Mode() { toolResultBlock -> { assertNotNull(toolResultBlock); assertEquals(1, toolResultBlock.getOutput().size()); - assertTrue(toolResultBlock.getOutput().get(0) instanceof ImageBlock); + assertInstanceOf(ImageBlock.class, toolResultBlock.getOutput().get(0)); ImageBlock imageBlock = (ImageBlock) toolResultBlock.getOutput().get(0); - assertTrue(imageBlock.getSource() instanceof Base64Source); + assertInstanceOf(Base64Source.class, imageBlock.getSource()); assertNotNull(((Base64Source) imageBlock.getSource()).getMediaType()); assertNotNull(((Base64Source) imageBlock.getSource()).getData()); }) @@ -117,15 +136,15 @@ void testTextToImageResponseMultiUrls() { toolResultBlock -> { assertNotNull(toolResultBlock); assertEquals(2, toolResultBlock.getOutput().size()); - assertTrue(toolResultBlock.getOutput().get(0) instanceof ImageBlock); - assertTrue(toolResultBlock.getOutput().get(1) instanceof ImageBlock); + assertInstanceOf(ImageBlock.class, toolResultBlock.getOutput().get(0)); + assertInstanceOf(ImageBlock.class, 
toolResultBlock.getOutput().get(1)); ImageBlock image0Block = (ImageBlock) toolResultBlock.getOutput().get(0); - assertTrue(image0Block.getSource() instanceof URLSource); + assertInstanceOf(URLSource.class, image0Block.getSource()); assertNotNull(((URLSource) image0Block.getSource()).getUrl()); ImageBlock image1Block = (ImageBlock) toolResultBlock.getOutput().get(1); - assertTrue(image1Block.getSource() instanceof URLSource); + assertInstanceOf(URLSource.class, image1Block.getSource()); assertNotNull(((URLSource) image1Block.getSource()).getUrl()); }) .verifyComplete(); @@ -143,7 +162,7 @@ void testImageToTextWithUrl() { toolResultBlock -> { assertNotNull(toolResultBlock); assertEquals(1, toolResultBlock.getOutput().size()); - assertTrue(toolResultBlock.getOutput().get(0) instanceof TextBlock); + assertInstanceOf(TextBlock.class, toolResultBlock.getOutput().get(0)); assertNotNull( ((TextBlock) toolResultBlock.getOutput().get(0)).getText()); }) @@ -162,7 +181,28 @@ void testImageToTextWithFile() { toolResultBlock -> { assertNotNull(toolResultBlock); assertEquals(1, toolResultBlock.getOutput().size()); - assertTrue(toolResultBlock.getOutput().get(0) instanceof TextBlock); + assertInstanceOf(TextBlock.class, toolResultBlock.getOutput().get(0)); + assertNotNull( + ((TextBlock) toolResultBlock.getOutput().get(0)).getText()); + }) + .verifyComplete(); + } + + @Test + @DisplayName("Image to text with base64 data url") + void testImageToTextWithBase64DataUrl() throws IOException { + Mono result = + multiModalTool.dashscopeImageToText( + List.of(MediaUtils.urlToBase64DataUrl(TEST_IMAGE_URL)), + IMAGE_TO_TEXT_PROMPT, + "qwen3-vl-plus"); + + StepVerifier.create(result) + .assertNext( + toolResultBlock -> { + assertNotNull(toolResultBlock); + assertEquals(1, toolResultBlock.getOutput().size()); + assertInstanceOf(TextBlock.class, toolResultBlock.getOutput().get(0)); assertNotNull( ((TextBlock) toolResultBlock.getOutput().get(0)).getText()); }) @@ -183,7 +223,7 @@ void 
testImageToTextWithUrlAndFile() { toolResultBlock -> { assertNotNull(toolResultBlock); assertEquals(1, toolResultBlock.getOutput().size()); - assertTrue(toolResultBlock.getOutput().get(0) instanceof TextBlock); + assertInstanceOf(TextBlock.class, toolResultBlock.getOutput().get(0)); assertNotNull( ((TextBlock) toolResultBlock.getOutput().get(0)).getText()); }) @@ -202,9 +242,9 @@ void testTextToAudio() { toolResultBlock -> { assertNotNull(toolResultBlock); assertEquals(1, toolResultBlock.getOutput().size()); - assertTrue(toolResultBlock.getOutput().get(0) instanceof AudioBlock); + assertInstanceOf(AudioBlock.class, toolResultBlock.getOutput().get(0)); AudioBlock audioBlock = (AudioBlock) toolResultBlock.getOutput().get(0); - assertTrue(audioBlock.getSource() instanceof Base64Source); + assertInstanceOf(Base64Source.class, audioBlock.getSource()); assertNotNull(((Base64Source) audioBlock.getSource()).getData()); }) .verifyComplete(); @@ -222,7 +262,7 @@ void testAudioToTextWithUrl() { toolResultBlock -> { assertNotNull(toolResultBlock); assertEquals(1, toolResultBlock.getOutput().size()); - assertTrue(toolResultBlock.getOutput().get(0) instanceof TextBlock); + assertInstanceOf(TextBlock.class, toolResultBlock.getOutput().get(0)); assertNotNull( ((TextBlock) toolResultBlock.getOutput().get(0)).getText()); }) @@ -241,7 +281,279 @@ void testAudioToTextWithFile() { toolResultBlock -> { assertNotNull(toolResultBlock); assertEquals(1, toolResultBlock.getOutput().size()); - assertTrue(toolResultBlock.getOutput().get(0) instanceof TextBlock); + assertInstanceOf(TextBlock.class, toolResultBlock.getOutput().get(0)); + assertNotNull( + ((TextBlock) toolResultBlock.getOutput().get(0)).getText()); + }) + .verifyComplete(); + } + + @Test + @DisplayName("Text to video response url") + void testTextToVideo() { + Mono result = + multiModalTool.dashscopeTextToVideo( + TEXT_TO_VIDEO_PROMPT, + "wan2.6-t2v", + "low quality", + TEST_AUDIO_URL, + "1920*1080", + 5, + "single", + true, + 
false, + 0); + + StepVerifier.create(result) + .assertNext( + toolResultBlock -> { + assertNotNull(toolResultBlock); + assertEquals(1, toolResultBlock.getOutput().size()); + assertInstanceOf(VideoBlock.class, toolResultBlock.getOutput().get(0)); + VideoBlock vb = (VideoBlock) toolResultBlock.getOutput().get(0); + assertInstanceOf(URLSource.class, vb.getSource()); + assertNotNull(((URLSource) vb.getSource()).getUrl()); + }) + .verifyComplete(); + } + + @Test + @DisplayName("Image to video with image url") + void testImageToVideoWithUrl() { + Mono result = + multiModalTool.dashscopeImageToVideo( + IMAGE_TO_VIDEO_PROMPT, + "wan2.6-i2v-flash", + TEST_IMAGE_URL, + TEST_AUDIO_URL, + "low quality", + "hanfu-1", + "480P", + 10, + "single", + true, + true, + false, + 0); + + StepVerifier.create(result) + .assertNext( + toolResultBlock -> { + assertNotNull(toolResultBlock); + assertEquals(1, toolResultBlock.getOutput().size()); + assertInstanceOf(VideoBlock.class, toolResultBlock.getOutput().get(0)); + VideoBlock vb = (VideoBlock) toolResultBlock.getOutput().get(0); + assertInstanceOf(URLSource.class, vb.getSource()); + assertNotNull(((URLSource) vb.getSource()).getUrl()); + }) + .verifyComplete(); + } + + @Test + @DisplayName("Image to video with local image file") + void testImageToVideoWithFile() { + Mono result = + multiModalTool.dashscopeImageToVideo( + TEXT_TO_VIDEO_PROMPT, + "wan2.6-i2v-flash", + TEST_IMAGE_PATH, + TEST_AUDIO_URL, + "low quality", + "hanfu-1", + "480P", + 10, + "single", + true, + true, + false, + 0); + + StepVerifier.create(result) + .assertNext( + toolResultBlock -> { + assertNotNull(toolResultBlock); + assertEquals(1, toolResultBlock.getOutput().size()); + assertInstanceOf(VideoBlock.class, toolResultBlock.getOutput().get(0)); + VideoBlock vb = (VideoBlock) toolResultBlock.getOutput().get(0); + assertInstanceOf(URLSource.class, vb.getSource()); + assertNotNull(((URLSource) vb.getSource()).getUrl()); + }) + .verifyComplete(); + } + + @Test + 
@DisplayName("Image to video with base64 data url") + void testImageToVideoWithBase64DataUrl() throws IOException { + Mono result = + multiModalTool.dashscopeImageToVideo( + IMAGE_TO_VIDEO_PROMPT, + "wan2.6-i2v-flash", + MediaUtils.urlToBase64DataUrl(TEST_IMAGE_PATH), /* Base64 data URL */ + TEST_AUDIO_URL, + "low quality", + "hanfu-1", + "480P", + 10, + "single", + true, + true, + false, + 0); + + StepVerifier.create(result) + .assertNext( + toolResultBlock -> { + assertNotNull(toolResultBlock); + assertEquals(1, toolResultBlock.getOutput().size()); + assertInstanceOf(VideoBlock.class, toolResultBlock.getOutput().get(0)); + VideoBlock vb = (VideoBlock) toolResultBlock.getOutput().get(0); + assertInstanceOf(URLSource.class, vb.getSource()); + assertNotNull(((URLSource) vb.getSource()).getUrl()); + }) + .verifyComplete(); + } + + @Test + @DisplayName("First and last frame image to video with image url") + void testFirstAndLastFrameImageToVideoWithUrl() { + Mono result = + multiModalTool.dashscopeFirstAndLastFrameImageToVideo( + FIRST_AND_LAST_FRAME_IMAGE_TO_VIDEO_PROMPT, + "wan2.2-kf2v-flash", + TEST_FIRST_FRAME_URL, + TEST_LAST_FRAME_URL, + "", + "hanfu-1", + "480P", + true, + false, + 0); + + StepVerifier.create(result) + .assertNext( + toolResultBlock -> { + assertNotNull(toolResultBlock); + assertEquals(1, toolResultBlock.getOutput().size()); + assertInstanceOf(VideoBlock.class, toolResultBlock.getOutput().get(0)); + VideoBlock vb = (VideoBlock) toolResultBlock.getOutput().get(0); + assertInstanceOf(URLSource.class, vb.getSource()); + assertNotNull(((URLSource) vb.getSource()).getUrl()); + }) + .verifyComplete(); + } + + @Test + @DisplayName("First and last frame image to video with local image file") + void testFirstAndLastFrameImageToVideoWithFile() throws IOException { + Mono result = + multiModalTool.dashscopeFirstAndLastFrameImageToVideo( + TEXT_TO_VIDEO_PROMPT, + "wan2.2-kf2v-flash", + TEST_IMAGE_PATH, + null, + "", + "hanfu-1", + "480P", + true, + false, + 0); + + 
StepVerifier.create(result) + .assertNext( + toolResultBlock -> { + assertNotNull(toolResultBlock); + assertEquals(1, toolResultBlock.getOutput().size()); + assertInstanceOf(VideoBlock.class, toolResultBlock.getOutput().get(0)); + VideoBlock vb = (VideoBlock) toolResultBlock.getOutput().get(0); + assertInstanceOf(URLSource.class, vb.getSource()); + assertNotNull(((URLSource) vb.getSource()).getUrl()); + }) + .verifyComplete(); + } + + @Test + @DisplayName("First and last frame image to video with base64 data url") + void testFirstAndLastFrameImageToVideoWithBase64DataUrl() throws IOException { + Mono result = + multiModalTool.dashscopeFirstAndLastFrameImageToVideo( + TEXT_TO_VIDEO_PROMPT, + "wan2.2-kf2v-flash", + MediaUtils.urlToBase64DataUrl(TEST_FIRST_FRAME_URL), + MediaUtils.urlToBase64DataUrl(TEST_LAST_FRAME_URL), + "", + "hanfu-1", + "480P", + true, + false, + 0); + + StepVerifier.create(result) + .assertNext( + toolResultBlock -> { + assertNotNull(toolResultBlock); + assertEquals(1, toolResultBlock.getOutput().size()); + assertInstanceOf(VideoBlock.class, toolResultBlock.getOutput().get(0)); + VideoBlock vb = (VideoBlock) toolResultBlock.getOutput().get(0); + assertInstanceOf(URLSource.class, vb.getSource()); + assertNotNull(((URLSource) vb.getSource()).getUrl()); + }) + .verifyComplete(); + } + + @Test + @DisplayName("Video to text with video url") + void testVideoToTextWithUrl() { + Mono result = + multiModalTool.dashscopeVideoToText( + TEST_VIDEO_URL, VIDEO_TO_TEXT_PROMPT, "qwen3.5-plus", 2.0); + + StepVerifier.create(result) + .assertNext( + toolResultBlock -> { + assertNotNull(toolResultBlock); + assertEquals(1, toolResultBlock.getOutput().size()); + assertInstanceOf(TextBlock.class, toolResultBlock.getOutput().get(0)); + assertNotNull( + ((TextBlock) toolResultBlock.getOutput().get(0)).getText()); + }) + .verifyComplete(); + } + + @Test + @DisplayName("Video to text with local video file") + void testVideoToTextWithFile() { + Mono result = + 
multiModalTool.dashscopeVideoToText( + TEST_VIDEO_PATH, VIDEO_TO_TEXT_PROMPT, "qwen3.5-plus", 2.0); + + StepVerifier.create(result) + .assertNext( + toolResultBlock -> { + assertNotNull(toolResultBlock); + assertEquals(1, toolResultBlock.getOutput().size()); + assertInstanceOf(TextBlock.class, toolResultBlock.getOutput().get(0)); + assertNotNull( + ((TextBlock) toolResultBlock.getOutput().get(0)).getText()); + }) + .verifyComplete(); + } + + @Test + @DisplayName("Video to text with base64 data url") + void testVideoToTextWithBase64DataUrl() throws IOException { + Mono result = + multiModalTool.dashscopeVideoToText( + MediaUtils.urlToBase64DataUrl(TEST_VIDEO_URL), + VIDEO_TO_TEXT_PROMPT, + "qwen3.5-plus", + 2.0); + + StepVerifier.create(result) + .assertNext( + toolResultBlock -> { + assertNotNull(toolResultBlock); + assertEquals(1, toolResultBlock.getOutput().size()); + assertInstanceOf(TextBlock.class, toolResultBlock.getOutput().get(0)); assertNotNull( ((TextBlock) toolResultBlock.getOutput().get(0)).getText()); }) diff --git a/agentscope-core/src/test/java/io/agentscope/core/tool/multimodal/DashScopeMultiModalToolTest.java b/agentscope-core/src/test/java/io/agentscope/core/tool/multimodal/DashScopeMultiModalToolTest.java index 2e99db059..3f62ad5ee 100644 --- a/agentscope-core/src/test/java/io/agentscope/core/tool/multimodal/DashScopeMultiModalToolTest.java +++ b/agentscope-core/src/test/java/io/agentscope/core/tool/multimodal/DashScopeMultiModalToolTest.java @@ -17,6 +17,7 @@ import static org.junit.jupiter.api.Assertions.assertDoesNotThrow; import static org.junit.jupiter.api.Assertions.assertEquals; +import static org.junit.jupiter.api.Assertions.assertInstanceOf; import static org.junit.jupiter.api.Assertions.assertNotNull; import static org.junit.jupiter.api.Assertions.assertTrue; import static org.mockito.ArgumentMatchers.any; @@ -41,6 +42,10 @@ import com.alibaba.dashscope.aigc.multimodalconversation.MultiModalConversationOutput; import 
com.alibaba.dashscope.aigc.multimodalconversation.MultiModalConversationParam; import com.alibaba.dashscope.aigc.multimodalconversation.MultiModalConversationResult; +import com.alibaba.dashscope.aigc.videosynthesis.VideoSynthesis; +import com.alibaba.dashscope.aigc.videosynthesis.VideoSynthesisOutput; +import com.alibaba.dashscope.aigc.videosynthesis.VideoSynthesisParam; +import com.alibaba.dashscope.aigc.videosynthesis.VideoSynthesisResult; import com.alibaba.dashscope.api.SynchronizeFullDuplexApi; import com.alibaba.dashscope.audio.asr.recognition.Recognition; import com.alibaba.dashscope.audio.asr.recognition.RecognitionParam; @@ -57,6 +62,7 @@ import io.agentscope.core.message.TextBlock; import io.agentscope.core.message.ToolResultBlock; import io.agentscope.core.message.URLSource; +import io.agentscope.core.message.VideoBlock; import java.io.ByteArrayInputStream; import java.io.IOException; import java.io.OutputStream; @@ -89,12 +95,20 @@ class DashScopeMultiModalToolTest { private static final String TEST_API_KEY = "test_api_key"; private static final String TEXT_TO_IMAGE_PROMPT = "A small dog."; private static final String IMAGE_TO_TEXT_PROMPT = "Describe the image."; + private static final String TEXT_TO_VIDEO_PROMPT = "A smart cat is running in the moonlight."; + private static final String VIDEO_TO_TEXT_PROMPT = "Describe the video."; private static final String TEST_IMAGE0_URL = "https://example.com/image0.png"; private static final String TEST_IMAGE1_URL = "https://example.com/image1.png"; private static final String TEST_IMAGE_PATH = "/path/image.png"; + private static final String TEST_IMAGE_BASE64_DATA_URL = + "data:image/png;base64,iVBORw0KGgoAAAANSUhEUgAABDg..."; private static final String TEST_AUDIO_URL = "https://example.com/audio.wav"; private static final String TEST_AUDIO_PATH = "/path/audio.wav"; - private static final String TEST_AUDIO_TEXT = "text audio text"; + private static final String TEST_AUDIO_TEXT = "test audio text"; + private 
static final String TEST_VIDEO_URL = "https://example.com/video.mp4"; + private static final String TEST_VIDEO_PATH = "/path/video.mp4"; + private static final String TEST_VIDEO_BASE64_DATA_URL = + "data:video/mp4;base64,/9j/4AAQSkZJRgABAQAAAQABAAD/2wBDAA..."; // base64 of "hello" private static final String TEST_BASE64_DATA = "aGVsbG8="; private static final String TEST_MULTI_MODAL_CONTENT = "This is a small dog."; @@ -131,9 +145,9 @@ void testTextToImageUrlMode() { toolResultBlock -> { assertNotNull(toolResultBlock); assertEquals(1, toolResultBlock.getOutput().size()); - assertTrue(toolResultBlock.getOutput().get(0) instanceof ImageBlock); + assertInstanceOf(ImageBlock.class, toolResultBlock.getOutput().get(0)); ImageBlock imageBlock = (ImageBlock) toolResultBlock.getOutput().get(0); - assertTrue(imageBlock.getSource() instanceof URLSource); + assertInstanceOf(URLSource.class, imageBlock.getSource()); assertEquals( TEST_IMAGE0_URL, ((URLSource) imageBlock.getSource()).getUrl()); }) @@ -171,9 +185,9 @@ void testTextToImageBase64Mode() throws IOException { toolResultBlock -> { assertNotNull(toolResultBlock); assertEquals(1, toolResultBlock.getOutput().size()); - assertTrue(toolResultBlock.getOutput().get(0) instanceof ImageBlock); + assertInstanceOf(ImageBlock.class, toolResultBlock.getOutput().get(0)); ImageBlock imageBlock = (ImageBlock) toolResultBlock.getOutput().get(0); - assertTrue(imageBlock.getSource() instanceof Base64Source); + assertInstanceOf(Base64Source.class, imageBlock.getSource()); assertEquals( "image/png", ((Base64Source) imageBlock.getSource()).getMediaType()); @@ -215,17 +229,17 @@ void testTextToImageResponseMultiUrl() { toolResultBlock -> { assertNotNull(toolResultBlock); assertEquals(2, toolResultBlock.getOutput().size()); - assertTrue(toolResultBlock.getOutput().get(0) instanceof ImageBlock); - assertTrue(toolResultBlock.getOutput().get(1) instanceof ImageBlock); + assertInstanceOf(ImageBlock.class, toolResultBlock.getOutput().get(0)); + 
assertInstanceOf(ImageBlock.class, toolResultBlock.getOutput().get(1)); ImageBlock image0Block = (ImageBlock) toolResultBlock.getOutput().get(0); - assertTrue(image0Block.getSource() instanceof URLSource); + assertInstanceOf(URLSource.class, image0Block.getSource()); assertEquals( TEST_IMAGE0_URL, ((URLSource) image0Block.getSource()).getUrl()); ImageBlock image1Block = (ImageBlock) toolResultBlock.getOutput().get(1); - assertTrue(image1Block.getSource() instanceof URLSource); + assertInstanceOf(URLSource.class, image1Block.getSource()); assertEquals( TEST_IMAGE1_URL, ((URLSource) image1Block.getSource()).getUrl()); @@ -259,7 +273,7 @@ void testTextToImageResponseEmpty() { toolResultBlock -> { assertNotNull(toolResultBlock); assertEquals(1, toolResultBlock.getOutput().size()); - assertTrue(toolResultBlock.getOutput().get(0) instanceof TextBlock); + assertInstanceOf(TextBlock.class, toolResultBlock.getOutput().get(0)); assertEquals( String.format("Error: %s", "Failed to generate images."), ((TextBlock) toolResultBlock.getOutput().get(0)).getText()); @@ -293,7 +307,7 @@ void testTextToImageResponseNull() { toolResultBlock -> { assertNotNull(toolResultBlock); assertEquals(1, toolResultBlock.getOutput().size()); - assertTrue(toolResultBlock.getOutput().get(0) instanceof TextBlock); + assertInstanceOf(TextBlock.class, toolResultBlock.getOutput().get(0)); assertEquals( String.format("Error: %s", "Failed to generate images."), ((TextBlock) toolResultBlock.getOutput().get(0)).getText()); @@ -309,9 +323,9 @@ void testTextToImageError() { MockedConstruction mockCtor = mockConstruction( ImageSynthesis.class, - (mock, context) -> { - when(mock.call(any(ImageSynthesisParam.class))).thenThrow(TEST_ERROR); - }); + (mock, context) -> + when(mock.call(any(ImageSynthesisParam.class))) + .thenThrow(TEST_ERROR)); Mono result = multiModalTool.dashscopeTextToImage( @@ -322,7 +336,7 @@ void testTextToImageError() { toolResultBlock -> { assertNotNull(toolResultBlock); assertEquals(1, 
toolResultBlock.getOutput().size()); - assertTrue(toolResultBlock.getOutput().get(0) instanceof TextBlock); + assertInstanceOf(TextBlock.class, toolResultBlock.getOutput().get(0)); assertEquals( String.format("Error: %s", TEST_ERROR.getMessage()), ((TextBlock) toolResultBlock.getOutput().get(0)).getText()); @@ -370,7 +384,7 @@ void testImageToTextWithUrl() { toolResultBlock -> { assertNotNull(toolResultBlock); assertEquals(1, toolResultBlock.getOutput().size()); - assertTrue(toolResultBlock.getOutput().get(0) instanceof TextBlock); + assertInstanceOf(TextBlock.class, toolResultBlock.getOutput().get(0)); assertEquals( TEST_MULTI_MODAL_CONTENT, ((TextBlock) toolResultBlock.getOutput().get(0)).getText()); @@ -421,7 +435,7 @@ void testImageToTextWithFile() throws IOException { toolResultBlock -> { assertNotNull(toolResultBlock); assertEquals(1, toolResultBlock.getOutput().size()); - assertTrue(toolResultBlock.getOutput().get(0) instanceof TextBlock); + assertInstanceOf(TextBlock.class, toolResultBlock.getOutput().get(0)); assertEquals( TEST_MULTI_MODAL_CONTENT, ((TextBlock) toolResultBlock.getOutput().get(0)).getText()); @@ -432,6 +446,54 @@ void testImageToTextWithFile() throws IOException { mockedConv.close(); } + @Test + @DisplayName("Image to text with base64 data url") + void testImageToTextWithBase64DataUrl() { + MockedConstruction mockedConv = + mockConstruction( + MultiModalConversation.class, + (mock, context) -> { + MultiModalConversationResult mockResult = + mock(MultiModalConversationResult.class); + MultiModalConversationOutput mockOutput = + mock(MultiModalConversationOutput.class); + + when(mockResult.getOutput()).thenReturn(mockOutput); + MultiModalConversationOutput.Choice choice = + new MultiModalConversationOutput.Choice(); + choice.setMessage( + MultiModalMessage.builder() + .content( + List.of( + Map.of( + "text", + TEST_MULTI_MODAL_CONTENT))) + .build()); + choice.setFinishReason("stop"); + 
when(mockOutput.getChoices()).thenReturn(List.of(choice)); + when(mock.call(any(MultiModalConversationParam.class))) + .thenReturn(mockResult); + }); + + Mono result = + multiModalTool.dashscopeImageToText( + List.of(TEST_IMAGE_BASE64_DATA_URL), IMAGE_TO_TEXT_PROMPT, "qwen3-vl-plus"); + + StepVerifier.create(result) + .assertNext( + toolResultBlock -> { + assertNotNull(toolResultBlock); + assertEquals(1, toolResultBlock.getOutput().size()); + assertInstanceOf(TextBlock.class, toolResultBlock.getOutput().get(0)); + assertEquals( + TEST_MULTI_MODAL_CONTENT, + ((TextBlock) toolResultBlock.getOutput().get(0)).getText()); + }) + .verifyComplete(); + + mockedConv.close(); + } + @Test @DisplayName("Image to text with local file and web url") void testImageToTextWithFileAndUrl() throws IOException { @@ -475,7 +537,7 @@ void testImageToTextWithFileAndUrl() throws IOException { toolResultBlock -> { assertNotNull(toolResultBlock); assertEquals(1, toolResultBlock.getOutput().size()); - assertTrue(toolResultBlock.getOutput().get(0) instanceof TextBlock); + assertInstanceOf(TextBlock.class, toolResultBlock.getOutput().get(0)); }) .verifyComplete(); @@ -515,7 +577,7 @@ void testImageToTextResponseEmpty() { toolResultBlock -> { assertNotNull(toolResultBlock); assertEquals(1, toolResultBlock.getOutput().size()); - assertTrue(toolResultBlock.getOutput().get(0) instanceof TextBlock); + assertInstanceOf(TextBlock.class, toolResultBlock.getOutput().get(0)); assertEquals( String.format("Error: %s", "Failed to generate text."), ((TextBlock) toolResultBlock.getOutput().get(0)).getText()); @@ -552,7 +614,7 @@ void testImageToTextResponseNull() { toolResultBlock -> { assertNotNull(toolResultBlock); assertEquals(1, toolResultBlock.getOutput().size()); - assertTrue(toolResultBlock.getOutput().get(0) instanceof TextBlock); + assertInstanceOf(TextBlock.class, toolResultBlock.getOutput().get(0)); assertEquals( String.format("Error: %s", "Failed to generate text."), ((TextBlock) 
toolResultBlock.getOutput().get(0)).getText()); @@ -568,10 +630,9 @@ void testImageToTextError() { MockedConstruction mockConv = mockConstruction( MultiModalConversation.class, - (mock, context) -> { - when(mock.call(any(MultiModalConversationParam.class))) - .thenThrow(TEST_ERROR); - }); + (mock, context) -> + when(mock.call(any(MultiModalConversationParam.class))) + .thenThrow(TEST_ERROR)); Mono result = multiModalTool.dashscopeImageToText( @@ -582,7 +643,7 @@ void testImageToTextError() { toolResultBlock -> { assertNotNull(toolResultBlock); assertEquals(1, toolResultBlock.getOutput().size()); - assertTrue(toolResultBlock.getOutput().get(0) instanceof TextBlock); + assertInstanceOf(TextBlock.class, toolResultBlock.getOutput().get(0)); assertEquals( String.format("Error: %s", TEST_ERROR.getMessage()), ((TextBlock) toolResultBlock.getOutput().get(0)).getText()); @@ -612,9 +673,9 @@ void testTextToAudioWithSambertSuccess() { toolResultBlock -> { assertNotNull(toolResultBlock); assertEquals(1, toolResultBlock.getOutput().size()); - assertTrue(toolResultBlock.getOutput().get(0) instanceof AudioBlock); + assertInstanceOf(AudioBlock.class, toolResultBlock.getOutput().get(0)); AudioBlock audioBlock = (AudioBlock) toolResultBlock.getOutput().get(0); - assertTrue(audioBlock.getSource() instanceof Base64Source); + assertInstanceOf(Base64Source.class, audioBlock.getSource()); assertEquals( TEST_BASE64_DATA, ((Base64Source) audioBlock.getSource()).getData()); @@ -643,15 +704,17 @@ private ToolResultBlock invokeParseQwenTTSResponse(String responseBody) throws E @DisplayName("Parse Qwen TTS response with URL") void testParseQwenTTSResponseWithUrl() throws Exception { String responseJson = - "{\"output\":{\"audio\":{\"url\":\"https://example.com/audio.wav\"}},\"request_id\":\"test-request-id\"}"; + """ + {"output":{"audio":{"url":"https://example.com/audio.wav"}},"request_id":"test-request-id"} + """; ToolResultBlock result = invokeParseQwenTTSResponse(responseJson); 
assertNotNull(result); assertEquals(1, result.getOutput().size()); - assertTrue(result.getOutput().get(0) instanceof AudioBlock); + assertInstanceOf(AudioBlock.class, result.getOutput().get(0)); AudioBlock audioBlock = (AudioBlock) result.getOutput().get(0); - assertTrue(audioBlock.getSource() instanceof URLSource); + assertInstanceOf(URLSource.class, audioBlock.getSource()); assertEquals( "https://example.com/audio.wav", ((URLSource) audioBlock.getSource()).getUrl()); } @@ -669,9 +732,9 @@ void testParseQwenTTSResponseWithBase64() throws Exception { assertNotNull(result); assertEquals(1, result.getOutput().size()); - assertTrue(result.getOutput().get(0) instanceof AudioBlock); + assertInstanceOf(AudioBlock.class, result.getOutput().get(0)); AudioBlock audioBlock = (AudioBlock) result.getOutput().get(0); - assertTrue(audioBlock.getSource() instanceof Base64Source); + assertInstanceOf(Base64Source.class, audioBlock.getSource()); assertEquals(testBase64, ((Base64Source) audioBlock.getSource()).getData()); assertEquals("audio/wav", ((Base64Source) audioBlock.getSource()).getMediaType()); } @@ -685,7 +748,7 @@ void testParseQwenTTSResponseWithError() throws Exception { assertNotNull(result); assertEquals(1, result.getOutput().size()); - assertTrue(result.getOutput().get(0) instanceof TextBlock); + assertInstanceOf(TextBlock.class, result.getOutput().get(0)); assertTrue( ((TextBlock) result.getOutput().get(0)).getText().contains("Invalid request")); } @@ -699,7 +762,7 @@ void testParseQwenTTSResponseMissingOutput() throws Exception { assertNotNull(result); assertEquals(1, result.getOutput().size()); - assertTrue(result.getOutput().get(0) instanceof TextBlock); + assertInstanceOf(TextBlock.class, result.getOutput().get(0)); assertTrue( ((TextBlock) result.getOutput().get(0)) .getText() @@ -715,7 +778,7 @@ void testParseQwenTTSResponseMissingAudio() throws Exception { assertNotNull(result); assertEquals(1, result.getOutput().size()); - assertTrue(result.getOutput().get(0) 
instanceof TextBlock); + assertInstanceOf(TextBlock.class, result.getOutput().get(0)); assertTrue( ((TextBlock) result.getOutput().get(0)) .getText() @@ -731,7 +794,7 @@ void testParseQwenTTSResponseNoAudioData() throws Exception { assertNotNull(result); assertEquals(1, result.getOutput().size()); - assertTrue(result.getOutput().get(0) instanceof TextBlock); + assertInstanceOf(TextBlock.class, result.getOutput().get(0)); assertTrue( ((TextBlock) result.getOutput().get(0)) .getText() @@ -747,7 +810,7 @@ void testParseQwenTTSResponseInvalidJson() throws Exception { assertNotNull(result); assertEquals(1, result.getOutput().size()); - assertTrue(result.getOutput().get(0) instanceof TextBlock); + assertInstanceOf(TextBlock.class, result.getOutput().get(0)); assertTrue( ((TextBlock) result.getOutput().get(0)) .getText() @@ -763,7 +826,7 @@ void testParseQwenTTSResponseErrorNoMessage() throws Exception { assertNotNull(result); assertEquals(1, result.getOutput().size()); - assertTrue(result.getOutput().get(0) instanceof TextBlock); + assertInstanceOf(TextBlock.class, result.getOutput().get(0)); assertTrue(((TextBlock) result.getOutput().get(0)).getText().contains("Unknown error")); } } @@ -796,10 +859,9 @@ void testTextToAudioResponseEmpty() { MockedConstruction mockCtor = Mockito.mockConstruction( SpeechSynthesizer.class, - (mock, context) -> { - when(mock.call(any(SpeechSynthesisParam.class))) - .thenReturn(ByteBuffer.allocate(0)); - }); + (mock, context) -> + when(mock.call(any(SpeechSynthesisParam.class))) + .thenReturn(ByteBuffer.allocate(0))); Mono result = multiModalTool.dashscopeTextToAudio( @@ -810,7 +872,7 @@ void testTextToAudioResponseEmpty() { toolResultBlock -> { assertNotNull(toolResultBlock); assertEquals(1, toolResultBlock.getOutput().size()); - assertTrue(toolResultBlock.getOutput().get(0) instanceof TextBlock); + assertInstanceOf(TextBlock.class, toolResultBlock.getOutput().get(0)); assertEquals( String.format("Error: %s", "Failed to generate audio."), 
((TextBlock) toolResultBlock.getOutput().get(0)).getText()); @@ -826,9 +888,8 @@ void testTextToAudioResponseNull() { MockedConstruction mockCtor = Mockito.mockConstruction( SpeechSynthesizer.class, - (mock, context) -> { - when(mock.call(any(SpeechSynthesisParam.class))).thenReturn(null); - }); + (mock, context) -> + when(mock.call(any(SpeechSynthesisParam.class))).thenReturn(null)); Mono result = multiModalTool.dashscopeTextToAudio( @@ -839,7 +900,7 @@ void testTextToAudioResponseNull() { toolResultBlock -> { assertNotNull(toolResultBlock); assertEquals(1, toolResultBlock.getOutput().size()); - assertTrue(toolResultBlock.getOutput().get(0) instanceof TextBlock); + assertInstanceOf(TextBlock.class, toolResultBlock.getOutput().get(0)); assertEquals( String.format("Error: %s", "Failed to generate audio."), ((TextBlock) toolResultBlock.getOutput().get(0)).getText()); @@ -855,9 +916,9 @@ void testTextToAudioError() { MockedConstruction mockCtor = Mockito.mockConstruction( SpeechSynthesizer.class, - (mock, context) -> { - when(mock.call(any(SpeechSynthesisParam.class))).thenThrow(TEST_ERROR); - }); + (mock, context) -> + when(mock.call(any(SpeechSynthesisParam.class))) + .thenThrow(TEST_ERROR)); Mono result = multiModalTool.dashscopeTextToAudio( @@ -868,7 +929,7 @@ void testTextToAudioError() { toolResultBlock -> { assertNotNull(toolResultBlock); assertEquals(1, toolResultBlock.getOutput().size()); - assertTrue(toolResultBlock.getOutput().get(0) instanceof TextBlock); + assertInstanceOf(TextBlock.class, toolResultBlock.getOutput().get(0)); assertEquals( String.format("Error: %s", TEST_ERROR.getMessage()), ((TextBlock) toolResultBlock.getOutput().get(0)).getText()); @@ -920,7 +981,7 @@ void testAudioToTextWithUrl() throws Exception { toolResultBlock -> { assertNotNull(toolResultBlock); assertEquals(1, toolResultBlock.getOutput().size()); - assertTrue(toolResultBlock.getOutput().get(0) instanceof TextBlock); + assertInstanceOf(TextBlock.class, 
toolResultBlock.getOutput().get(0)); assertEquals( TEST_AUDIO_TEXT, ((TextBlock) toolResultBlock.getOutput().get(0)).getText()); @@ -972,7 +1033,7 @@ void testAudioToTextWithFile() throws Exception { toolResultBlock -> { assertNotNull(toolResultBlock); assertEquals(1, toolResultBlock.getOutput().size()); - assertTrue(toolResultBlock.getOutput().get(0) instanceof TextBlock); + assertInstanceOf(TextBlock.class, toolResultBlock.getOutput().get(0)); assertEquals( TEST_AUDIO_TEXT, ((TextBlock) toolResultBlock.getOutput().get(0)).getText()); @@ -1010,7 +1071,7 @@ void testAudioToTextError() throws Exception { toolResultBlock -> { assertNotNull(toolResultBlock); assertEquals(1, toolResultBlock.getOutput().size()); - assertTrue(toolResultBlock.getOutput().get(0) instanceof TextBlock); + assertInstanceOf(TextBlock.class, toolResultBlock.getOutput().get(0)); assertEquals( String.format("Error: %s", TEST_ERROR.getMessage()), ((TextBlock) toolResultBlock.getOutput().get(0)).getText()); @@ -1055,4 +1116,831 @@ void testSendChunkAudioWithFile() throws Exception { Files.deleteIfExists(tempAudioFile); } + + @Test + @DisplayName("Should return a video url when text to video invoked success") + void testTextToVideoUrl() { + MockedConstruction mockCtor = + mockConstruction( + VideoSynthesis.class, + (mock, context) -> { + VideoSynthesisResult mockResult = mock(VideoSynthesisResult.class); + VideoSynthesisOutput mockOutput = mock(VideoSynthesisOutput.class); + + when(mock.call(any(VideoSynthesisParam.class))).thenReturn(mockResult); + when(mockResult.getOutput()).thenReturn(mockOutput); + when(mockOutput.getVideoUrl()).thenReturn(TEST_VIDEO_URL); + }); + + Mono result = + multiModalTool.dashscopeTextToVideo( + TEXT_TO_VIDEO_PROMPT, + "wan2.6-t2v", + "low quality", + TEST_AUDIO_URL, + "1920*1080", + 5, + "single", + true, + false, + 0); + + StepVerifier.create(result) + .assertNext( + toolResultBlock -> { + assertNotNull(toolResultBlock); + assertEquals(1, 
toolResultBlock.getOutput().size()); + assertInstanceOf(VideoBlock.class, toolResultBlock.getOutput().get(0)); + VideoBlock vb = (VideoBlock) toolResultBlock.getOutput().get(0); + assertInstanceOf(URLSource.class, vb.getSource()); + assertEquals(TEST_VIDEO_URL, ((URLSource) vb.getSource()).getUrl()); + }) + .verifyComplete(); + + mockCtor.close(); + } + + @Test + @DisplayName("Should return error TextBlock when call text to video response null") + void testTextToVideoResponseNull() { + MockedConstruction mockCtor = + mockConstruction( + VideoSynthesis.class, + (mock, context) -> { + VideoSynthesisResult mockResult = mock(VideoSynthesisResult.class); + VideoSynthesisOutput mockOutput = mock(VideoSynthesisOutput.class); + + when(mock.call(any(VideoSynthesisParam.class))).thenReturn(mockResult); + when(mockResult.getOutput()).thenReturn(mockOutput); + when(mockOutput.getVideoUrl()).thenReturn(null); + }); + + Mono result = + multiModalTool.dashscopeTextToVideo( + TEXT_TO_VIDEO_PROMPT, + "wan2.6-t2v", + "low quality", + TEST_AUDIO_URL, + "1920*1080", + 5, + "single", + true, + false, + 0); + + StepVerifier.create(result) + .assertNext( + toolResultBlock -> { + assertNotNull(toolResultBlock); + assertEquals(1, toolResultBlock.getOutput().size()); + assertInstanceOf(TextBlock.class, toolResultBlock.getOutput().get(0)); + assertEquals( + String.format("Error: %s", "Failed to generate video."), + ((TextBlock) toolResultBlock.getOutput().get(0)).getText()); + }) + .verifyComplete(); + + mockCtor.close(); + } + + @Test + @DisplayName("Should return error TextBlock when call text to video occurs error") + void testTextToVideoError() { + MockedConstruction mockCtor = + mockConstruction( + VideoSynthesis.class, + (mock, context) -> + when(mock.call(any(VideoSynthesisParam.class))) + .thenThrow(TEST_ERROR)); + + Mono result = + multiModalTool.dashscopeTextToVideo( + TEXT_TO_VIDEO_PROMPT, + "wan2.6-t2v", + "low quality", + TEST_AUDIO_URL, + "1920*1080", + 5, + "single", + true, + 
false, + 0); + + StepVerifier.create(result) + .assertNext( + toolResultBlock -> { + assertNotNull(toolResultBlock); + assertEquals(1, toolResultBlock.getOutput().size()); + assertInstanceOf(TextBlock.class, toolResultBlock.getOutput().get(0)); + assertEquals( + String.format("Error: %s", TEST_ERROR.getMessage()), + ((TextBlock) toolResultBlock.getOutput().get(0)).getText()); + }) + .verifyComplete(); + + mockCtor.close(); + } + + @Test + @DisplayName("Image to video with image url") + void testImageToVideoWithUrl() { + MockedConstruction mockCtor = + mockConstruction( + VideoSynthesis.class, + (mock, context) -> { + VideoSynthesisResult mockResult = mock(VideoSynthesisResult.class); + VideoSynthesisOutput mockOutput = mock(VideoSynthesisOutput.class); + + when(mock.call(any(VideoSynthesisParam.class))).thenReturn(mockResult); + when(mockResult.getOutput()).thenReturn(mockOutput); + when(mockOutput.getVideoUrl()).thenReturn(TEST_VIDEO_URL); + }); + + Mono result = + multiModalTool.dashscopeImageToVideo( + TEXT_TO_VIDEO_PROMPT, + "wan2.6-i2v-flash", + TEST_IMAGE0_URL, + TEST_AUDIO_URL, + "", + "hanfu-1", + "480P", + 10, + "single", + true, + true, + false, + 0); + + StepVerifier.create(result) + .assertNext( + toolResultBlock -> { + assertNotNull(toolResultBlock); + assertEquals(1, toolResultBlock.getOutput().size()); + assertInstanceOf(VideoBlock.class, toolResultBlock.getOutput().get(0)); + VideoBlock vb = (VideoBlock) toolResultBlock.getOutput().get(0); + assertInstanceOf(URLSource.class, vb.getSource()); + assertEquals(TEST_VIDEO_URL, ((URLSource) vb.getSource()).getUrl()); + }) + .verifyComplete(); + + mockCtor.close(); + } + + @Test + @DisplayName("Image to video with local image file") + void testImageToVideoWithFile() throws IOException { + MockedStatic mockMediaUtils = mockStatic(MediaUtils.class); + when(MediaUtils.urlToProtocolUrl(TEST_IMAGE_PATH)).thenReturn("file://" + TEST_IMAGE_PATH); + + MockedConstruction mockCtor = + mockConstruction( + 
VideoSynthesis.class, + (mock, context) -> { + VideoSynthesisResult mockResult = mock(VideoSynthesisResult.class); + VideoSynthesisOutput mockOutput = mock(VideoSynthesisOutput.class); + + when(mock.call(any(VideoSynthesisParam.class))).thenReturn(mockResult); + when(mockResult.getOutput()).thenReturn(mockOutput); + when(mockOutput.getVideoUrl()).thenReturn(TEST_VIDEO_URL); + }); + + Mono result = + multiModalTool.dashscopeImageToVideo( + TEXT_TO_VIDEO_PROMPT, + "wan2.6-i2v-flash", + TEST_IMAGE_PATH, + TEST_AUDIO_URL, + "", + "hanfu-1", + "480P", + 10, + "single", + true, + true, + false, + 0); + + StepVerifier.create(result) + .assertNext( + toolResultBlock -> { + assertNotNull(toolResultBlock); + assertEquals(1, toolResultBlock.getOutput().size()); + assertInstanceOf(VideoBlock.class, toolResultBlock.getOutput().get(0)); + VideoBlock vb = (VideoBlock) toolResultBlock.getOutput().get(0); + assertInstanceOf(URLSource.class, vb.getSource()); + assertEquals(TEST_VIDEO_URL, ((URLSource) vb.getSource()).getUrl()); + }) + .verifyComplete(); + + mockMediaUtils.close(); + mockCtor.close(); + } + + @Test + @DisplayName("Image to video with base64 data url") + void testImageToVideoWithBase64DataUrl() { + MockedConstruction mockCtor = + mockConstruction( + VideoSynthesis.class, + (mock, context) -> { + VideoSynthesisResult mockResult = mock(VideoSynthesisResult.class); + VideoSynthesisOutput mockOutput = mock(VideoSynthesisOutput.class); + + when(mock.call(any(VideoSynthesisParam.class))).thenReturn(mockResult); + when(mockResult.getOutput()).thenReturn(mockOutput); + when(mockOutput.getVideoUrl()).thenReturn(TEST_VIDEO_URL); + }); + + Mono result = + multiModalTool.dashscopeImageToVideo( + TEXT_TO_VIDEO_PROMPT, + "wan2.6-i2v-flash", + TEST_IMAGE_BASE64_DATA_URL, + TEST_AUDIO_URL, + "", + "hanfu-1", + "480P", + 10, + "single", + true, + true, + false, + 0); + + StepVerifier.create(result) + .assertNext( + toolResultBlock -> { + assertNotNull(toolResultBlock); + 
assertEquals(1, toolResultBlock.getOutput().size()); + assertInstanceOf(VideoBlock.class, toolResultBlock.getOutput().get(0)); + VideoBlock vb = (VideoBlock) toolResultBlock.getOutput().get(0); + assertInstanceOf(URLSource.class, vb.getSource()); + assertEquals(TEST_VIDEO_URL, ((URLSource) vb.getSource()).getUrl()); + }) + .verifyComplete(); + + mockCtor.close(); + } + + @Test + @DisplayName("Should return error TextBlock when call image to video response" + " null") + void testImageToVideoResponseNull() { + MockedConstruction mockCtor = + mockConstruction( + VideoSynthesis.class, + (mock, context) -> { + VideoSynthesisResult mockResult = mock(VideoSynthesisResult.class); + VideoSynthesisOutput mockOutput = mock(VideoSynthesisOutput.class); + + when(mock.call(any(VideoSynthesisParam.class))).thenReturn(mockResult); + when(mockResult.getOutput()).thenReturn(mockOutput); + when(mockOutput.getVideoUrl()).thenReturn(null); + }); + + Mono result = + multiModalTool.dashscopeImageToVideo( + TEXT_TO_VIDEO_PROMPT, + "wan2.6-i2v-flash", + TEST_IMAGE0_URL, + TEST_AUDIO_URL, + "", + "hanfu-1", + "480P", + 10, + "single", + true, + true, + false, + 0); + + StepVerifier.create(result) + .assertNext( + toolResultBlock -> { + assertNotNull(toolResultBlock); + assertEquals(1, toolResultBlock.getOutput().size()); + assertInstanceOf(TextBlock.class, toolResultBlock.getOutput().get(0)); + assertEquals( + String.format("Error: %s", "Failed to generate video."), + ((TextBlock) toolResultBlock.getOutput().get(0)).getText()); + }) + .verifyComplete(); + + mockCtor.close(); + } + + @Test + @DisplayName("Should return error TextBlock when call image to video occurs" + " error") + void testImageToVideoError() { + MockedConstruction mockCtor = + mockConstruction( + VideoSynthesis.class, + (mock, context) -> + when(mock.call(any(VideoSynthesisParam.class))) + .thenThrow(TEST_ERROR)); + + Mono result = + multiModalTool.dashscopeImageToVideo( + TEXT_TO_VIDEO_PROMPT, + "wan2.6-i2v-flash", + 
TEST_IMAGE0_URL, + TEST_AUDIO_URL, + "", + "hanfu-1", + "480P", + 10, + "single", + true, + true, + false, + 0); + + StepVerifier.create(result) + .assertNext( + toolResultBlock -> { + assertNotNull(toolResultBlock); + assertEquals(1, toolResultBlock.getOutput().size()); + assertInstanceOf(TextBlock.class, toolResultBlock.getOutput().get(0)); + assertEquals( + String.format("Error: %s", TEST_ERROR.getMessage()), + ((TextBlock) toolResultBlock.getOutput().get(0)).getText()); + }) + .verifyComplete(); + + mockCtor.close(); + } + + @Test + @DisplayName("First and last frame image to video with image url") + void testFirstAndLastFrameImageToVideoWithUrl() { + MockedConstruction mockCtor = + mockConstruction( + VideoSynthesis.class, + (mock, context) -> { + VideoSynthesisResult mockResult = mock(VideoSynthesisResult.class); + VideoSynthesisOutput mockOutput = mock(VideoSynthesisOutput.class); + + when(mock.call(any(VideoSynthesisParam.class))).thenReturn(mockResult); + when(mockResult.getOutput()).thenReturn(mockOutput); + when(mockOutput.getVideoUrl()).thenReturn(TEST_VIDEO_URL); + }); + + Mono result = + multiModalTool.dashscopeFirstAndLastFrameImageToVideo( + TEXT_TO_VIDEO_PROMPT, + "wan2.2-kf2v-flash", + TEST_IMAGE0_URL, + TEST_IMAGE1_URL, + "", + "hanfu-1", + "480P", + true, + false, + 0); + + StepVerifier.create(result) + .assertNext( + toolResultBlock -> { + assertNotNull(toolResultBlock); + assertEquals(1, toolResultBlock.getOutput().size()); + assertInstanceOf(VideoBlock.class, toolResultBlock.getOutput().get(0)); + VideoBlock vb = (VideoBlock) toolResultBlock.getOutput().get(0); + assertInstanceOf(URLSource.class, vb.getSource()); + assertEquals(TEST_VIDEO_URL, ((URLSource) vb.getSource()).getUrl()); + }) + .verifyComplete(); + + mockCtor.close(); + } + + @Test + @DisplayName("First and last frame image to video with local image file") + void testFirstAndLastFrameImageToVideoWithFile() throws IOException { + MockedStatic mockMediaUtils = 
mockStatic(MediaUtils.class); + when(MediaUtils.urlToProtocolUrl(TEST_IMAGE_PATH)).thenReturn("file://" + TEST_IMAGE_PATH); + + MockedConstruction mockCtor = + mockConstruction( + VideoSynthesis.class, + (mock, context) -> { + VideoSynthesisResult mockResult = mock(VideoSynthesisResult.class); + VideoSynthesisOutput mockOutput = mock(VideoSynthesisOutput.class); + + when(mock.call(any(VideoSynthesisParam.class))).thenReturn(mockResult); + when(mockResult.getOutput()).thenReturn(mockOutput); + when(mockOutput.getVideoUrl()).thenReturn(TEST_VIDEO_URL); + }); + + Mono result = + multiModalTool.dashscopeFirstAndLastFrameImageToVideo( + TEXT_TO_VIDEO_PROMPT, + "wan2.2-kf2v-flash", + TEST_IMAGE_PATH, + null, + "", + "hanfu-1", + "480P", + true, + false, + 0); + + StepVerifier.create(result) + .assertNext( + toolResultBlock -> { + assertNotNull(toolResultBlock); + assertEquals(1, toolResultBlock.getOutput().size()); + assertInstanceOf(VideoBlock.class, toolResultBlock.getOutput().get(0)); + VideoBlock vb = (VideoBlock) toolResultBlock.getOutput().get(0); + assertInstanceOf(URLSource.class, vb.getSource()); + assertEquals(TEST_VIDEO_URL, ((URLSource) vb.getSource()).getUrl()); + }) + .verifyComplete(); + + mockMediaUtils.close(); + mockCtor.close(); + } + + @Test + @DisplayName("First and last frame image to video with base64 data url") + void testFirstAndLastFrameImageToVideoWithBase64DataUrl() { + MockedConstruction mockCtor = + mockConstruction( + VideoSynthesis.class, + (mock, context) -> { + VideoSynthesisResult mockResult = mock(VideoSynthesisResult.class); + VideoSynthesisOutput mockOutput = mock(VideoSynthesisOutput.class); + + when(mock.call(any(VideoSynthesisParam.class))).thenReturn(mockResult); + when(mockResult.getOutput()).thenReturn(mockOutput); + when(mockOutput.getVideoUrl()).thenReturn(TEST_VIDEO_URL); + }); + + Mono result = + multiModalTool.dashscopeFirstAndLastFrameImageToVideo( + TEXT_TO_VIDEO_PROMPT, + "wan2.2-kf2v-flash", + 
TEST_IMAGE_BASE64_DATA_URL, + null, + "", + "hanfu-1", + "480P", + true, + false, + 0); + + StepVerifier.create(result) + .assertNext( + toolResultBlock -> { + assertNotNull(toolResultBlock); + assertEquals(1, toolResultBlock.getOutput().size()); + assertInstanceOf(VideoBlock.class, toolResultBlock.getOutput().get(0)); + VideoBlock vb = (VideoBlock) toolResultBlock.getOutput().get(0); + assertInstanceOf(URLSource.class, vb.getSource()); + assertEquals(TEST_VIDEO_URL, ((URLSource) vb.getSource()).getUrl()); + }) + .verifyComplete(); + + mockCtor.close(); + } + + @Test + @DisplayName( + "Should return error TextBlock when call first and last frame image to video response" + + " null") + void testFirstAndLastFrameImageToVideoResponseNull() { + MockedConstruction mockCtor = + mockConstruction( + VideoSynthesis.class, + (mock, context) -> { + VideoSynthesisResult mockResult = mock(VideoSynthesisResult.class); + VideoSynthesisOutput mockOutput = mock(VideoSynthesisOutput.class); + + when(mock.call(any(VideoSynthesisParam.class))).thenReturn(mockResult); + when(mockResult.getOutput()).thenReturn(mockOutput); + when(mockOutput.getVideoUrl()).thenReturn(null); + }); + + Mono result = + multiModalTool.dashscopeFirstAndLastFrameImageToVideo( + TEXT_TO_VIDEO_PROMPT, + "wan2.2-kf2v-flash", + TEST_IMAGE_BASE64_DATA_URL, + null, + "", + "hanfu-1", + "480P", + true, + false, + 0); + + StepVerifier.create(result) + .assertNext( + toolResultBlock -> { + assertNotNull(toolResultBlock); + assertEquals(1, toolResultBlock.getOutput().size()); + assertInstanceOf(TextBlock.class, toolResultBlock.getOutput().get(0)); + assertEquals( + String.format("Error: %s", "Failed to generate video."), + ((TextBlock) toolResultBlock.getOutput().get(0)).getText()); + }) + .verifyComplete(); + + mockCtor.close(); + } + + @Test + @DisplayName( + "Should return error TextBlock when call first and last frame image to video occurs" + + " error") + void testFirstAndLastFrameImageToVideoError() { + 
MockedConstruction mockCtor = + mockConstruction( + VideoSynthesis.class, + (mock, context) -> + when(mock.call(any(VideoSynthesisParam.class))) + .thenThrow(TEST_ERROR)); + + Mono result = + multiModalTool.dashscopeFirstAndLastFrameImageToVideo( + TEXT_TO_VIDEO_PROMPT, + "wan2.2-kf2v-flash", + TEST_IMAGE_BASE64_DATA_URL, + null, + "", + "hanfu-1", + "480P", + true, + false, + 0); + + StepVerifier.create(result) + .assertNext( + toolResultBlock -> { + assertNotNull(toolResultBlock); + assertEquals(1, toolResultBlock.getOutput().size()); + assertInstanceOf(TextBlock.class, toolResultBlock.getOutput().get(0)); + assertEquals( + String.format("Error: %s", TEST_ERROR.getMessage()), + ((TextBlock) toolResultBlock.getOutput().get(0)).getText()); + }) + .verifyComplete(); + + mockCtor.close(); + } + + @Test + @DisplayName("Video to text with video url") + void testVideoToTextWithUrl() { + MockedConstruction mockConv = + mockConstruction( + MultiModalConversation.class, + (mock, context) -> { + MultiModalConversationResult mockResult = + mock(MultiModalConversationResult.class); + MultiModalConversationOutput mockOutput = + mock(MultiModalConversationOutput.class); + + when(mockResult.getOutput()).thenReturn(mockOutput); + MultiModalConversationOutput.Choice choice = + new MultiModalConversationOutput.Choice(); + choice.setMessage( + MultiModalMessage.builder() + .content( + List.of( + Map.of( + "text", + TEST_MULTI_MODAL_CONTENT))) + .build()); + choice.setFinishReason("stop"); + when(mockOutput.getChoices()).thenReturn(List.of(choice)); + when(mock.call(any(MultiModalConversationParam.class))) + .thenReturn(mockResult); + }); + + Mono result = + multiModalTool.dashscopeVideoToText( + TEST_VIDEO_URL, VIDEO_TO_TEXT_PROMPT, "qwen3.5-plus", 2.0); + + StepVerifier.create(result) + .assertNext( + toolResultBlock -> { + assertNotNull(toolResultBlock); + assertEquals(1, toolResultBlock.getOutput().size()); + assertInstanceOf(TextBlock.class, toolResultBlock.getOutput().get(0)); 
+ assertEquals( + TEST_MULTI_MODAL_CONTENT, + ((TextBlock) toolResultBlock.getOutput().get(0)).getText()); + }) + .verifyComplete(); + + mockConv.close(); + } + + @Test + @DisplayName("Video to text with local video file") + void testVideoToTextWithFile() throws IOException { + MockedStatic mockMediaUtils = mockStatic(MediaUtils.class); + when(MediaUtils.urlToProtocolUrl(TEST_VIDEO_PATH)).thenReturn("file://" + TEST_VIDEO_PATH); + + MockedConstruction mockConv = + mockConstruction( + MultiModalConversation.class, + (mock, context) -> { + MultiModalConversationResult mockResult = + mock(MultiModalConversationResult.class); + MultiModalConversationOutput mockOutput = + mock(MultiModalConversationOutput.class); + + when(mockResult.getOutput()).thenReturn(mockOutput); + MultiModalConversationOutput.Choice choice = + new MultiModalConversationOutput.Choice(); + choice.setMessage( + MultiModalMessage.builder() + .content( + List.of( + Map.of( + "text", + TEST_MULTI_MODAL_CONTENT))) + .build()); + choice.setFinishReason("stop"); + when(mockOutput.getChoices()).thenReturn(List.of(choice)); + when(mock.call(any(MultiModalConversationParam.class))) + .thenReturn(mockResult); + }); + + Mono result = + multiModalTool.dashscopeVideoToText( + TEST_VIDEO_PATH, VIDEO_TO_TEXT_PROMPT, "qwen3.5-plus", 2.0); + + StepVerifier.create(result) + .assertNext( + toolResultBlock -> { + assertNotNull(toolResultBlock); + assertEquals(1, toolResultBlock.getOutput().size()); + assertInstanceOf(TextBlock.class, toolResultBlock.getOutput().get(0)); + assertEquals( + TEST_MULTI_MODAL_CONTENT, + ((TextBlock) toolResultBlock.getOutput().get(0)).getText()); + }) + .verifyComplete(); + + mockMediaUtils.close(); + mockConv.close(); + } + + @Test + @DisplayName("Video to text with base64 data url") + void testVideoToTextWithBase64DataUrl() { + MockedConstruction mockConv = + mockConstruction( + MultiModalConversation.class, + (mock, context) -> { + MultiModalConversationResult mockResult = + 
mock(MultiModalConversationResult.class); + MultiModalConversationOutput mockOutput = + mock(MultiModalConversationOutput.class); + + when(mockResult.getOutput()).thenReturn(mockOutput); + MultiModalConversationOutput.Choice choice = + new MultiModalConversationOutput.Choice(); + choice.setMessage( + MultiModalMessage.builder() + .content( + List.of( + Map.of( + "text", + TEST_MULTI_MODAL_CONTENT))) + .build()); + choice.setFinishReason("stop"); + when(mockOutput.getChoices()).thenReturn(List.of(choice)); + when(mock.call(any(MultiModalConversationParam.class))) + .thenReturn(mockResult); + }); + + Mono result = + multiModalTool.dashscopeVideoToText( + TEST_VIDEO_BASE64_DATA_URL, VIDEO_TO_TEXT_PROMPT, "qwen3.5-plus", 2.0); + + StepVerifier.create(result) + .assertNext( + toolResultBlock -> { + assertNotNull(toolResultBlock); + assertEquals(1, toolResultBlock.getOutput().size()); + assertInstanceOf(TextBlock.class, toolResultBlock.getOutput().get(0)); + assertEquals( + TEST_MULTI_MODAL_CONTENT, + ((TextBlock) toolResultBlock.getOutput().get(0)).getText()); + }) + .verifyComplete(); + + mockConv.close(); + } + + @Test + @DisplayName("Should return error TextBlock when call video to text response empty") + void testVideoToTextResponseEmpty() { + MockedConstruction mockConv = + mockConstruction( + MultiModalConversation.class, + (mock, context) -> { + MultiModalConversationResult mockResult = + mock(MultiModalConversationResult.class); + MultiModalConversationOutput mockOutput = + mock(MultiModalConversationOutput.class); + + when(mockResult.getOutput()).thenReturn(mockOutput); + MultiModalConversationOutput.Choice choice = + new MultiModalConversationOutput.Choice(); + choice.setMessage( + MultiModalMessage.builder().content(List.of()).build()); + choice.setFinishReason("stop"); + when(mockOutput.getChoices()).thenReturn(List.of(choice)); + when(mock.call(any(MultiModalConversationParam.class))) + .thenReturn(mockResult); + }); + + Mono result = + 
multiModalTool.dashscopeVideoToText( + TEST_VIDEO_URL, VIDEO_TO_TEXT_PROMPT, "qwen3.5-plus", 2.0); + + StepVerifier.create(result) + .assertNext( + toolResultBlock -> { + assertNotNull(toolResultBlock); + assertEquals(1, toolResultBlock.getOutput().size()); + assertInstanceOf(TextBlock.class, toolResultBlock.getOutput().get(0)); + assertEquals( + String.format("Error: %s", "Failed to analyze video."), + ((TextBlock) toolResultBlock.getOutput().get(0)).getText()); + }) + .verifyComplete(); + + mockConv.close(); + } + + @Test + @DisplayName("Should return error TextBlock when call video to text response null") + void testVideoToTextResponseNull() { + MockedConstruction mockConv = + mockConstruction( + MultiModalConversation.class, + (mock, context) -> { + MultiModalConversationResult mockResult = + mock(MultiModalConversationResult.class); + MultiModalConversationOutput mockOutput = + mock(MultiModalConversationOutput.class); + + when(mockResult.getOutput()).thenReturn(mockOutput); + when(mockOutput.getChoices()).thenReturn(null); + when(mock.call(any(MultiModalConversationParam.class))) + .thenReturn(mockResult); + }); + + Mono result = + multiModalTool.dashscopeVideoToText( + TEST_VIDEO_URL, VIDEO_TO_TEXT_PROMPT, "qwen3.5-plus", 2.0); + + StepVerifier.create(result) + .assertNext( + toolResultBlock -> { + assertNotNull(toolResultBlock); + assertEquals(1, toolResultBlock.getOutput().size()); + assertInstanceOf(TextBlock.class, toolResultBlock.getOutput().get(0)); + assertEquals( + String.format("Error: %s", "Failed to analyze video."), + ((TextBlock) toolResultBlock.getOutput().get(0)).getText()); + }) + .verifyComplete(); + + mockConv.close(); + } + + @Test + @DisplayName("Should return error TextBlock when call video to text occurs error") + void testVideoToTextError() { + MockedConstruction mockConv = + mockConstruction( + MultiModalConversation.class, + (mock, context) -> + when(mock.call(any(MultiModalConversationParam.class))) + .thenThrow(TEST_ERROR)); + + 
Mono result = + multiModalTool.dashscopeVideoToText( + TEST_VIDEO_URL, VIDEO_TO_TEXT_PROMPT, "qwen3.5-plus", 2.0); + + StepVerifier.create(result) + .assertNext( + toolResultBlock -> { + assertNotNull(toolResultBlock); + assertEquals(1, toolResultBlock.getOutput().size()); + assertInstanceOf(TextBlock.class, toolResultBlock.getOutput().get(0)); + assertEquals( + String.format("Error: %s", TEST_ERROR.getMessage()), + ((TextBlock) toolResultBlock.getOutput().get(0)).getText()); + }) + .verifyComplete(); + + mockConv.close(); + } } diff --git a/agentscope-core/src/test/resources/test_video.mp4 b/agentscope-core/src/test/resources/test_video.mp4 new file mode 100644 index 000000000..19fc876e5 Binary files /dev/null and b/agentscope-core/src/test/resources/test_video.mp4 differ diff --git a/agentscope-examples/quickstart/src/main/java/io/agentscope/examples/quickstart/MultiModalToolExample.java b/agentscope-examples/quickstart/src/main/java/io/agentscope/examples/quickstart/MultiModalToolExample.java new file mode 100644 index 000000000..e3ead77be --- /dev/null +++ b/agentscope-examples/quickstart/src/main/java/io/agentscope/examples/quickstart/MultiModalToolExample.java @@ -0,0 +1,197 @@ +/* + * Copyright 2024-2026 the original author or authors. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ +package io.agentscope.examples.quickstart; + +import io.agentscope.core.ReActAgent; +import io.agentscope.core.formatter.dashscope.DashScopeChatFormatter; +import io.agentscope.core.hook.Hook; +import io.agentscope.core.hook.HookEvent; +import io.agentscope.core.hook.PostActingEvent; +import io.agentscope.core.hook.PreActingEvent; +import io.agentscope.core.memory.InMemoryMemory; +import io.agentscope.core.message.AudioBlock; +import io.agentscope.core.message.Base64Source; +import io.agentscope.core.message.ContentBlock; +import io.agentscope.core.message.ImageBlock; +import io.agentscope.core.message.Source; +import io.agentscope.core.message.TextBlock; +import io.agentscope.core.message.ToolResultBlock; +import io.agentscope.core.message.URLSource; +import io.agentscope.core.message.VideoBlock; +import io.agentscope.core.model.DashScopeChatModel; +import io.agentscope.core.tool.Toolkit; +import io.agentscope.core.tool.multimodal.DashScopeMultiModalTool; +import java.util.List; +import reactor.core.publisher.Mono; + +/** + * MultiModalToolExample - Demonstrates how to equip an Agent with multimodal tools. + */ +public class MultiModalToolExample { + + public static void main(String[] args) throws Exception { + // Print welcome message + ExampleUtils.printWelcome( + "MultiModal Tool Calling Example", + "This example demonstrates how to equip an Agent with multimodal tools.\n" + + "The agent has image, audio and video multimodal tools."); + + // Get API key + String apiKey = ExampleUtils.getDashScopeApiKey(); + + // Create and register tools + Toolkit toolkit = new Toolkit(); + toolkit.registerTool(new DashScopeMultiModalTool(apiKey)); + printRegisterTools(); + + // Create Agent with tools + ReActAgent agent = + ReActAgent.builder() + .name("MultiModalToolAgent") + .sysPrompt( + "You are a helpful assistant with access to multimodal" + + " tools. Use tools when needed to answer questions" + + " accurately. 
Always explain what you're doing when using" + + " tools.") + .model( + DashScopeChatModel.builder() + .apiKey(apiKey) + .modelName("qwen-plus") + .stream(true) + .enableThinking(false) + .formatter(new DashScopeChatFormatter()) + .build()) + .hook(new ToolCallLoggingHook()) + .toolkit(toolkit) + .memory(new InMemoryMemory()) + .build(); + + printExamplePrompts(); + + ExampleUtils.startChat(agent); + } + + private static void printRegisterTools() { + String registeredTools = + """ + Registered tools: + - dashscope_text_to_image: Generate image(s) based on the given text. + - dashscope_image_to_text: Generate text based on the given images. + - dashscope_text_to_audio: Convert the given text to audio. + - dashscope_audio_to_text: Convert the given audio to text. + - dashscope_text_to_video: Generate video based on the given text prompt. + - dashscope_image_to_video: Generate a video from a single input image and an optional text prompt. + - dashscope_first_and_last_frame_image_to_video: Generate video transitioning from a first frame to a last frame and an optional text prompt. + - dashscope_video_to_text: Analyze video and generate a text description or answer questions based on the video content. + """; + + System.out.println(registeredTools); + System.out.println("\n"); + } + + private static void printExamplePrompts() { + String examplePrompts = + """ + Example Prompts: + [dashscope_text_to_image]: + Generate a black dog image url. + [dashscope_image_to_text]: + Describe the image url of 'https://dashscope.oss-cn-beijing.aliyuncs.com/images/tiger.png'. + [dashscope_text_to_audio]: + Convert the texts of 'hello, qwen!' to audio url. + [dashscope_audio_to_text]: + Convert the audio url of 'https://dashscope.oss-cn-beijing.aliyuncs.com/samples/audio/paraformer/hello_world_male2.wav' to text. + [dashscope_text_to_video]: + Generate a smart cat is running in the moonlight video. 
+ [dashscope_image_to_video]: + Generate a video that a tiger is running in moonlight based on the image url of 'https://dashscope.oss-cn-beijing.aliyuncs.com/images/tiger.png'. + [dashscope_first_and_last_frame_image_to_video]: + Generate a video that a black kitten curiously looking at the sky based on the first frame image url of 'https://wanx.alicdn.com/material/20250318/first_frame.png' and the last frame image url of 'https://wanx.alicdn.com/material/20250318/last_frame.png'. + [dashscope_video_to_text]: + Describe the video url of 'https://help-static-aliyun-doc.aliyuncs.com/file-manage-files/zh-CN/20241115/cqqkru/1.mp4'. + """; + System.out.println(examplePrompts); + System.out.println("\n"); + } + + static class ToolCallLoggingHook implements Hook { + + @Override + public Mono onEvent(T event) { + if (event instanceof PreActingEvent preActing) { + System.out.println( + "\n[HOOK] PreActingEvent - Tool: " + + preActing.getToolUse().getName() + + ", Input: " + + preActing.getToolUse().getInput()); + + } else if (event instanceof PostActingEvent postActingEvent) { + ToolResultBlock toolResult = postActingEvent.getToolResult(); + List contentBlocks = toolResult.getOutput(); + if (contentBlocks != null && !contentBlocks.isEmpty()) { + for (ContentBlock cb : contentBlocks) { + if (cb instanceof ImageBlock ib) { + Source source = ib.getSource(); + if (source instanceof URLSource urlSource) { + System.out.println( + "\n[HOOK] PostActingEvent - Tool Result: \nImage URL: " + + urlSource.getUrl()); + } else if (source instanceof Base64Source base64Source) { + System.out.println( + "\n" + + "[HOOK] PostActingEvent - Tool Result: \n" + + "Image Base64 data: " + + base64Source.getData()); + } + } else if (cb instanceof AudioBlock ab) { + Source source = ab.getSource(); + if (source instanceof URLSource urlSource) { + System.out.println( + "\n[HOOK] PostActingEvent - Tool Result: \nAudio URL: " + + urlSource.getUrl()); + } else if (source instanceof Base64Source 
base64Source) { + System.out.println( + "\n" + + "[HOOK] PostActingEvent - Tool Result: \n" + + "Audio Base64 data: " + + base64Source.getData()); + } + } else if (cb instanceof VideoBlock vb) { + Source source = vb.getSource(); + if (source instanceof URLSource urlSource) { + System.out.println( + "\n[HOOK] PostActingEvent - Tool Result: \nVideo URL: " + + urlSource.getUrl()); + } else if (source instanceof Base64Source base64Source) { + System.out.println( + "\n" + + "[HOOK] PostActingEvent - Tool Result: \n" + + "Video Base64 data: " + + base64Source.getData()); + } + } else if (cb instanceof TextBlock tb) { + System.out.println( + "\n[HOOK] PostActingEvent - Tool Result: \nText: " + + tb.getText()); + } + } + System.out.println("\n"); + } + } + return Mono.just(event); + } + } +} diff --git a/docs/en/task/tool.md b/docs/en/task/tool.md index 213914490..70e2c2848 100644 --- a/docs/en/task/tool.md +++ b/docs/en/task/tool.md @@ -370,3 +370,9 @@ boolean isExternal = toolkit.isExternalTool("query_database"); // true ``` The call flow is the same as Tool Suspend: LLM calls → returns `TOOL_SUSPENDED` → external execution → provide result to resume. 
+ +## Complete Examples + +- **Tool Call Example**: [ToolCallingExample.java](https://github.com/agentscope-ai/agentscope-java/blob/main/agentscope-examples/quickstart/src/main/java/io/agentscope/examples/quickstart/ToolCallingExample.java) +- **Tool Group Example**: [ToolGroupExample.java](https://github.com/agentscope-ai/agentscope-java/blob/main/agentscope-examples/quickstart/src/main/java/io/agentscope/examples/quickstart/ToolGroupExample.java) +- **MultiModal Tool Example**: [MultiModalToolExample.java](https://github.com/agentscope-ai/agentscope-java/blob/main/agentscope-examples/quickstart/src/main/java/io/agentscope/examples/quickstart/MultiModalToolExample.java) \ No newline at end of file diff --git a/docs/zh/task/tool.md b/docs/zh/task/tool.md index 828abd370..fa3e57c5a 100644 --- a/docs/zh/task/tool.md +++ b/docs/zh/task/tool.md @@ -370,3 +370,9 @@ boolean isExternal = toolkit.isExternalTool("query_database"); // true ``` 调用流程与工具挂起相同:LLM 调用 → 返回 `TOOL_SUSPENDED` → 外部执行 → 提供结果恢复。 + +## 完整示例 + +- **工具调用示例**: [ToolCallingExample.java](https://github.com/agentscope-ai/agentscope-java/blob/main/agentscope-examples/quickstart/src/main/java/io/agentscope/examples/quickstart/ToolCallingExample.java) +- **工具组示例**: [ToolGroupExample.java](https://github.com/agentscope-ai/agentscope-java/blob/main/agentscope-examples/quickstart/src/main/java/io/agentscope/examples/quickstart/ToolGroupExample.java) +- **多模态工具示例**: [MultiModalToolExample.java](https://github.com/agentscope-ai/agentscope-java/blob/main/agentscope-examples/quickstart/src/main/java/io/agentscope/examples/quickstart/MultiModalToolExample.java) \ No newline at end of file