diff --git a/Runware/Runware-base.ts b/Runware/Runware-base.ts
index 6395f95..988e881 100644
--- a/Runware/Runware-base.ts
+++ b/Runware/Runware-base.ts
@@ -95,23 +95,13 @@ export class RunwareBase {
     this._timeoutDuration = timeoutDuration;
   }
 
-  private getResultUUID(result: any): string | undefined {
-    // Find the UUID for a given input type
-    // mediaUUID = generic input
-    // Others = specific input types
-    for (const key of ["mediaUUID", "imageUUID", "videoUUID"]) {
-      if (typeof result[key] === "string") return result[key];
-    }
-    return undefined;
-  }
-
   /**
    * Shared polling logic for async results.
    * @param taskUUID - The task UUID to poll for.
    * @param numberResults - Number of results expected.
    * @returns Promise resolving to array of results.
    */
-  private async pollForAsyncResults<T extends {mediaUUID?: string; imageUUID?: string; videoUUID?: string;}>({
+  private async pollForAsyncResults<T extends { status: string; taskUUID: string; }>({
     taskUUID,
     numberResults = 1,
   }: {
@@ -122,13 +112,12 @@ export class RunwareBase {
     await getIntervalAsyncWithPromise(
       async ({ resolve, reject }) => {
         try {
-          const results = await this.getResponse<T>({ taskUUID });
+          const response = await this.getResponse<T>({ taskUUID });
 
           // Add results to the collection
-          for (const result of results || []) {
-            const resultUUID = this.getResultUUID(result);
-            if (resultUUID) {
-              allResults.set(resultUUID, result);
+          for (const responseItem of response || []) {
+            if (responseItem.status === "success") {
+              allResults.set(responseItem.taskUUID, responseItem);
             }
           }
 
@@ -799,83 +788,60 @@ export class RunwareBase {
 
   requestImageToText = async ({
     inputImage,
+    inputs,
     includeCost,
     customTaskUUID,
     taskUUID: _taskUUID,
     retry,
     includePayload,
     includeGenerationTime,
+    deliveryMethod,
+    skipResponse,
+    model,
   }: IRequestImageToText): Promise<IImageToText> => {
-    const totalRetry = retry || this._globalMaxRetries;
-    let lis: any = undefined;
-
-    const startTime = Date.now();
-
     try {
-      return await asyncRetry(
-        async () => {
-          await this.ensureConnection();
-          const imageUploaded = inputImage
-            ? await this.uploadImage(inputImage as File | string)
-            : null;
-
-          const taskUUID = _taskUUID || customTaskUUID || getUUID();
-
-          const payload = {
-            taskUUID,
-            taskType: ETaskType.IMAGE_CAPTION,
-            inputImage: imageUploaded?.imageUUID,
-            ...evaluateNonTrue({ key: "includeCost", value: includeCost }),
-          };
-
-          this.send(payload);
-
-          lis = this.globalListener({
-            taskUUID,
-          });
-
-          const response = await getIntervalWithPromise(
-            ({ resolve, reject }) => {
-              const newReverseClip = this.getSingleMessage({
-                taskUUID,
-              });
+      let imageUploaded;
 
-              if (!newReverseClip) return;
+      // TODO: Add support for handling all media uploads from inputs object
+      // This is legacy support for inputImage only
+      if (inputImage) {
+        imageUploaded = await this.uploadImage(inputImage as File | string);
+      }
 
-              if (newReverseClip?.error) {
-                reject(newReverseClip);
-                return true;
-              }
+      const taskUUID = _taskUUID || customTaskUUID || getUUID();
+      const payload = {
+        taskUUID,
+        taskType: ETaskType.CAPTION,
+        model,
+        inputImage: imageUploaded?.imageUUID,
+        inputs,
+        ...evaluateNonTrue({ key: "includeCost", value: includeCost }),
+        retry,
+        includePayload,
+        includeGenerationTime,
+      };
 
-              if (newReverseClip) {
-                delete this._globalMessages[taskUUID];
-                resolve(newReverseClip);
-                return true;
-              }
-            },
-            {
-              debugKey: "remove-image-background",
-              timeoutDuration: this._timeoutDuration,
-            }
-          );
+      const request = await this.baseSingleRequest<IImageToText>({
+        payload: {
+          ...payload,
+          taskType: ETaskType.CAPTION,
+        },
+        debugKey: "caption",
+      });
 
-          lis.destroy();
+      if (skipResponse) {
+        return request;
+      }
 
-          this.insertAdditionalResponse({
-            response: response,
-            payload: includePayload ? payload : undefined,
-            startTime: includeGenerationTime ? startTime : undefined,
-          });
+      if (deliveryMethod === "async") {
+        const taskUUID = request?.taskUUID;
+        const results = await this.pollForAsyncResults<IImageToText>({
+          taskUUID,
+        });
+        return results[0];
+      }
 
-          return response as IImageToText;
-        },
-        {
-          maxRetries: totalRetry,
-          callback: () => {
-            lis?.destroy();
-          },
-        }
-      );
+      return request;
     } catch (e) {
       throw e;
     }
diff --git a/Runware/types.ts b/Runware/types.ts
index 61ca810..d56bbc5 100644
--- a/Runware/types.ts
+++ b/Runware/types.ts
@@ -14,9 +14,9 @@ export enum ETaskType {
   UPSCALE = "upscale",
   REMOVE_BACKGROUND = "removeBackground",
   VIDEO_INFERENCE = "videoInference",
+  CAPTION = "caption",
   GET_RESPONSE = "getResponse",
   PHOTO_MAKER = "photoMaker",
-  IMAGE_CAPTION = "imageCaption",
   IMAGE_CONTROL_NET_PRE_PROCESS = "imageControlNetPreProcess",
   IMAGE_MASKING = "imageMasking",
   PROMPT_ENHANCE = "promptEnhance",
@@ -49,6 +49,7 @@ export interface IImage {
   imageUUID?: string;
   inputImageUUID?: string;
   taskUUID: string;
+  status: string;
   imageURL?: string;
   imageBase64Data?: string;
   imageDataURI?: string;
@@ -251,15 +252,25 @@ export interface IRefiner {
   startStepPercentage?: number;
 }
 export interface IRequestImageToText extends IAdditionalResponsePayload {
+  model?: string;
   inputImage?: File | string;
+  inputs?: {
+    video?: InputsValue;
+  } & {
+    [key: string]: unknown;
+  };
   includeCost?: boolean;
   customTaskUUID?: string;
   taskUUID?: string;
   retry?: number;
+
+  deliveryMethod?: string;
+  skipResponse?: boolean;
 }
 export interface IImageToText {
   taskType: ETaskType;
   taskUUID: string;
+  status: string;
   text: string;
   cost?: number;
 }
@@ -333,6 +344,7 @@ export interface IRequestVideo extends IRequestImageToText {
 
   [key: string]: any;
 }
+
 export interface IAsyncResults {
   taskUUID: string;
   onPartialImages?: (images: IImage[], error?: IError) => void;
@@ -341,6 +353,7 @@ export interface IAsyncResults {
 export interface IRemoveImage {
   taskType: ETaskType;
   taskUUID: string;
+  status: string;
   imageUUID?: string;
   mediaUUID?: string;
   mediaURL?: string;
@@ -604,6 +617,7 @@ export type TPhotoMaker = {
 export type TPhotoMakerResponse = {
   taskType: string;
   taskUUID: string;
+  status: string;
   imageUUID: string;
   NSFWContent: boolean;
   cost: number;
diff --git a/package.json b/package.json
index 9dfa0f9..51a427c 100644
--- a/package.json
+++ b/package.json
@@ -1,6 +1,6 @@
 {
   "name": "@runware/sdk-js",
-  "version": "1.2.0",
+  "version": "1.2.1-beta.2",
   "description": "The SDK is used to run image inference with the Runware API, powered by the RunWare inference platform. It can be used to generate imaged with text-to-image and image-to-image. It also allows the use of an existing gallery of models or selecting any model or LoRA from the CivitAI gallery. The API also supports upscaling, background removal, inpainting and outpainting, and a series of other ControlNet models.",
   "main": "dist/index.js",
   "module": "dist/index.js",
diff --git a/readme.md b/readme.md
index bea180d..d682776 100644
--- a/readme.md
+++ b/readme.md
@@ -431,6 +431,32 @@ return interface IControlNetImage {
 
 &nbsp;
 
+### Request Caption
+
+[Read Documentation](https://docs.runware.ai/en/utilities/caption)
+
+```js
+const runware = new Runware({ apiKey: "API_KEY" });
+const caption = await runware.caption({
+	model: "memories:1@1",
+	inputs: {
+		video: "https://example.com/video.mp4"
+	}
+});
+
+console.log(caption)
+
+return interface IImageToText {
+  taskType: ETaskType;
+  taskUUID: string;
+  status: string;
+  text: string;
+  cost?: number;
+}
+```
+
+&nbsp;
+
 ### Model Upload
 
 [Read Documentation](https://docs.runware.ai/en/image-inference/model-upload)
@@ -679,6 +705,10 @@ export type TImageMaskingResponse = {
 
 ## Changelog
 
+### - v1.2.1
+
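+- Added `caption` task type (replaces `imageCaption`); `requestImageToText` now also accepts `model`, `inputs`, `deliveryMethod` and `skipResponse`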
+
 ### - v1.2.0
 
 - Change removeImageBackground taskType from `removeImageBackground` to `removeBackground` -- removeBackground is compatible with removeImageBackground but it also supports other media inputs such as removing backgrounds from videos