From f910cf3955732994e1bcf055df08973bd358ca6f Mon Sep 17 00:00:00 2001 From: Landen Campbell Date: Thu, 9 Apr 2026 21:57:02 -0500 Subject: [PATCH 1/7] Added some new metrics Added the requested metrics besides the tornadovm-related ones and model load duration; they reference other scripts. --- .../gpullama3/auxiliary/LastRunMetrics.java | 19 +++++++++++-------- .../gpullama3/inference/InferenceEngine.java | 12 +++++++++--- 2 files changed, 20 insertions(+), 11 deletions(-) diff --git a/src/main/java/org/beehive/gpullama3/auxiliary/LastRunMetrics.java b/src/main/java/org/beehive/gpullama3/auxiliary/LastRunMetrics.java index 0d411801..a3f86855 100644 --- a/src/main/java/org/beehive/gpullama3/auxiliary/LastRunMetrics.java +++ b/src/main/java/org/beehive/gpullama3/auxiliary/LastRunMetrics.java @@ -3,9 +3,9 @@ /** * Record to store metrics from the last model run. * @param totalTokens The total number of tokens processed - * @param totalSeconds The total time in seconds + * @param totalNanos The total time in nanoseconds */ -public record LastRunMetrics(int totalTokens, double totalSeconds) { +public record LastRunMetrics(int totalTokens, long totalNanos, int promptEvalCount, long promptNanos, int inferenceEvalCount, long inferenceNanos) { /** * Singleton instance to store the latest metrics */ @@ -14,11 +14,11 @@ public record LastRunMetrics(int totalTokens, double totalSeconds) { /** * Sets the metrics for the latest run * - * @param tokens The total number of tokens processed - * @param seconds The total time in seconds + * @param totalTokens The total number of tokens processed + * @param totalNanos The total time in nanoseconds */ - public static void setMetrics(int tokens, double seconds) { - latestMetrics = new LastRunMetrics(tokens, seconds); + public static void setMetrics(int totalTokens, long totalNanos, int promptEvalCount, long promptNanos, int inferenceEvalCount, long inferenceNanos) { + latestMetrics = new LastRunMetrics(totalTokens, totalNanos, promptEvalCount, promptNanos, inferenceEvalCount, inferenceNanos); } /** @@ -26,8 +26,11 @@ public static void setMetrics(int tokens, double seconds) { */ public static void printMetrics() { if (latestMetrics != null) { - double tokensPerSecond = latestMetrics.totalTokens() / latestMetrics.totalSeconds(); - System.err.printf("\n\nachieved tok/s: %.2f. Tokens: %d, seconds: %.2f\n", tokensPerSecond, latestMetrics.totalTokens(), latestMetrics.totalSeconds()); + double totalSeconds = latestMetrics.totalNanos() / 1_000_000_000.0; + double promptSeconds = latestMetrics.promptNanos() / 1_000_000_000.0; + double inferenceSeconds = latestMetrics.inferenceNanos() / 1_000_000_000.0; + double tokensPerSecond = latestMetrics.totalTokens() / totalSeconds; + System.err.printf("\n\nAchieved tok/s: %.2f. 
Total tokens: %d, Total time: %d ns (%.2f s)\nPrompt tokens: %d, Prompt time: %d ns (%.2f s)\nInference tokens: %d, Inference time: %d ns (%.2f s)\n", tokensPerSecond, latestMetrics.totalTokens(), latestMetrics.totalNanos(), totalSeconds, latestMetrics.promptEvalCount(), latestMetrics.promptNanos(), promptSeconds, latestMetrics.inferenceEvalCount(), latestMetrics.inferenceNanos(), inferenceSeconds); } } } diff --git a/src/main/java/org/beehive/gpullama3/inference/InferenceEngine.java b/src/main/java/org/beehive/gpullama3/inference/InferenceEngine.java index a9c65223..f5cdcab3 100644 --- a/src/main/java/org/beehive/gpullama3/inference/InferenceEngine.java +++ b/src/main/java/org/beehive/gpullama3/inference/InferenceEngine.java @@ -658,10 +658,16 @@ public static List generateTokensGPUGranite(Model model, State state, i } long endNanos = System.nanoTime(); - double totalTimeSeconds = (endNanos - startNanos) / 1_000_000_000.0; + if (inferenceStartNanos == 0) { + inferenceStartNanos = endNanos; // Prevents negative time if no tokens were generated + } int totalTokens = promptIndex + generatedTokens.size(); - - LastRunMetrics.setMetrics(totalTokens, totalTimeSeconds); + long totalNanos = (endNanos - startNanos); + int promptEvalCount = promptIndex; + long promptNanos = inferenceStartNanos - startNanos; + int inferenceEvalCount = generatedTokens.size(); + long inferenceNanos = endNanos - inferenceStartNanos; + LastRunMetrics.setMetrics(totalTokens, totalNanos, promptEvalCount, promptNanos, inferenceEvalCount, inferenceNanos); return generatedTokens; } From 805dce0824d807a849ee6f265473bacdee72fa63 Mon Sep 17 00:00:00 2001 From: Landen Campbell <120332787+LegL0ngly@users.noreply.github.com> Date: Fri, 10 Apr 2026 13:59:49 +0000 Subject: [PATCH 2/7] Added throughput metrics, updated parameter information --- .../beehive/gpullama3/auxiliary/LastRunMetrics.java | 13 +++++++++++-- 1 file changed, 11 insertions(+), 2 deletions(-) diff --git a/src/main/java/org/beehive/gpullama3/auxiliary/LastRunMetrics.java b/src/main/java/org/beehive/gpullama3/auxiliary/LastRunMetrics.java index a3f86855..4c61d605 100644 --- a/src/main/java/org/beehive/gpullama3/auxiliary/LastRunMetrics.java +++ b/src/main/java/org/beehive/gpullama3/auxiliary/LastRunMetrics.java @@ -4,6 +4,10 @@ * Record to store metrics from the last model run. 
* @param totalTokens The total number of tokens processed * @param totalNanos The total time in nanoseconds + * @param promptEvalCount Number of tokens in the prompt + * @param promptNanos Time to process the prompt in nanoseconds + * @param inferenceEvalCount Number of tokens in the model's response + * @param inferenceNanos Time to output the response in nanoseconds */ public record LastRunMetrics(int totalTokens, long totalNanos, int promptEvalCount, long promptNanos, int inferenceEvalCount, long inferenceNanos) { /** @@ -13,9 +17,12 @@ public record LastRunMetrics(int totalTokens, long totalNanos, int promptEvalCou /** * Sets the metrics for the latest run - * * @param totalTokens The total number of tokens processed * @param totalNanos The total time in nanoseconds + * @param promptEvalCount Number of tokens in the prompt + * @param promptNanos Time to process the prompt in nanoseconds + * @param inferenceEvalCount Number of tokens in the model's response + * @param inferenceNanos Time to output the response in nanoseconds */ public static void setMetrics(int totalTokens, long totalNanos, int promptEvalCount, long promptNanos, int inferenceEvalCount, long inferenceNanos) { latestMetrics = new LastRunMetrics(totalTokens, totalNanos, promptEvalCount, promptNanos, inferenceEvalCount, inferenceNanos); @@ -28,9 +35,11 @@ public static void printMetrics() { if (latestMetrics != null) { double totalSeconds = latestMetrics.totalNanos() / 1_000_000_000.0; double promptSeconds = latestMetrics.promptNanos() / 1_000_000_000.0; + double prefillThroughput = latestMetrics.promptEvalCount() / promptSeconds; double inferenceSeconds = latestMetrics.inferenceNanos() / 1_000_000_000.0; + double decodeThroughput = latestMetrics.inferenceEvalCount() / inferenceSeconds; double tokensPerSecond = latestMetrics.totalTokens() / totalSeconds; - System.err.printf("\n\nAchieved tok/s: %.2f. Total tokens: %d, Total time: %d ns (%.2f s)\nPrompt tokens: %d, Prompt time: %d ns (%.2f s)\nInference tokens: %d, Inference time: %d ns (%.2f s)\n", tokensPerSecond, latestMetrics.totalTokens(), latestMetrics.totalNanos(), totalSeconds, latestMetrics.promptEvalCount(), latestMetrics.promptNanos(), promptSeconds, latestMetrics.inferenceEvalCount(), latestMetrics.inferenceNanos(), inferenceSeconds); + System.err.printf("\n\nAchieved tok/s: %.2f. 
Total tokens: %d, Total time: %d ns (%.2f s)\nPrefill throughput: %.2f tok/s, Prompt tokens: %d, Prompt time: %d ns (%.2f s)\nDecode throughput: %.2f tok/s, Inference tokens: %d, Inference time: %d ns (%.2f s)\n", tokensPerSecond, latestMetrics.totalTokens(), latestMetrics.totalNanos(), totalSeconds, prefillThroughput, latestMetrics.promptEvalCount(), latestMetrics.promptNanos(), promptSeconds, decodeThroughput, latestMetrics.inferenceEvalCount(), latestMetrics.inferenceNanos(), inferenceSeconds); } } } From 89f6b6210f6257c93f4ded3c9e61690cf1f5385b Mon Sep 17 00:00:00 2001 From: Landen Campbell <120332787+LegL0ngly@users.noreply.github.com> Date: Fri, 10 Apr 2026 19:15:21 +0000 Subject: [PATCH 3/7] Added Model Load Time Metric --- .../gpullama3/auxiliary/LastRunMetrics.java | 13 ++++++++++ .../gpullama3/model/loader/ModelLoader.java | 24 +++++++++++++++++-- 2 files changed, 35 insertions(+), 2 deletions(-) diff --git a/src/main/java/org/beehive/gpullama3/auxiliary/LastRunMetrics.java b/src/main/java/org/beehive/gpullama3/auxiliary/LastRunMetrics.java index 4c61d605..b7c661c4 100644 --- a/src/main/java/org/beehive/gpullama3/auxiliary/LastRunMetrics.java +++ b/src/main/java/org/beehive/gpullama3/auxiliary/LastRunMetrics.java @@ -14,6 +14,7 @@ public record LastRunMetrics(int totalTokens, long totalNanos, int promptEvalCou * Singleton instance to store the latest metrics */ private static LastRunMetrics latestMetrics; + private static long loadDurationNanos = 0; /** * Sets the metrics for the latest run @@ -27,12 +28,23 @@ public record LastRunMetrics(int totalTokens, long totalNanos, int promptEvalCou public static void setMetrics(int totalTokens, long totalNanos, int promptEvalCount, long promptNanos, int inferenceEvalCount, long inferenceNanos) { latestMetrics = new LastRunMetrics(totalTokens, totalNanos, promptEvalCount, promptNanos, inferenceEvalCount, inferenceNanos); } + /** + * @param nanos Time it takes to load the model in nanoseconds; only changes when new model is loaded + */ + public static void setModelLoadDuration(long nanos){ + loadDurationNanos = nanos; + } /** * Prints the metrics from the latest run to stderr */ public static void printMetrics() { if (latestMetrics != null) { + // If statement to only print model load time once, because it will only load once when initialized. + if (loadDurationNanos > 0) { + System.err.printf("Model load time: %d ns (%.2f s)\n", loadDurationNanos, loadDurationNanos / 1_000_000_000.0); + loadDurationNanos = 0; + } double totalSeconds = latestMetrics.totalNanos() / 1_000_000_000.0; double promptSeconds = latestMetrics.promptNanos() / 1_000_000_000.0; double prefillThroughput = latestMetrics.promptEvalCount() / promptSeconds; @@ -40,6 +52,7 @@ public static void printMetrics() { double decodeThroughput = latestMetrics.inferenceEvalCount() / inferenceSeconds; double tokensPerSecond = latestMetrics.totalTokens() / totalSeconds; System.err.printf("\n\nAchieved tok/s: %.2f. 
Total tokens: %d, Total time: %d ns (%.2f s)\nPrefill throughput: %.2f tok/s, Prompt tokens: %d, Prompt time: %d ns (%.2f s)\nDecode throughput: %.2f tok/s, Inference tokens: %d, Inference time: %d ns (%.2f s)\n", tokensPerSecond, latestMetrics.totalTokens(), latestMetrics.totalNanos(), totalSeconds, prefillThroughput, latestMetrics.promptEvalCount(), latestMetrics.promptNanos(), promptSeconds, decodeThroughput, latestMetrics.inferenceEvalCount(), latestMetrics.inferenceNanos(), inferenceSeconds); + } } } diff --git a/src/main/java/org/beehive/gpullama3/model/loader/ModelLoader.java b/src/main/java/org/beehive/gpullama3/model/loader/ModelLoader.java index 392113be..d2dece8b 100644 --- a/src/main/java/org/beehive/gpullama3/model/loader/ModelLoader.java +++ b/src/main/java/org/beehive/gpullama3/model/loader/ModelLoader.java @@ -85,6 +85,9 @@ private static ModelType detectModelType(Map metadata) { * @throws IllegalStateException if AOT loading is enabled but the preloaded model is unavailable */ public static Model loadModel(Options options) throws IOException { + //Keep track of load time for performance metrics + long startLoadNanos = System.nanoTime(); + Path ggufPath = options.modelPath(); int contextLength = options.maxTokens(); boolean useTornadovm = options.useTornadovm(); @@ -94,19 +97,36 @@ public static Model loadModel(Options options) throws IOException { // detect model type ModelType modelType = detectModelType(gguf.getMetadata()); // model type-specific load - return modelType.loadModel(gguf.getFileChannel(), gguf, contextLength, useTornadovm); + Model loadedModel = modelType.loadModel(gguf.getFileChannel(), gguf, contextLength, useTornadovm); + + //Calculate load time and send to LastRunMetrics + long endLoadNanos = System.nanoTime(); + long loadNanos = (endLoadNanos - startLoadNanos); + LastRunMetrics.setModelLoadDuration(loadNanos); + + return loadedModel; } /** * For compatibility with langchain4j and quarkus. 
*/ public static Model loadModel(Path ggufPath, int contextLength, boolean loadWeights, boolean useTornadovm) throws IOException { + //Keep track of load time for performance metrics + long startLoadNanos = System.nanoTime(); + // initial load of metadata from gguf file GGUF gguf = GGUF.loadGGUFMetadata(ggufPath); // detect model type ModelType modelType = detectModelType(gguf.getMetadata()); // model type-specific load - return modelType.loadModel(gguf.getFileChannel(), gguf, contextLength, useTornadovm); + Model loadedModel = modelType.loadModel(gguf.getFileChannel(), gguf, contextLength, useTornadovm); + + //Calculate load time and send to LastRunMetrics + long endLoadNanos = System.nanoTime(); + long loadNanos = (endLoadNanos - startLoadNanos); + LastRunMetrics.setModelLoadDuration(loadNanos); + + return loadedModel; } /** From 45c5a266d24166427b827cfacb26e5aa8e4cc71c Mon Sep 17 00:00:00 2001 From: Landen Campbell <120332787+LegL0ngly@users.noreply.github.com> Date: Sat, 11 Apr 2026 20:05:40 +0000 Subject: [PATCH 4/7] Added remaining metrics (TornadoVM, Time to First Token) --- .../gpullama3/auxiliary/LastRunMetrics.java | 36 ++++++++++++++----- .../gpullama3/inference/InferenceEngine.java | 14 +++++++- .../tornadovm/TornadoVMMasterPlan.java | 35 ++++++++++++++---- 3 files changed, 70 insertions(+), 15 deletions(-) diff --git a/src/main/java/org/beehive/gpullama3/auxiliary/LastRunMetrics.java b/src/main/java/org/beehive/gpullama3/auxiliary/LastRunMetrics.java index b7c661c4..6bb822ca 100644 --- a/src/main/java/org/beehive/gpullama3/auxiliary/LastRunMetrics.java +++ b/src/main/java/org/beehive/gpullama3/auxiliary/LastRunMetrics.java @@ -8,13 +8,16 @@ * @param promptNanos Time to process the prompt in nanoseconds * @param inferenceEvalCount Number of tokens in the model's response * @param inferenceNanos Time to output the response in nanoseconds + * @param tornadoCompileNanos Time to compile the tornado task graph in nanoseconds + * @param tornadoWarmupNanos Time spent warming up until steady state in nanoseconds */ -public record LastRunMetrics(int totalTokens, long totalNanos, int promptEvalCount, long promptNanos, int inferenceEvalCount, long inferenceNanos) { +public record LastRunMetrics(int totalTokens, long totalNanos, int promptEvalCount, long promptNanos, int inferenceEvalCount, long inferenceNanos, long tornadoCompileNanos, long tornadoWarmupNanos) { /** * Singleton instance to store the latest metrics */ private static LastRunMetrics latestMetrics; private static long loadDurationNanos = 0; + private static boolean displayLoadDuration = true; /** * Sets the metrics for the latest run @@ -24,9 +27,11 @@ public record LastRunMetrics(int totalTokens, long totalNanos, int promptEvalCou * @param promptNanos Time to process the prompt in nanoseconds * @param inferenceEvalCount Number of tokens in the model's response * @param inferenceNanos Time to output the response in nanoseconds + * @param tornadoCompileNanos Time to compile the tornado task graph in nanoseconds + * @param tornadoWarmupNanos Time spent warming up until steady state in nanoseconds */ - public static void setMetrics(int totalTokens, long totalNanos, int promptEvalCount, long promptNanos, int inferenceEvalCount, long inferenceNanos) { - latestMetrics = new LastRunMetrics(totalTokens, totalNanos, promptEvalCount, promptNanos, inferenceEvalCount, inferenceNanos); + public static void setMetrics(int totalTokens, long totalNanos, int promptEvalCount, long promptNanos, int inferenceEvalCount, long inferenceNanos, long 
tornadoCompileNanos, long tornadoWarmupNanos) { + latestMetrics = new LastRunMetrics(totalTokens, totalNanos, promptEvalCount, promptNanos, inferenceEvalCount, inferenceNanos, tornadoCompileNanos, tornadoWarmupNanos); } /** * @param nanos Time it takes to load the model in nanoseconds; only changes when new model is loaded @@ -40,18 +45,33 @@ public static void setModelLoadDuration(long nanos){ */ public static void printMetrics() { if (latestMetrics != null) { - // If statement to only print model load time once, because it will only load once when initialized. - if (loadDurationNanos > 0) { - System.err.printf("Model load time: %d ns (%.2f s)\n", loadDurationNanos, loadDurationNanos / 1_000_000_000.0); - loadDurationNanos = 0; + + // If statement to only print metrics when they change, because they won't always change after every prompt, and it should keep things more organized. + if (displayLoadDuration) { + double loadSeconds = loadDurationNanos / 1_000_000_000.0; + System.err.printf("Model load time: %d ns (%.2f s)\n", loadDurationNanos, loadSeconds); + displayLoadDuration = false; + } + if (latestMetrics.compileNanos() > 0) { + double compileSeconds = latestMetrics.compileNanos() / 1_000_000_000.0; + System.err.printf("TornadoVM Compile Time: %d ns (%.2f s)\n", latestMetrics.compileNanos(), compileSeconds); } + if (latestMetrics.warmupNanos() > 0) { + double warmupSeconds = latestMetrics.warmupNanos() / 1_000_000_000.0; + System.err.printf("TornadoVM Warmup Time: %d ns (%.2f s)\n", latestMetrics.warmupNanos(), warmupSeconds); + } + + // Print metrics which WILL change for every prompt double totalSeconds = latestMetrics.totalNanos() / 1_000_000_000.0; + // Time to First Token = Load Time + Promt Eval time (which includes first decode step) + long ttftNanos = loadDurationNanos + latestMetrics.promptNanos(); + double ttftSeconds = ttftNanos / 1_000_000_000.0; double promptSeconds = latestMetrics.promptNanos() / 1_000_000_000.0; double prefillThroughput = latestMetrics.promptEvalCount() / promptSeconds; double inferenceSeconds = latestMetrics.inferenceNanos() / 1_000_000_000.0; double decodeThroughput = latestMetrics.inferenceEvalCount() / inferenceSeconds; double tokensPerSecond = latestMetrics.totalTokens() / totalSeconds; - System.err.printf("\n\nAchieved tok/s: %.2f. Total tokens: %d, Total time: %d ns (%.2f s)\nPrefill throughput: %.2f tok/s, Prompt tokens: %d, Prompt time: %d ns (%.2f s)\nDecode throughput: %.2f tok/s, Inference tokens: %d, Inference time: %d ns (%.2f s)\n", tokensPerSecond, latestMetrics.totalTokens(), latestMetrics.totalNanos(), totalSeconds, prefillThroughput, latestMetrics.promptEvalCount(), latestMetrics.promptNanos(), promptSeconds, decodeThroughput, latestMetrics.inferenceEvalCount(), latestMetrics.inferenceNanos(), inferenceSeconds); + System.err.printf("\n\nAchieved tok/s: %.2f. 
Total tokens: %d, Total time: %d ns (%.2f s), Time to first Token: %d ns (%.2f s)\nPrefill throughput: %.2f tok/s, Prompt tokens: %d, Prompt time: %d ns (%.2f s)\nDecode throughput: %.2f tok/s, Inference tokens: %d, Inference time: %d ns (%.2f s)\n", tokensPerSecond, latestMetrics.totalTokens(), latestMetrics.totalNanos(), totalSeconds, ttftNanos, ttftSeconds, prefillThroughput, latestMetrics.promptEvalCount(), latestMetrics.promptNanos(), promptSeconds, decodeThroughput, latestMetrics.inferenceEvalCount(), latestMetrics.inferenceNanos(), inferenceSeconds); } } diff --git a/src/main/java/org/beehive/gpullama3/inference/InferenceEngine.java b/src/main/java/org/beehive/gpullama3/inference/InferenceEngine.java index f5cdcab3..f76fd539 100644 --- a/src/main/java/org/beehive/gpullama3/inference/InferenceEngine.java +++ b/src/main/java/org/beehive/gpullama3/inference/InferenceEngine.java @@ -667,7 +667,19 @@ public static List generateTokensGPUGranite(Model model, State state, i long promptNanos = inferenceStartNanos - startNanos; int inferenceEvalCount = generatedTokens.size(); long inferenceNanos = endNanos - inferenceStartNanos; - LastRunMetrics.setMetrics(totalTokens, totalNanos, promptEvalCount, promptNanos, inferenceEvalCount, inferenceNanos); + long tornadoCompileNanos = 0; + long tornadoWarmupNanos = 0; + // If statement to prevent inadvertent crashes from future features + if (tornadoVMMasterPlan != null) { + tornadoCompileNanos = tornadoVMMasterPlan.getCompileDurationNanos(); + tornadoWarmupNanos = tornadoVMMasterPlan.getWarmupDurationNanos(); + // Reset values so they are only output if they are changed in tornadoVMMasterPlan.java + tornadoVMMasterPlan.setCompileDurationNanos(0); + tornadoVMMasterPlan.setWarmupDurationNanos(0 + + ); + } + LastRunMetrics.setMetrics(totalTokens, totalNanos, promptEvalCount, promptNanos, inferenceEvalCount, inferenceNanos, tornadoCompileNanos, tornadoWarmupNanos); return generatedTokens; } diff --git a/src/main/java/org/beehive/gpullama3/tornadovm/TornadoVMMasterPlan.java b/src/main/java/org/beehive/gpullama3/tornadovm/TornadoVMMasterPlan.java index a42dc310..8a958cde 100644 --- a/src/main/java/org/beehive/gpullama3/tornadovm/TornadoVMMasterPlan.java +++ b/src/main/java/org/beehive/gpullama3/tornadovm/TornadoVMMasterPlan.java @@ -17,6 +17,24 @@ public class TornadoVMMasterPlan { private final Configuration config; public TornadoExecutionPlan executionPlan; GenericLayerPlanner tornadoVMLayerPlanner; + + // Performance variables and their corresponding methods to be accessed from LastRunMetrics + private long compileDurationNanos = 0; + private long warmupDurationNanos = 0; + + public long getCompileDurationNanos() { + return compileDurationNanos; + } + public void setCompileDurationNanos(long nanos) { + this.compileDurationNanos = nanos; + } + + public long getWarmupDurationNanos() { + return warmupDurationNanos; + } + public void setWarmupDurationNanos(long nanos) { + this.warmupDurationNanos = nanos; + } public TornadoVMMasterPlan(State state, Model model) { this.tornadoVMLayerPlanner = createPlanner(state, model); @@ -36,10 +54,8 @@ public TornadoVMMasterPlan(State state, Model model) { * @return The initialized TornadoVMMasterPlan ready for inference */ public static TornadoVMMasterPlan initializeTornadoVMPlan(State state, Model model) { - // Initialize timing variables outside conditional blocks to avoid scope issues + // Record Start Time for Performance stats long startTime = System.nanoTime(); - long planCreationTime = 0; - long warmupTime = 0; // 
Start a timing message if enabled if (ENABLE_TORNADOVM_INIT_TIME) { @@ -49,27 +65,34 @@ public static TornadoVMMasterPlan initializeTornadoVMPlan(State state, Model mod // 1. Pre-allocate the TornadoVM plan TornadoVMMasterPlan tornadoVMPlan = new TornadoVMMasterPlan(state, model); + // Calculate and Set Compile Time + long planCreationTime = System.nanoTime(); + tornadoVMPlan.setCompileDurationNanos(planCreationTime - startTime); + // Record time after plan creation if (ENABLE_TORNADOVM_INIT_TIME) { - planCreationTime = System.nanoTime(); System.err.printf("TornadoVM GPU execution plan creation: %.2f ms\n", (planCreationTime - startTime) / 1_000_000.0); } // 2. Perform warmup with extra iterations to ensure JIT compilation is complete tornadoVMPlan.executionPlan.withPreCompilation(); // Force JIT compilation from Java to GPU code + long warmupTime = System.nanoTime(); + // Record time after warmup if (ENABLE_TORNADOVM_INIT_TIME) { - warmupTime = System.nanoTime(); System.err.printf("Java to GPU JIT compiler warmup: %.2f ms\n", (warmupTime - planCreationTime) / 1_000_000.0); } // 3. Perform copy-in of read-only weights and objects tornadoVMPlan.forceCopyInReadOnlyDataLayered(); // Force copy-in read-only weights + + // Calculate and Set Total Warmup Time + long copyTime = System.nanoTime(); + tornadoVMPlan.setWarmupDurationNanos(copyTime - planCreationTime); // Record final timing information if (ENABLE_TORNADOVM_INIT_TIME) { - long copyTime = System.nanoTime(); System.err.printf("Transfer read-only weights to GPU: %.2f ms\n", (copyTime - warmupTime) / 1_000_000.0); System.err.printf("Finished TornadoVM initialization...\n \n"); } From 96adf3b36bc7b3f5d08eb8af13b18d06cd79bb37 Mon Sep 17 00:00:00 2001 From: Landen Campbell <120332787+LegL0ngly@users.noreply.github.com> Date: Sat, 11 Apr 2026 20:42:36 +0000 Subject: [PATCH 5/7] All metrics finalized, documentation cleaned up, all models now send metrics --- .../gpullama3/auxiliary/LastRunMetrics.java | 11 +- .../gpullama3/inference/InferenceEngine.java | 137 ++++++++++++++---- .../tornadovm/TornadoVMMasterPlan.java | 2 +- 3 files changed, 117 insertions(+), 33 deletions(-) diff --git a/src/main/java/org/beehive/gpullama3/auxiliary/LastRunMetrics.java b/src/main/java/org/beehive/gpullama3/auxiliary/LastRunMetrics.java index 6bb822ca..6dd4bd38 100644 --- a/src/main/java/org/beehive/gpullama3/auxiliary/LastRunMetrics.java +++ b/src/main/java/org/beehive/gpullama3/auxiliary/LastRunMetrics.java @@ -16,8 +16,8 @@ public record LastRunMetrics(int totalTokens, long totalNanos, int promptEvalCou * Singleton instance to store the latest metrics */ private static LastRunMetrics latestMetrics; + // Load duration is static because it only occurs once private static long loadDurationNanos = 0; - private static boolean displayLoadDuration = true; /** * Sets the metrics for the latest run @@ -46,11 +46,10 @@ public static void setModelLoadDuration(long nanos){ public static void printMetrics() { if (latestMetrics != null) { - // If statement to only print metrics when they change, because they won't always change after every prompt, and it should keep things more organized. 
- if (displayLoadDuration) { + // If statements to only print model load once, and only print TornadoVM metrics if using GPU + if (loadDurationNanos > 0) { double loadSeconds = loadDurationNanos / 1_000_000_000.0; System.err.printf("Model load time: %d ns (%.2f s)\n", loadDurationNanos, loadSeconds); - displayLoadDuration = false; } if (latestMetrics.compileNanos() > 0) { double compileSeconds = latestMetrics.compileNanos() / 1_000_000_000.0; @@ -61,9 +60,8 @@ public static void printMetrics() { System.err.printf("TornadoVM Warmup Time: %d ns (%.2f s)\n", latestMetrics.warmupNanos(), warmupSeconds); } - // Print metrics which WILL change for every prompt double totalSeconds = latestMetrics.totalNanos() / 1_000_000_000.0; - // Time to First Token = Load Time + Promt Eval time (which includes first decode step) + // Time to First Token = Load Time + Prompt Eval time (which includes first decode step) long ttftNanos = loadDurationNanos + latestMetrics.promptNanos(); double ttftSeconds = ttftNanos / 1_000_000_000.0; double promptSeconds = latestMetrics.promptNanos() / 1_000_000_000.0; @@ -73,6 +71,7 @@ public static void printMetrics() { double tokensPerSecond = latestMetrics.totalTokens() / totalSeconds; System.err.printf("\n\nAchieved tok/s: %.2f. Total tokens: %d, Total time: %d ns (%.2f s), Time to first Token: %d ns (%.2f s)\nPrefill throughput: %.2f tok/s, Prompt tokens: %d, Prompt time: %d ns (%.2f s)\nDecode throughput: %.2f tok/s, Inference tokens: %d, Inference time: %d ns (%.2f s)\n", tokensPerSecond, latestMetrics.totalTokens(), latestMetrics.totalNanos(), totalSeconds, ttftNanos, ttftSeconds, prefillThroughput, latestMetrics.promptEvalCount(), latestMetrics.promptNanos(), promptSeconds, decodeThroughput, latestMetrics.inferenceEvalCount(), latestMetrics.inferenceNanos(), inferenceSeconds); + loadDurationNanos = 0; } } } diff --git a/src/main/java/org/beehive/gpullama3/inference/InferenceEngine.java b/src/main/java/org/beehive/gpullama3/inference/InferenceEngine.java index f76fd539..e9f5dba7 100644 --- a/src/main/java/org/beehive/gpullama3/inference/InferenceEngine.java +++ b/src/main/java/org/beehive/gpullama3/inference/InferenceEngine.java @@ -132,10 +132,18 @@ public static List generateTokensLlama(Model model, State state, int st // Calculate and print performance metrics long endNanos = System.nanoTime(); - double totalTimeSeconds = (endNanos - startNanos) / 1_000_000_000.0; + if (inferenceStartNanos == 0) { + inferenceStartNanos = endNanos; // Prevents negative time if no tokens were generated + } int totalTokens = promptIndex + generatedTokens.size(); - - LastRunMetrics.setMetrics(totalTokens, totalTimeSeconds); + long totalNanos = (endNanos - startNanos); + int promptEvalCount = promptIndex; + long promptNanos = inferenceStartNanos - startNanos; + int inferenceEvalCount = generatedTokens.size(); + long inferenceNanos = endNanos - inferenceStartNanos; + long tornadoCompileNanos = 0; + long tornadoWarmupNanos = 0; + LastRunMetrics.setMetrics(totalTokens, totalNanos, promptEvalCount, promptNanos, inferenceEvalCount, inferenceNanos, tornadoCompileNanos, tornadoWarmupNanos); return generatedTokens; } @@ -213,10 +221,18 @@ public static List generateTokensQwen3(Model model, State state, int st // Calculate and print performance metrics long endNanos = System.nanoTime(); - double totalTimeSeconds = (endNanos - startNanos) / 1_000_000_000.0; + if (inferenceStartNanos == 0) { + inferenceStartNanos = endNanos; // Prevents negative time if no tokens were generated + } int totalTokens 
= promptIndex + generatedTokens.size(); - - LastRunMetrics.setMetrics(totalTokens, totalTimeSeconds); + long totalNanos = (endNanos - startNanos); + int promptEvalCount = promptIndex; + long promptNanos = inferenceStartNanos - startNanos; + int inferenceEvalCount = generatedTokens.size(); + long inferenceNanos = endNanos - inferenceStartNanos; + long tornadoCompileNanos = 0; + long tornadoWarmupNanos = 0; + LastRunMetrics.setMetrics(totalTokens, totalNanos, promptEvalCount, promptNanos, inferenceEvalCount, inferenceNanos, tornadoCompileNanos, tornadoWarmupNanos); return generatedTokens; } @@ -266,10 +282,18 @@ public static List generateTokensPhi3(Model model, State state, int sta // Calculate and print performance metrics long endNanos = System.nanoTime(); - double totalTimeSeconds = (endNanos - startNanos) / 1_000_000_000.0; + if (inferenceStartNanos == 0) { + inferenceStartNanos = endNanos; // Prevents negative time if no tokens were generated + } int totalTokens = promptIndex + generatedTokens.size(); - - LastRunMetrics.setMetrics(totalTokens, totalTimeSeconds); + long totalNanos = (endNanos - startNanos); + int promptEvalCount = promptIndex; + long promptNanos = inferenceStartNanos - startNanos; + int inferenceEvalCount = generatedTokens.size(); + long inferenceNanos = endNanos - inferenceStartNanos; + long tornadoCompileNanos = 0; + long tornadoWarmupNanos = 0; + LastRunMetrics.setMetrics(totalTokens, totalNanos, promptEvalCount, promptNanos, inferenceEvalCount, inferenceNanos, tornadoCompileNanos, tornadoWarmupNanos); return generatedTokens; @@ -354,13 +378,30 @@ public static List generateTokensGPULlama(Model model, State state, int pos++; } - // === Performance Metrics === + // Calculate and Print Performance Metrics long endNanos = System.nanoTime(); - double totalSeconds = (endNanos - startNanos) / 1_000_000_000.0; + if (inferenceStartNanos == 0) { + inferenceStartNanos = endNanos; // Prevents negative time if no tokens were generated + } int totalTokens = promptIndex + generatedTokens.size(); - - // Set metrics for tokens achieved - LastRunMetrics.setMetrics(totalTokens, totalSeconds); + long totalNanos = (endNanos - startNanos); + int promptEvalCount = promptIndex; + long promptNanos = inferenceStartNanos - startNanos; + int inferenceEvalCount = generatedTokens.size(); + long inferenceNanos = endNanos - inferenceStartNanos; + long tornadoCompileNanos = 0; + long tornadoWarmupNanos = 0; + // If statement to prevent inadvertent crashes from future features + if (tornadoVMMasterPlan != null) { + tornadoCompileNanos = tornadoVMMasterPlan.getCompileDurationNanos(); + tornadoWarmupNanos = tornadoVMMasterPlan.getWarmupDurationNanos(); + // Reset values so they are only output if they are changed in tornadoVMMasterPlan.java + tornadoVMMasterPlan.setCompileDurationNanos(0); + tornadoVMMasterPlan.setWarmupDurationNanos(0 + + ); + } + LastRunMetrics.setMetrics(totalTokens, totalNanos, promptEvalCount, promptNanos, inferenceEvalCount, inferenceNanos, tornadoCompileNanos, tornadoWarmupNanos); return generatedTokens; } @@ -449,10 +490,28 @@ public static List generateTokensGPUQwen3(Model model, State state, int // Calculate and print performance metrics long endNanos = System.nanoTime(); - double totalTimeSeconds = (endNanos - startNanos) / 1_000_000_000.0; + if (inferenceStartNanos == 0) { + inferenceStartNanos = endNanos; // Prevents negative time if no tokens were generated + } int totalTokens = promptIndex + generatedTokens.size(); - - LastRunMetrics.setMetrics(totalTokens, 
totalTimeSeconds); + long totalNanos = (endNanos - startNanos); + int promptEvalCount = promptIndex; + long promptNanos = inferenceStartNanos - startNanos; + int inferenceEvalCount = generatedTokens.size(); + long inferenceNanos = endNanos - inferenceStartNanos; + long tornadoCompileNanos = 0; + long tornadoWarmupNanos = 0; + // If statement to prevent inadvertent crashes from future features + if (tornadoVMMasterPlan != null) { + tornadoCompileNanos = tornadoVMMasterPlan.getCompileDurationNanos(); + tornadoWarmupNanos = tornadoVMMasterPlan.getWarmupDurationNanos(); + // Reset values so they are only output if they are changed in tornadoVMMasterPlan.java + tornadoVMMasterPlan.setCompileDurationNanos(0); + tornadoVMMasterPlan.setWarmupDurationNanos(0 + + ); + } + LastRunMetrics.setMetrics(totalTokens, totalNanos, promptEvalCount, promptNanos, inferenceEvalCount, inferenceNanos, tornadoCompileNanos, tornadoWarmupNanos); return generatedTokens; } @@ -524,10 +583,28 @@ public static List generateTokensGPUPhi3(Model model, State state, int // Calculate and print performance metrics long endNanos = System.nanoTime(); - double totalTimeSeconds = (endNanos - startNanos) / 1_000_000_000.0; + if (inferenceStartNanos == 0) { + inferenceStartNanos = endNanos; // Prevents negative time if no tokens were generated + } int totalTokens = promptIndex + generatedTokens.size(); - - LastRunMetrics.setMetrics(totalTokens, totalTimeSeconds); + long totalNanos = (endNanos - startNanos); + int promptEvalCount = promptIndex; + long promptNanos = inferenceStartNanos - startNanos; + int inferenceEvalCount = generatedTokens.size(); + long inferenceNanos = endNanos - inferenceStartNanos; + long tornadoCompileNanos = 0; + long tornadoWarmupNanos = 0; + // If statement to prevent inadvertent crashes from future features + if (tornadoVMMasterPlan != null) { + tornadoCompileNanos = tornadoVMMasterPlan.getCompileDurationNanos(); + tornadoWarmupNanos = tornadoVMMasterPlan.getWarmupDurationNanos(); + // Reset values so they are only output if they are changed in tornadoVMMasterPlan.java + tornadoVMMasterPlan.setCompileDurationNanos(0); + tornadoVMMasterPlan.setWarmupDurationNanos(0 + + ); + } + LastRunMetrics.setMetrics(totalTokens, totalNanos, promptEvalCount, promptNanos, inferenceEvalCount, inferenceNanos, tornadoCompileNanos, tornadoWarmupNanos); return generatedTokens; } @@ -590,11 +667,20 @@ public static List generateTokensGranite(Model model, State state, int pos++; } + // Calculate and Print Performance Metrics long endNanos = System.nanoTime(); - double totalTimeSeconds = (endNanos - startNanos) / 1_000_000_000.0; + if (inferenceStartNanos == 0) { + inferenceStartNanos = endNanos; // Prevents negative time if no tokens were generated + } int totalTokens = promptIndex + generatedTokens.size(); - - LastRunMetrics.setMetrics(totalTokens, totalTimeSeconds); + long totalNanos = (endNanos - startNanos); + int promptEvalCount = promptIndex; + long promptNanos = inferenceStartNanos - startNanos; + int inferenceEvalCount = generatedTokens.size(); + long inferenceNanos = endNanos - inferenceStartNanos; + long tornadoCompileNanos = 0; + long tornadoWarmupNanos = 0; + LastRunMetrics.setMetrics(totalTokens, totalNanos, promptEvalCount, promptNanos, inferenceEvalCount, inferenceNanos, tornadoCompileNanos, tornadoWarmupNanos); return generatedTokens; } @@ -657,6 +743,7 @@ public static List generateTokensGPUGranite(Model model, State state, i pos++; } + // Calculate and Print Performance Metrics long endNanos = System.nanoTime(); 
if (inferenceStartNanos == 0) { inferenceStartNanos = endNanos; // Prevents negative time if no tokens were generated @@ -675,9 +762,7 @@ public static List generateTokensGPUGranite(Model model, State state, i tornadoWarmupNanos = tornadoVMMasterPlan.getWarmupDurationNanos(); // Reset values so they are only output if they are changed in tornadoVMMasterPlan.java tornadoVMMasterPlan.setCompileDurationNanos(0); - tornadoVMMasterPlan.setWarmupDurationNanos(0 - - ); + tornadoVMMasterPlan.setWarmupDurationNanos(0); } LastRunMetrics.setMetrics(totalTokens, totalNanos, promptEvalCount, promptNanos, inferenceEvalCount, inferenceNanos, tornadoCompileNanos, tornadoWarmupNanos); diff --git a/src/main/java/org/beehive/gpullama3/tornadovm/TornadoVMMasterPlan.java b/src/main/java/org/beehive/gpullama3/tornadovm/TornadoVMMasterPlan.java index 8a958cde..0717e61b 100644 --- a/src/main/java/org/beehive/gpullama3/tornadovm/TornadoVMMasterPlan.java +++ b/src/main/java/org/beehive/gpullama3/tornadovm/TornadoVMMasterPlan.java @@ -54,7 +54,7 @@ public TornadoVMMasterPlan(State state, Model model) { * @return The initialized TornadoVMMasterPlan ready for inference */ public static TornadoVMMasterPlan initializeTornadoVMPlan(State state, Model model) { - // Record Start Time for Performance stats + // Record Start Time for Performance metrics long startTime = System.nanoTime(); // Start a timing message if enabled From 5ed313bfecb70a6d087d0efc75a4abbb9dcc7942 Mon Sep 17 00:00:00 2001 From: Landen Campbell <120332787+LegL0ngly@users.noreply.github.com> Date: Sat, 11 Apr 2026 21:24:26 +0000 Subject: [PATCH 6/7] Fixed typos, missing imports, and missing timing variable for CPU Phi3 --- .../gpullama3/auxiliary/LastRunMetrics.java | 12 ++++++------ .../gpullama3/inference/InferenceEngine.java | 16 +++++++--------- .../gpullama3/model/loader/ModelLoader.java | 1 + 3 files changed, 14 insertions(+), 15 deletions(-) diff --git a/src/main/java/org/beehive/gpullama3/auxiliary/LastRunMetrics.java b/src/main/java/org/beehive/gpullama3/auxiliary/LastRunMetrics.java index 6dd4bd38..562050ca 100644 --- a/src/main/java/org/beehive/gpullama3/auxiliary/LastRunMetrics.java +++ b/src/main/java/org/beehive/gpullama3/auxiliary/LastRunMetrics.java @@ -51,13 +51,13 @@ public static void printMetrics() { double loadSeconds = loadDurationNanos / 1_000_000_000.0; System.err.printf("Model load time: %d ns (%.2f s)\n", loadDurationNanos, loadSeconds); } - if (latestMetrics.compileNanos() > 0) { - double compileSeconds = latestMetrics.compileNanos() / 1_000_000_000.0; - System.err.printf("TornadoVM Compile Time: %d ns (%.2f s)\n", latestMetrics.compileNanos(), compileSeconds); + if (latestMetrics.tornadoCompileNanos() > 0) { + double compileSeconds = latestMetrics.tornadoCompileNanos() / 1_000_000_000.0; + System.err.printf("TornadoVM Compile Time: %d ns (%.2f s)\n", latestMetrics.tornadoCompileNanos(), compileSeconds); } - if (latestMetrics.warmupNanos() > 0) { - double warmupSeconds = latestMetrics.warmupNanos() / 1_000_000_000.0; - System.err.printf("TornadoVM Warmup Time: %d ns (%.2f s)\n", latestMetrics.warmupNanos(), warmupSeconds); + if (latestMetrics.tornadoWarmupNanos() > 0) { + double warmupSeconds = latestMetrics.tornadoWarmupNanos() / 1_000_000_000.0; + System.err.printf("TornadoVM Warmup Time: %d ns (%.2f s)\n", latestMetrics.tornadoWarmupNanos(), warmupSeconds); } double totalSeconds = latestMetrics.totalNanos() / 1_000_000_000.0; diff --git a/src/main/java/org/beehive/gpullama3/inference/InferenceEngine.java 
b/src/main/java/org/beehive/gpullama3/inference/InferenceEngine.java index e9f5dba7..06217e41 100644 --- a/src/main/java/org/beehive/gpullama3/inference/InferenceEngine.java +++ b/src/main/java/org/beehive/gpullama3/inference/InferenceEngine.java @@ -241,6 +241,7 @@ public static List generateTokensPhi3(Model model, State state, int sta IntConsumer onTokenGenerated) { long startNanos = System.nanoTime(); + long inferenceStartNanos = 0; if (maxTokens < 0 || model.configuration().contextLength() < maxTokens) { maxTokens = model.configuration().contextLength(); } @@ -261,6 +262,9 @@ public static List generateTokensPhi3(Model model, State state, int sta System.err.print(Tokenizer.replaceControlCharacters(model.tokenizer().decode(List.of(nextToken)))); } } else { + if (inferenceStartNanos == 0) { + inferenceStartNanos = System.nanoTime(); + } nextToken = sampler.sampleToken(state.logits); if (echo) { // log inferred token @@ -397,9 +401,7 @@ public static List generateTokensGPULlama(Model model, State state, int tornadoWarmupNanos = tornadoVMMasterPlan.getWarmupDurationNanos(); // Reset values so they are only output if they are changed in tornadoVMMasterPlan.java tornadoVMMasterPlan.setCompileDurationNanos(0); - tornadoVMMasterPlan.setWarmupDurationNanos(0 - - ); + tornadoVMMasterPlan.setWarmupDurationNanos(0); } LastRunMetrics.setMetrics(totalTokens, totalNanos, promptEvalCount, promptNanos, inferenceEvalCount, inferenceNanos, tornadoCompileNanos, tornadoWarmupNanos); @@ -507,9 +509,7 @@ public static List generateTokensGPUQwen3(Model model, State state, int tornadoWarmupNanos = tornadoVMMasterPlan.getWarmupDurationNanos(); // Reset values so they are only output if they are changed in tornadoVMMasterPlan.java tornadoVMMasterPlan.setCompileDurationNanos(0); - tornadoVMMasterPlan.setWarmupDurationNanos(0 - - ); + tornadoVMMasterPlan.setWarmupDurationNanos(0); } LastRunMetrics.setMetrics(totalTokens, totalNanos, promptEvalCount, promptNanos, inferenceEvalCount, inferenceNanos, tornadoCompileNanos, tornadoWarmupNanos); @@ -600,9 +600,7 @@ public static List generateTokensGPUPhi3(Model model, State state, int tornadoWarmupNanos = tornadoVMMasterPlan.getWarmupDurationNanos(); // Reset values so they are only output if they are changed in tornadoVMMasterPlan.java tornadoVMMasterPlan.setCompileDurationNanos(0); - tornadoVMMasterPlan.setWarmupDurationNanos(0 - - ); + tornadoVMMasterPlan.setWarmupDurationNanos(0); } LastRunMetrics.setMetrics(totalTokens, totalNanos, promptEvalCount, promptNanos, inferenceEvalCount, inferenceNanos, tornadoCompileNanos, tornadoWarmupNanos); diff --git a/src/main/java/org/beehive/gpullama3/model/loader/ModelLoader.java b/src/main/java/org/beehive/gpullama3/model/loader/ModelLoader.java index d2dece8b..c85bfe49 100644 --- a/src/main/java/org/beehive/gpullama3/model/loader/ModelLoader.java +++ b/src/main/java/org/beehive/gpullama3/model/loader/ModelLoader.java @@ -11,6 +11,7 @@ import org.beehive.gpullama3.tensor.tornado.FP32TornadoTensor; import org.beehive.gpullama3.tensor.tornado.Q8_0TornadoTensor; import org.beehive.gpullama3.tensor.tornado.TornadoTensor; +import org.beehive.gpullama3.auxiliary.LastRunMetrics; import uk.ac.manchester.tornado.api.types.HalfFloat; import uk.ac.manchester.tornado.api.types.arrays.*; From f62a6188874b21394af4de6c305d37ccb6974c84 Mon Sep 17 00:00:00 2001 From: Landen Campbell <120332787+LegL0ngly@users.noreply.github.com> Date: Sat, 11 Apr 2026 21:42:34 +0000 Subject: [PATCH 7/7] Fixed a variable inconsistent with the naming scheme which 
was causing errors --- .../gpullama3/inference/InferenceEngine.java | 48 +++++++++---------- 1 file changed, 24 insertions(+), 24 deletions(-) diff --git a/src/main/java/org/beehive/gpullama3/inference/InferenceEngine.java b/src/main/java/org/beehive/gpullama3/inference/InferenceEngine.java index 06217e41..9e254318 100644 --- a/src/main/java/org/beehive/gpullama3/inference/InferenceEngine.java +++ b/src/main/java/org/beehive/gpullama3/inference/InferenceEngine.java @@ -396,12 +396,12 @@ public static List generateTokensGPULlama(Model model, State state, int long tornadoCompileNanos = 0; long tornadoWarmupNanos = 0; // If statement to prevent inadvertent crashes from future features - if (tornadoVMMasterPlan != null) { - tornadoCompileNanos = tornadoVMMasterPlan.getCompileDurationNanos(); - tornadoWarmupNanos = tornadoVMMasterPlan.getWarmupDurationNanos(); + if (tornadoVMPlan != null) { + tornadoCompileNanos = tornadoVMPlan.getCompileDurationNanos(); + tornadoWarmupNanos = tornadoVMPlan.getWarmupDurationNanos(); // Reset values so they are only output if they are changed in tornadoVMMasterPlan.java - tornadoVMMasterPlan.setCompileDurationNanos(0); - tornadoVMMasterPlan.setWarmupDurationNanos(0); + tornadoVMPlan.setCompileDurationNanos(0); + tornadoVMPlan.setWarmupDurationNanos(0); } LastRunMetrics.setMetrics(totalTokens, totalNanos, promptEvalCount, promptNanos, inferenceEvalCount, inferenceNanos, tornadoCompileNanos, tornadoWarmupNanos); @@ -490,7 +490,7 @@ public static List generateTokensGPUQwen3(Model model, State state, int state.latestToken = currentToken = nextToken; } - // Calculate and print performance metrics + // Calculate and Print Performance Metrics long endNanos = System.nanoTime(); if (inferenceStartNanos == 0) { inferenceStartNanos = endNanos; // Prevents negative time if no tokens were generated @@ -504,12 +504,12 @@ public static List generateTokensGPUQwen3(Model model, State state, int long tornadoCompileNanos = 0; long tornadoWarmupNanos = 0; // If statement to prevent inadvertent crashes from future features - if (tornadoVMMasterPlan != null) { - tornadoCompileNanos = tornadoVMMasterPlan.getCompileDurationNanos(); - tornadoWarmupNanos = tornadoVMMasterPlan.getWarmupDurationNanos(); + if (tornadoVMPlan != null) { + tornadoCompileNanos = tornadoVMPlan.getCompileDurationNanos(); + tornadoWarmupNanos = tornadoVMPlan.getWarmupDurationNanos(); // Reset values so they are only output if they are changed in tornadoVMMasterPlan.java - tornadoVMMasterPlan.setCompileDurationNanos(0); - tornadoVMMasterPlan.setWarmupDurationNanos(0); + tornadoVMPlan.setCompileDurationNanos(0); + tornadoVMPlan.setWarmupDurationNanos(0); } LastRunMetrics.setMetrics(totalTokens, totalNanos, promptEvalCount, promptNanos, inferenceEvalCount, inferenceNanos, tornadoCompileNanos, tornadoWarmupNanos); @@ -581,7 +581,7 @@ public static List generateTokensGPUPhi3(Model model, State state, int pos++; } - // Calculate and print performance metrics + // Calculate and Print Performance Metrics long endNanos = System.nanoTime(); if (inferenceStartNanos == 0) { inferenceStartNanos = endNanos; // Prevents negative time if no tokens were generated @@ -595,12 +595,12 @@ public static List generateTokensGPUPhi3(Model model, State state, int long tornadoCompileNanos = 0; long tornadoWarmupNanos = 0; // If statement to prevent inadvertent crashes from future features - if (tornadoVMMasterPlan != null) { - tornadoCompileNanos = tornadoVMMasterPlan.getCompileDurationNanos(); - tornadoWarmupNanos = 
tornadoVMMasterPlan.getWarmupDurationNanos(); + if (tornadoVMPlan != null) { + tornadoCompileNanos = tornadoVMPlan.getCompileDurationNanos(); + tornadoWarmupNanos = tornadoVMPlan.getWarmupDurationNanos(); // Reset values so they are only output if they are changed in tornadoVMMasterPlan.java - tornadoVMMasterPlan.setCompileDurationNanos(0); - tornadoVMMasterPlan.setWarmupDurationNanos(0); + tornadoVMPlan.setCompileDurationNanos(0); + tornadoVMPlan.setWarmupDurationNanos(0); } LastRunMetrics.setMetrics(totalTokens, totalNanos, promptEvalCount, promptNanos, inferenceEvalCount, inferenceNanos, tornadoCompileNanos, tornadoWarmupNanos); @@ -689,7 +689,7 @@ public static List generateTokensGranite(Model model, State state, int */ public static List generateTokensGPUGranite(Model model, State state, int startPosition, List promptTokens, Set stopTokens, int maxTokens, Sampler sampler, boolean echo, - IntConsumer onTokenGenerated, TornadoVMMasterPlan tornadoVMMasterPlan) { + IntConsumer onTokenGenerated, TornadoVMMasterPlan tornadoVMPlan) { long startNanos = System.nanoTime(); long inferenceStartNanos = 0; @@ -707,7 +707,7 @@ public static List generateTokensGPUGranite(Model model, State state, i while (pos < maxTokens) { // Call TornadoVM forward pass (same as Llama for now) - logits = InferenceCore.forwardTornadoVM(model, state, currentToken, pos, tornadoVMMasterPlan); + logits = InferenceCore.forwardTornadoVM(model, state, currentToken, pos, tornadoVMPlan); if (promptIndex < promptTokens.size()) { nextToken = promptTokens.get(promptIndex++); @@ -755,12 +755,12 @@ public static List generateTokensGPUGranite(Model model, State state, i long tornadoCompileNanos = 0; long tornadoWarmupNanos = 0; // If statement to prevent inadvertent crashes from future features - if (tornadoVMMasterPlan != null) { - tornadoCompileNanos = tornadoVMMasterPlan.getCompileDurationNanos(); - tornadoWarmupNanos = tornadoVMMasterPlan.getWarmupDurationNanos(); + if (tornadoVMPlan != null) { + tornadoCompileNanos = tornadoVMPlan.getCompileDurationNanos(); + tornadoWarmupNanos = tornadoVMPlan.getWarmupDurationNanos(); // Reset values so they are only output if they are changed in tornadoVMMasterPlan.java - tornadoVMMasterPlan.setCompileDurationNanos(0); - tornadoVMMasterPlan.setWarmupDurationNanos(0); + tornadoVMPlan.setCompileDurationNanos(0); + tornadoVMPlan.setWarmupDurationNanos(0); } LastRunMetrics.setMetrics(totalTokens, totalNanos, promptEvalCount, promptNanos, inferenceEvalCount, inferenceNanos, tornadoCompileNanos, tornadoWarmupNanos);
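
For reference, a minimal usage sketch of the LastRunMetrics API as it stands after this series. The class name MetricsUsageSketch, the placeholder token counts, and the elided loading/generation steps are illustrative assumptions, not code from the patches; only the three static methods and their parameter order come from the diffs above.

import org.beehive.gpullama3.auxiliary.LastRunMetrics;

public class MetricsUsageSketch {
    public static void main(String[] args) {
        // Model load timing (done in ModelLoader.loadModel in the patches)
        long loadStart = System.nanoTime();
        // ... model loading would happen here ...
        LastRunMetrics.setModelLoadDuration(System.nanoTime() - loadStart);

        // Per-run timing (done in the InferenceEngine.generateTokens* methods)
        long startNanos = System.nanoTime();
        // ... prompt processing (prefill) would happen here ...
        long inferenceStartNanos = System.nanoTime();
        // ... token generation (decode) would happen here ...
        long endNanos = System.nanoTime();

        int promptEvalCount = 12;     // placeholder: tokens consumed from the prompt
        int inferenceEvalCount = 48;  // placeholder: tokens generated by the model

        LastRunMetrics.setMetrics(
                promptEvalCount + inferenceEvalCount, // totalTokens
                endNanos - startNanos,                // totalNanos
                promptEvalCount,
                inferenceStartNanos - startNanos,     // promptNanos
                inferenceEvalCount,
                endNanos - inferenceStartNanos,       // inferenceNanos
                0L,                                   // tornadoCompileNanos (CPU path)
                0L);                                  // tornadoWarmupNanos (CPU path)

        LastRunMetrics.printMetrics();                // writes the summary to stderr
    }
}

The CPU paths pass 0 for the TornadoVM durations, which matches the guards in printMetrics() that only report compile and warmup times when they are non-zero; the GPU paths instead pull those values from the TornadoVMMasterPlan getters and reset them so they are printed only once per plan initialization.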