Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
23 changes: 21 additions & 2 deletions llamaTornado
Original file line number Diff line number Diff line change
Expand Up @@ -12,7 +12,7 @@ record Config(
double temperature, double topP, long seed, int maxTokens,
boolean stream, boolean echo, boolean interactive, boolean instruct,
boolean useGpu, Backend backend, String gpuMemory,
String heapMin, String heapMax,
String heapMin, String heapMax, String directMemory,
boolean debug, boolean profiler, String profilerDumpDir,
boolean printBytecodes, boolean threads, boolean printKernel,
boolean fullDump, boolean verboseInit,
Expand All @@ -37,6 +37,7 @@ Config parseArgs(String[] args) {
String gpuMemory = "14GB";
String heapMin = "20g";
String heapMax = "20g";
String directMemory = null;
boolean debug = false;
boolean profiler = false;
String profilerDumpDir = null;
Expand Down Expand Up @@ -71,6 +72,7 @@ Config parseArgs(String[] args) {
case "--gpu-memory" -> gpuMemory = args[++i];
case "--heap-min" -> heapMin = args[++i];
case "--heap-max" -> heapMax = args[++i];
case "--direct-memory" -> directMemory = args[++i];
case "--debug" -> debug = true;
case "--profiler" -> profiler = true;
case "--profiler-dump-dir" -> profilerDumpDir = args[++i];
Expand Down Expand Up @@ -101,12 +103,27 @@ Config parseArgs(String[] args) {
profilerDumpDir = System.getenv("LLAMA_ROOT") + "/profiler-log.json";
}

// Default direct memory to 3x heap to accommodate K-quant dequantization
if (directMemory == null) {
directMemory = parseAndScale(heapMax, 3);
}

return new Config(modelPath, prompt, systemPrompt, temperature, topP, seed, maxTokens,
stream, echo, interactive, instruct, useGpu, backend, gpuMemory, heapMin, heapMax,
stream, echo, interactive, instruct, useGpu, backend, gpuMemory, heapMin, heapMax, directMemory,
debug, profiler, profilerDumpDir, printBytecodes, threads, printKernel, fullDump,
verboseInit, showCommand, executeAfterShow, openclFlags, maxWaitEvents, verbose);
}

/**
 * Scales a JVM-style memory size (e.g. "20g", "512M") by an integer multiplier,
 * preserving the unit suffix: parseAndScale("20g", 3) -> "60g". Used to derive
 * the default -XX:MaxDirectMemorySize from --heap-max.
 *
 * Accepts the full set of HotSpot size suffixes (k/m/g/t, case-insensitive) or
 * no suffix at all. Values that do not match this shape (e.g. "14GB") are
 * returned unchanged so the caller falls back to the user-supplied string.
 */
String parseAndScale(String memoryValue, int multiplier) {
    var matcher = java.util.regex.Pattern.compile("(\\d+)([kKmMgGtT]?)").matcher(memoryValue);
    if (matcher.matches()) {
        long value = Long.parseLong(matcher.group(1)) * multiplier;
        // group(2) is already "" when no suffix was given, so append it directly.
        return value + matcher.group(2);
    }
    return memoryValue;
}

void printUsage() {
IO.println("""
Usage: %s --model <path> [options]
Expand Down Expand Up @@ -138,6 +155,7 @@ void printUsage() {
--gpu-memory <val> GPU memory allocation (default: 14GB)
--heap-min <val> Min JVM heap (default: 20g)
--heap-max <val> Max JVM heap (default: 20g)
--direct-memory <val> Max direct buffer memory (default: 3x heap-max)

Debug:
--debug Enable debug output
Expand Down Expand Up @@ -195,6 +213,7 @@ List<String> buildCommand(Config cfg, String javaHome, String tornadoSdk, String
"-XX:+EnableJVMCI",
"-Xms" + cfg.heapMin(),
"-Xmx" + cfg.heapMax(),
"-XX:MaxDirectMemorySize=" + cfg.directMemory(),
"--enable-preview",
"-Djava.library.path=" + tornadoSdk + "/lib",
"-Djdk.module.showModuleResolution=false",
Expand Down
Original file line number Diff line number Diff line change
@@ -1,5 +1,6 @@
package org.beehive.gpullama3.model.loader;

import org.beehive.gpullama3.tensor.GGMLType;
import org.beehive.gpullama3.tensor.GGUF;
import org.beehive.gpullama3.tensor.GGMLTensorEntry;
import org.beehive.gpullama3.auxiliary.Pair;
Expand All @@ -8,6 +9,7 @@
import org.beehive.gpullama3.model.Model;
import org.beehive.gpullama3.tokenizer.Tokenizer;
import org.beehive.gpullama3.tokenizer.Vocabulary;
import org.beehive.gpullama3.tornadovm.TornadoVMMasterPlan;

import java.io.IOException;
import java.nio.channels.FileChannel;
Expand Down Expand Up @@ -40,10 +42,39 @@ protected String getModelQuantization(Map<String, Object> metadata) {
return switch (modelQuantizationAsInt) {
case 1 -> "FP16";
case 7 -> "Q8_0";
case 14, 15 -> "Q8_0"; // Q4_K_S, Q4_K_M (K-quants use Q8_0 activations)
case 16, 17 -> "Q8_0"; // Q5_K_S, Q5_K_M
case 18 -> "Q8_0"; // Q6_K
default -> throw new UnsupportedOperationException("Unsupported quantization format: " + modelQuantizationAsInt + " (as int).");
};
}

/**
 * Maps a stored weight type to the type actually used for TornadoVM/GPU
 * execution. K-quant formats (Q4_K, Q5_K, Q6_K) are converted to Q8_0 at
 * load time; every other type passes through unchanged.
 */
protected static GGMLType effectiveGpuWeightType(GGMLType ggmlType) {
    switch (ggmlType) {
        case Q4_K:
        case Q5_K:
        case Q6_K:
            // K-quants are dequantized to Q8_0 so existing Q8_0 kernels apply.
            return GGMLType.Q8_0;
        default:
            return ggmlType;
    }
}

/**
 * Human-readable name for a GGUF "general.file_type" metadata value.
 * Unrecognized values are rendered as "type_&lt;n&gt;".
 */
private static String fileTypeName(int fileType) {
    if (fileType == 0) return "F32";
    if (fileType == 1) return "F16";
    if (fileType == 7) return "Q8_0";
    if (fileType == 14) return "Q4_K_S";
    if (fileType == 15) return "Q4_K_M";
    if (fileType == 16) return "Q5_K_S";
    if (fileType == 17) return "Q5_K_M";
    if (fileType == 18) return "Q6_K";
    return "type_" + fileType;
}

/**
* Template method that defines the model loading workflow. Subclasses should not override this method.
*
Expand Down Expand Up @@ -123,6 +154,11 @@ public Weights loadWeights(Map<String, GGMLTensorEntry> tensorEntries, C config)

// Delegate to specific implementation
if (useTornadovm) {
GGMLType gpuType = effectiveGpuWeightType(outputWeight.ggmlType());
if (TornadoVMMasterPlan.ENABLE_TORNADOVM_INIT_TIME) {
int fileType = (int) gguf.getMetadata().get("general.file_type");
System.out.println("Loading model weights in TornadoVM format (" + fileTypeName(fileType) + " -> " + gpuType + ")");
}
return createTornadoVMWeights(tensorEntries, config, ropeFreqs, tokenEmbeddings, outputWeight);
} else {
return createStandardWeights(tensorEntries, config, ropeFreqs, tokenEmbeddings, outputWeight);
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -143,11 +143,7 @@ protected Weights createStandardWeights(Map<String, GGMLTensorEntry> tensorEntri
// @formatter:off
@Override
protected Weights createTornadoVMWeights(Map<String, GGMLTensorEntry> tensorEntries, DevstralConfiguration config, Pair<float[], float[]> ropeFreqs, GGMLTensorEntry tokenEmbeddings, GGMLTensorEntry outputWeight) {
GGMLType ggmlType = outputWeight.ggmlType();

if (TornadoVMMasterPlan.ENABLE_TORNADOVM_INIT_TIME) {
System.out.println("Loading model weights in TornadoVM format (loading " + ggmlType + ")");
}
GGMLType ggmlType = effectiveGpuWeightType(outputWeight.ggmlType());

if (ggmlType != GGMLType.F16 && ggmlType != GGMLType.Q8_0) {
throw new UnsupportedOperationException("Type: " + ggmlType + " currently not supported for TornadoVM weights.");
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -136,11 +136,7 @@ protected Weights createTornadoVMWeights(Map<String, GGMLTensorEntry> tensorEntr
Pair<float[], float[]> ropeFreqs,
GGMLTensorEntry tokenEmbeddings,
GGMLTensorEntry outputWeight) {
GGMLType ggmlType = outputWeight.ggmlType();

if (TornadoVMMasterPlan.ENABLE_TORNADOVM_INIT_TIME) {
System.out.println("Loading model weights in TornadoVM format (loading " + ggmlType + ")");
}
GGMLType ggmlType = effectiveGpuWeightType(outputWeight.ggmlType());

// Validate supported types
if (ggmlType != GGMLType.F16 && ggmlType != GGMLType.Q8_0) {
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -106,11 +106,7 @@ protected Weights createTornadoVMWeights(Map<String, GGMLTensorEntry> tensorEntr
Pair<float[], float[]> ropeFreqs,
GGMLTensorEntry tokenEmbeddings,
GGMLTensorEntry outputWeight) {
GGMLType ggmlType = outputWeight.ggmlType();

if (TornadoVMMasterPlan.ENABLE_TORNADOVM_INIT_TIME) {
System.out.println("Loading model weights in TornadoVM format (loading " + ggmlType + ")");
}
GGMLType ggmlType = effectiveGpuWeightType(outputWeight.ggmlType());

// Validate supported types
if (ggmlType != GGMLType.F16 && ggmlType != GGMLType.Q8_0) {
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -116,11 +116,7 @@ protected Weights createStandardWeights(Map<String, GGMLTensorEntry> tensorEntri
// @formatter:off
@Override
protected Weights createTornadoVMWeights(Map<String, GGMLTensorEntry> tensorEntries, MistralConfiguration config, Pair<float[], float[]> ropeFreqs, GGMLTensorEntry tokenEmbeddings, GGMLTensorEntry outputWeight) {
GGMLType ggmlType = outputWeight.ggmlType();

if (TornadoVMMasterPlan.ENABLE_TORNADOVM_INIT_TIME) {
System.out.println("Loading model weights in TornadoVM format (loading " + ggmlType + ")");
}
GGMLType ggmlType = effectiveGpuWeightType(outputWeight.ggmlType());

// Validate supported types
if (ggmlType != GGMLType.F16 && ggmlType != GGMLType.Q8_0) {
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -15,6 +15,7 @@
import uk.ac.manchester.tornado.api.types.arrays.*;

import java.io.IOException;
import java.lang.foreign.Arena;
import java.lang.foreign.MemorySegment;
import java.lang.foreign.ValueLayout;
import java.nio.ByteOrder;
Expand Down Expand Up @@ -121,6 +122,9 @@ public static FloatTensor loadTensor(GGMLTensorEntry entry) {
case F32 -> new FP32FloatTensor(FloatTensor.numberOfElements(entry.shape()), entry.memorySegment());
case Q8_0 -> new Q8_0FloatTensor(FloatTensor.numberOfElements(entry.shape()), entry.memorySegment());
case Q4_0 -> new Q4_0FloatTensor(FloatTensor.numberOfElements(entry.shape()), entry.memorySegment());
case Q4_K -> new Q4_KFloatTensor(FloatTensor.numberOfElements(entry.shape()), entry.memorySegment());
case Q5_K -> new Q5_KFloatTensor(FloatTensor.numberOfElements(entry.shape()), entry.memorySegment());
case Q6_K -> new Q6_KFloatTensor(FloatTensor.numberOfElements(entry.shape()), entry.memorySegment());
case F16 -> new FP16FloatTensor(FloatTensor.numberOfElements(entry.shape()), entry.memorySegment());
default -> throw new UnsupportedOperationException("Quantization format " + ggmlType);
};
Expand Down Expand Up @@ -149,11 +153,69 @@ public static TornadoTensor loadTornadoTensor(GGMLTensorEntry entry) {
case F32 -> FP32TornadoTensor.fromTornadoMemorySegment(entry.memorySegment());
case F16 -> FP16TornadoTensor.fromTornadoMemorySegment(entry.memorySegment());
case Q8_0 -> Q8_0TornadoTensor.fromTornadoMemorySegment(entry.memorySegment());
case Q4_0 -> throw new UnsupportedOperationException("Q4 format not supported yet");
case Q4_K, Q5_K, Q6_K -> dequantizeToQ8_0TornadoTensor(entry);
case Q4_0 -> throw new UnsupportedOperationException("Q4_0 format not supported for TornadoVM yet");
default -> throw new UnsupportedOperationException("Quantization format " + ggmlType);
};
}

/**
 * Dequantizes a K-quant tensor (Q4_K, Q5_K, Q6_K) to Q8_0 format for TornadoVM/GPU execution.
 * This is a load-time conversion that allows K-quant models to run on GPU with existing Q8_0 kernels.
 *
 * Each run of 32 elements is re-quantized with a single fp16 scale chosen from the
 * block's max |value| (symmetric round-to-nearest into [-128, 127]), i.e. the
 * standard Q8_0 block layout: 2-byte fp16 scale followed by 32 signed-byte quants.
 */
private static Q8_0TornadoTensor dequantizeToQ8_0TornadoTensor(GGMLTensorEntry entry) {
    // The entry's memorySegment includes a TornadoVM ARRAY_HEADER prefix (16 bytes of zeros).
    // Slice past it so the K-quant FloatTensor reads raw tensor data starting at byte 0.
    long headerBytes = TornadoNativeArray.ARRAY_HEADER;
    GGMLTensorEntry dataEntry = new GGMLTensorEntry(
            entry.mappedFile(), entry.name(), entry.ggmlType(), entry.shape(),
            entry.memorySegment().asSlice(headerBytes));
    FloatTensor sourceTensor = loadTensor(dataEntry);

    int numElements = sourceTensor.size();
    int blockSize = 32;              // Q8_0 block width (elements per scale)
    int blocksNeeded = (numElements + blockSize - 1) / blockSize;
    int q8BlockBytes = 34;           // 2 bytes fp16 scale + 32 bytes quants
    int q8BytesNeeded = blocksNeeded * q8BlockBytes;

    byte[] q8Data = new byte[q8BytesNeeded];

    for (int b = 0; b < blocksNeeded; b++) {
        int start = b * blockSize;
        int end = Math.min(start + blockSize, numElements); // final block may be partial

        // Find max absolute value for the symmetric scale.
        float maxAbs = 0;
        for (int i = start; i < end; i++) {
            maxAbs = Math.max(maxAbs, Math.abs(sourceTensor.getFloat(i)));
        }
        float scale = maxAbs / 127.0f;

        // Write scale as fp16 (little-endian).
        short scaleF16 = Float.floatToFloat16(scale);
        int blockOff = b * q8BlockBytes;
        q8Data[blockOff] = (byte) (scaleF16 & 0xFF);
        q8Data[blockOff + 1] = (byte) ((scaleF16 >> 8) & 0xFF);

        // Quantize values. An all-zero block has scale 0; invScale 0 then yields
        // zero quants rather than a division by zero.
        float invScale = scale != 0 ? 1.0f / scale : 0;
        for (int i = start; i < end; i++) {
            int qi = Math.round(sourceTensor.getFloat(i) * invScale);
            qi = Math.max(-128, Math.min(127, qi));
            q8Data[blockOff + 2 + (i - start)] = (byte) qi;
        }
        // Trailing quant bytes of a partial final block stay 0 from array initialization.
    }

    // Allocate native memory with the TornadoNativeArray header layout, matching
    // GGUF.loadTensorsTornado. Arena.allocate returns zero-initialized memory
    // (per the java.lang.foreign spec), so the header prefix is already the
    // required run of zero bytes — no manual zeroing loop is needed.
    MemorySegment nativeSegment = Arena.ofAuto().allocate(headerBytes + q8BytesNeeded, 4);
    // Copy Q8_0 data after the header.
    MemorySegment.copy(MemorySegment.ofArray(q8Data), 0, nativeSegment, headerBytes, q8BytesNeeded);
    return Q8_0TornadoTensor.fromTornadoMemorySegment(nativeSegment);
}

/**
* Dispatcher method for loading a TornadoVM tensor array based on type.
* Used in GPU-path.
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -126,11 +126,7 @@ protected Weights createStandardWeights(Map<String, GGMLTensorEntry> tensorEntri
// @formatter:off
@Override
protected Weights createTornadoVMWeights(Map<String, GGMLTensorEntry> tensorEntries, Phi3Configuration config, Pair<float[], float[]> ropeFreqs, GGMLTensorEntry tokenEmbeddings, GGMLTensorEntry outputWeight) {
GGMLType ggmlType = outputWeight.ggmlType();

if (TornadoVMMasterPlan.ENABLE_TORNADOVM_INIT_TIME) {
System.out.println("Loading model weights in TornadoVM format (loading " + ggmlType + ")");
}
GGMLType ggmlType = effectiveGpuWeightType(outputWeight.ggmlType());

// Validate supported types
if (ggmlType != GGMLType.F16 && ggmlType != GGMLType.Q8_0) {
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -126,11 +126,7 @@ protected Weights createStandardWeights(Map<String, GGMLTensorEntry> tensorEntri
@Override
protected Weights createTornadoVMWeights(Map<String, GGMLTensorEntry> tensorEntries, Qwen2Configuration config, Pair<float[], float[]> ropeFreqs, GGMLTensorEntry tokenEmbeddings,
GGMLTensorEntry outputWeight) {
GGMLType ggmlType = outputWeight.ggmlType();

if (TornadoVMMasterPlan.ENABLE_TORNADOVM_INIT_TIME) {
System.out.println("Loading model weights in TornadoVM format (loading " + ggmlType + ")");
}
GGMLType ggmlType = effectiveGpuWeightType(outputWeight.ggmlType());

// Validate supported types
if (ggmlType != GGMLType.F16 && ggmlType != GGMLType.Q8_0) {
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -129,11 +129,7 @@ protected Weights createStandardWeights(Map<String, GGMLTensorEntry> tensorEntri
protected Weights createTornadoVMWeights(Map<String, GGMLTensorEntry> tensorEntries, Qwen3Configuration config,
Pair<float[], float[]> ropeFreqs, GGMLTensorEntry tokenEmbeddings,
GGMLTensorEntry outputWeight) {
if (TornadoVMMasterPlan.ENABLE_TORNADOVM_INIT_TIME) {
System.out.println("Loading model weights in TornadoVM format (loading " + outputWeight.ggmlType() + " -> " + GGMLType.F16 + ")");
}

GGMLType ggmlType = outputWeight.ggmlType();
GGMLType ggmlType = effectiveGpuWeightType(outputWeight.ggmlType());

final int nl = config.numberOfLayers();

Expand Down
Loading
Loading