Skip to content
Open
5 changes: 5 additions & 0 deletions CMakeLists.txt
Original file line number Diff line number Diff line change
Expand Up @@ -46,6 +46,7 @@ function(llama_cpp_python_install_target target)
endfunction()

if (LLAMA_BUILD)
set(BUILD_NUMBER 0 CACHE STRING "Build number" FORCE)
set(BUILD_SHARED_LIBS "On")

set(CMAKE_SKIP_BUILD_RPATH FALSE)
Expand Down Expand Up @@ -154,6 +155,10 @@ if (LLAMA_BUILD)
endif()

# Building llava
# Set LLAMA_INSTALL_VERSION for mtmd (not inherited from llama.cpp subdirectory scope)
if(NOT DEFINED LLAMA_INSTALL_VERSION)
set(LLAMA_INSTALL_VERSION "0.0.0")
endif()
add_subdirectory(vendor/llama.cpp/tools/mtmd)

if (WIN32)
Expand Down
21 changes: 16 additions & 5 deletions llama_cpp/_ctypes_extensions.py
Original file line number Diff line number Diff line change
Expand Up @@ -110,11 +110,22 @@ def ctypes_function(
):
def decorator(f: F) -> F:
if enabled:
func = getattr(lib, name)
func.argtypes = argtypes
func.restype = restype
functools.wraps(f)(func)
return func
try:
func = getattr(lib, name)
func.argtypes = argtypes
func.restype = restype
functools.wraps(f)(func)
return func
except AttributeError:
# Symbol not found in shared library (deprecated/removed)
@functools.wraps(f)
def stub(*args: Any, **kwargs: Any) -> Any:
raise NotImplementedError(
f"Symbol '{name}' not found in shared library. The C API might "
"have been removed or deprecated."
)

return stub # type: ignore
else:
return f

Expand Down
10 changes: 6 additions & 4 deletions llama_cpp/_internals.py
Original file line number Diff line number Diff line change
Expand Up @@ -290,10 +290,10 @@ def kv_cache_clear(self):
assert self.memory is not None, "Memory is not initialized"
llama_cpp.llama_memory_clear(self.memory, True)

def kv_cache_seq_rm(self, seq_id: int, p0: int, p1: int):
def kv_cache_seq_rm(self, seq_id: int, p0: int, p1: int) -> bool:
assert self.memory is not None, "Memory is not initialized"
seq_id = seq_id if seq_id >= 0 else 0
llama_cpp.llama_memory_seq_rm(self.memory, seq_id, p0, p1)
return llama_cpp.llama_memory_seq_rm(self.memory, seq_id, p0, p1)

def kv_cache_seq_cp(self, seq_id_src: int, seq_id_dst: int, p0: int, p1: int):
assert self.memory is not None, "Memory is not initialized"
Expand All @@ -310,9 +310,11 @@ def kv_cache_seq_shift(self, seq_id: int, p0: int, p1: int, shift: int):
def get_state_size(self) -> int:
return llama_cpp.llama_state_get_size(self.ctx)

# TODO: copy_state_data
def copy_state_data(self, dst, size: int) -> int:
return llama_cpp.llama_state_get_data(self.ctx, dst, size)

# TODO: set_state_data
def set_state_data(self, src, size: int) -> int:
return llama_cpp.llama_state_set_data(self.ctx, src, size)

# TODO: llama_load_session_file

Expand Down
43 changes: 30 additions & 13 deletions llama_cpp/llama.py
Original file line number Diff line number Diff line change
Expand Up @@ -341,7 +341,11 @@ def __init__(
self._logits_all = logits_all if draft_model is None else True
self.context_params.embeddings = embedding # TODO: Rename to embeddings
self.context_params.offload_kqv = offload_kqv
self.context_params.flash_attn = flash_attn
self.context_params.flash_attn_type = (
llama_cpp.LLAMA_FLASH_ATTN_TYPE_ENABLED
if flash_attn
else llama_cpp.LLAMA_FLASH_ATTN_TYPE_DISABLED
)

if op_offload is not None:
self.context_params.op_offload = op_offload
Expand Down Expand Up @@ -888,13 +892,23 @@ def generate(
else:
break
if longest_prefix > 0:
reset = False
tokens = tokens[longest_prefix:]
self.n_tokens = longest_prefix
if self.verbose:
# Try to trim the KV cache to prefix length. Hybrid models
# (e.g. GDN) may not support partial removal — in that case we
# fall through to the full reset path below.
if self._ctx.kv_cache_seq_rm(-1, longest_prefix, -1):
reset = False
tokens = tokens[longest_prefix:]
self.n_tokens = longest_prefix
if self.verbose:
print(
f"Llama.generate: {longest_prefix} prefix-match hit, "
f"remaining {len(tokens)} prompt tokens to eval",
file=sys.stderr,
)
elif self.verbose:
print(
f"Llama.generate: {longest_prefix} prefix-match hit, "
f"remaining {len(tokens)} prompt tokens to eval",
f"Llama.generate: {longest_prefix} prefix-match found "
f"but partial kv removal not supported, re-evaluating full prompt",
file=sys.stderr,
)

Expand Down Expand Up @@ -1041,7 +1055,7 @@ def embed(
data: Union[List[List[float]], List[List[List[float]]]] = []

def decode_batch(seq_sizes: List[int]):
llama_cpp.llama_kv_self_clear(self._ctx.ctx)
llama_cpp.llama_memory_clear(llama_cpp.llama_get_memory(self._ctx.ctx), True)
self._ctx.decode(self._batch)
self._batch.reset()

Expand Down Expand Up @@ -1112,7 +1126,7 @@ def decode_batch(seq_sizes: List[int]):

output = data[0] if isinstance(input, str) else data

llama_cpp.llama_kv_self_clear(self._ctx.ctx)
llama_cpp.llama_memory_clear(llama_cpp.llama_get_memory(self._ctx.ctx), True)
self.reset()

if return_count:
Expand Down Expand Up @@ -2096,7 +2110,10 @@ def __getstate__(self):
logits_all=self._logits_all,
embedding=self.context_params.embeddings,
offload_kqv=self.context_params.offload_kqv,
flash_attn=self.context_params.flash_attn,
flash_attn=(
self.context_params.flash_attn_type
== llama_cpp.LLAMA_FLASH_ATTN_TYPE_ENABLED
),
op_offload=self.context_params.op_offload,
swa_full=self.context_params.swa_full,
# Sampling Params
Expand Down Expand Up @@ -2127,13 +2144,13 @@ def __setstate__(self, state):
def save_state(self) -> LlamaState:
if self.verbose:
print("Llama.save_state: saving llama state", file=sys.stderr)
state_size = llama_cpp.llama_get_state_size(self._ctx.ctx)
state_size = llama_cpp.llama_state_get_size(self._ctx.ctx)
if self.verbose:
print(f"Llama.save_state: got state size: {state_size}", file=sys.stderr)
llama_state = (ctypes.c_uint8 * int(state_size))()
if self.verbose:
print("Llama.save_state: allocated state", file=sys.stderr)
n_bytes = llama_cpp.llama_copy_state_data(self._ctx.ctx, llama_state)
n_bytes = llama_cpp.llama_state_get_data(self._ctx.ctx, llama_state, state_size)
if self.verbose:
print(f"Llama.save_state: copied llama state: {n_bytes}", file=sys.stderr)
if int(n_bytes) > int(state_size):
Expand Down Expand Up @@ -2166,7 +2183,7 @@ def load_state(self, state: LlamaState) -> None:
LLamaStateArrayType = ctypes.c_uint8 * state_size
llama_state = LLamaStateArrayType.from_buffer_copy(state.llama_state)

if llama_cpp.llama_set_state_data(self._ctx.ctx, llama_state) != state_size:
if llama_cpp.llama_state_set_data(self._ctx.ctx, llama_state, state_size) != state_size:
raise RuntimeError("Failed to set llama state data")

def n_ctx(self) -> int:
Expand Down
36 changes: 26 additions & 10 deletions llama_cpp/llama_cpp.py
Original file line number Diff line number Diff line change
Expand Up @@ -463,6 +463,16 @@
LLAMA_ATTENTION_TYPE_NON_CAUSAL = 1


# Python mirror of `enum llama_flash_attn_type` from llama.h: controls when
# Flash Attention is used for a context (set via
# `llama_context_params.flash_attn_type`).
# enum llama_flash_attn_type {
#     LLAMA_FLASH_ATTN_TYPE_AUTO     = -1,
#     LLAMA_FLASH_ATTN_TYPE_DISABLED = 0,
#     LLAMA_FLASH_ATTN_TYPE_ENABLED  = 1,
# };
LLAMA_FLASH_ATTN_TYPE_AUTO = -1
LLAMA_FLASH_ATTN_TYPE_DISABLED = 0
LLAMA_FLASH_ATTN_TYPE_ENABLED = 1


# enum llama_split_mode {
# LLAMA_SPLIT_MODE_NONE = 0, // single GPU
# LLAMA_SPLIT_MODE_LAYER = 1, // split layers and KV across GPUs
Expand Down Expand Up @@ -761,6 +771,7 @@ class llama_model_params(ctypes.Structure):
# enum llama_rope_scaling_type rope_scaling_type; // RoPE scaling type, from `enum llama_rope_scaling_type`
# enum llama_pooling_type pooling_type; // whether to pool (sum) embedding results by sequence id
# enum llama_attention_type attention_type; // attention type to use for embeddings
# enum llama_flash_attn_type flash_attn_type; // when to enable Flash Attention

# // ref: https://github.com/ggml-org/llama.cpp/pull/2054
# float rope_freq_base; // RoPE base frequency, 0 = from model
Expand All @@ -770,7 +781,7 @@ class llama_model_params(ctypes.Structure):
# float yarn_beta_fast; // YaRN low correction dim
# float yarn_beta_slow; // YaRN high correction dim
# uint32_t yarn_orig_ctx; // YaRN original context size
# float defrag_thold; // defragment the KV cache if holes/size > thold, <= 0 disabled (default)
# float defrag_thold; // [DEPRECATED] defragment the KV cache if holes/size > thold, <= 0 disabled (default)

# ggml_backend_sched_eval_callback cb_eval;
# void * cb_eval_user_data;
Expand All @@ -787,15 +798,14 @@ class llama_model_params(ctypes.Structure):
# // Keep the booleans together and at the end of the struct to avoid misalignment during copy-by-value.
# bool embeddings; // if true, extract embeddings (together with logits)
# bool offload_kqv; // offload the KQV ops (including the KV cache) to GPU
# bool flash_attn; // use flash attention [EXPERIMENTAL]
# bool no_perf; // measure performance timings
# bool op_offload; // offload host tensor operations to device
# bool swa_full; // use full-size SWA cache (https://github.com/ggml-org/llama.cpp/pull/13194#issuecomment-2868343055)
# // NOTE: setting to false when n_seq_max > 1 can cause bad performance in some cases
# // ref: https://github.com/ggml-org/llama.cpp/pull/13845#issuecomment-2924800573
# bool swa_full; // use full-size SWA cache
# bool kv_unified; // use a unified buffer across the input sequences when computing the attention
# // try to disable when n_seq_max > 1 for improved performance when the sequences do not share a large prefix
# // ref: https://github.com/ggml-org/llama.cpp/pull/14363

# // [EXPERIMENTAL]
# struct llama_sampler_seq_config * samplers;
# size_t n_samplers;
# };
class llama_context_params(ctypes.Structure):
"""Parameters for llama_context
Expand All @@ -810,6 +820,7 @@ class llama_context_params(ctypes.Structure):
rope_scaling_type (int): RoPE scaling type, from `enum llama_rope_scaling_type`
pooling_type (int): whether to pool (sum) embedding results by sequence id (ignored if no pooling layer)
attention_type (int): attention type to use for embeddings
flash_attn_type (int): when to enable Flash Attention, from `enum llama_flash_attn_type`
rope_freq_base (float): RoPE base frequency, 0 = from model
rope_freq_scale (float): RoPE frequency scaling factor, 0 = from model
yarn_ext_factor (float): YaRN extrapolation mix factor, negative = from model
Expand All @@ -826,11 +837,12 @@ class llama_context_params(ctypes.Structure):
abort_callback_data (ctypes.ctypes.c_void_p): data for abort_callback
embeddings (bool): if true, extract embeddings (together with logits)
offload_kqv (bool): whether to offload the KQV ops (including the KV cache) to GPU
flash_attn (bool): whether to use flash attention
no_perf (bool): whether to measure performance timings
op_offload (bool): offload host tensor operations to device
swa_full (bool): use full-size SWA cache
kv_unified (bool): use a unified buffer across the input sequences when computing the attention
samplers (ctypes.c_void_p): backend sampler chain configuration [EXPERIMENTAL]
n_samplers (ctypes.c_size_t): number of backend sampler chains
"""

if TYPE_CHECKING:
Expand All @@ -843,6 +855,7 @@ class llama_context_params(ctypes.Structure):
rope_scaling_type: int
pooling_type: int
attention_type: int
flash_attn_type: int
rope_freq_base: float
rope_freq_scale: float
yarn_ext_factor: float
Expand All @@ -859,11 +872,12 @@ class llama_context_params(ctypes.Structure):
abort_callback_data: ctypes.c_void_p
embeddings: bool
offload_kqv: bool
flash_attn: bool
no_perf: bool
op_offload: bool
swa_full: bool
kv_unified: bool
samplers: ctypes.c_void_p
n_samplers: ctypes.c_size_t

_fields_ = [
("n_ctx", ctypes.c_uint32),
Expand All @@ -875,6 +889,7 @@ class llama_context_params(ctypes.Structure):
("rope_scaling_type", ctypes.c_int),
("pooling_type", ctypes.c_int),
("attention_type", ctypes.c_int),
("flash_attn_type", ctypes.c_int),
("rope_freq_base", ctypes.c_float),
("rope_freq_scale", ctypes.c_float),
("yarn_ext_factor", ctypes.c_float),
Expand All @@ -891,11 +906,12 @@ class llama_context_params(ctypes.Structure):
("abort_callback_data", ctypes.c_void_p),
("embeddings", ctypes.c_bool),
("offload_kqv", ctypes.c_bool),
("flash_attn", ctypes.c_bool),
("no_perf", ctypes.c_bool),
("op_offload", ctypes.c_bool),
("swa_full", ctypes.c_bool),
("kv_unified", ctypes.c_bool),
("samplers", ctypes.c_void_p),
("n_samplers", ctypes.c_size_t),
]


Expand Down
2 changes: 1 addition & 1 deletion vendor/llama.cpp