88 c_int ,
99 c_uint8 ,
1010 c_uint32 ,
11+ c_size_t ,
1112 c_float ,
1213 c_void_p ,
13- c_size_t ,
1414 POINTER ,
1515 _Pointer , # type: ignore
1616 Structure ,
@@ -123,6 +123,17 @@ class mtmd_input_text(Structure):
123123 ]
124124
125125
class mtmd_decoder_pos(Structure):
    """Decoder attention position for M-RoPE models.

    ctypes mirror of the C ``struct mtmd_decoder_pos``: four unsigned 32-bit
    components declared in the order ``t``, ``x``, ``y``, ``z``.
    """

    # NOTE(review): order and width of these fields must stay in sync with
    # the declaration in mtmd.h -- confirm when the header changes.
    _fields_ = [
        ("t", c_uint32),
        ("x", c_uint32),
        ("y", c_uint32),
        ("z", c_uint32),
    ]
135+
136+
126137################################################
127138# mtmd.h functions
128139################################################
@@ -165,35 +176,41 @@ def mtmd_init_from_file(
165176def mtmd_free (ctx : mtmd_context_p , / ): ...
166177
167178
# MTMD_API bool mtmd_decode_use_non_causal(const mtmd_context * ctx, const mtmd_input_chunk * chunk);
@ctypes_function(
    "mtmd_decode_use_non_causal",
    [mtmd_context_p_ctypes, mtmd_input_chunk_p_ctypes],
    c_bool,
)
def mtmd_decode_use_non_causal(
    ctx: mtmd_context_p, chunk: Optional[mtmd_input_chunk_p], /
) -> bool:
    """Check whether MTMD decoding uses non-causal attention.

    Args:
        ctx: MTMD context handle.
        chunk: Input chunk the query refers to; the annotation permits ``None``.
            NOTE(review): how the native library treats a null chunk is not
            visible here -- confirm against mtmd.h.

    Returns:
        The boolean reported by the native ``mtmd_decode_use_non_causal``.
    """
    ...
173190
174191
# MTMD_API bool mtmd_decode_use_mrope(const mtmd_context * ctx);
@ctypes_function(
    "mtmd_decode_use_mrope",
    [mtmd_context_p_ctypes],
    c_bool,
)
def mtmd_decode_use_mrope(ctx: mtmd_context_p, /) -> bool:
    """Check whether MTMD decoding uses mRoPE.

    Args:
        ctx: MTMD context handle.

    Returns:
        The boolean reported by the native ``mtmd_decode_use_mrope``.
    """
    ...
180197
181198
# MTMD_API bool mtmd_support_vision(const mtmd_context * ctx);
@ctypes_function(
    "mtmd_support_vision",
    [mtmd_context_p_ctypes],
    c_bool,
)
def mtmd_support_vision(ctx: mtmd_context_p, /) -> bool:
    """Check whether the current model supports vision input.

    Args:
        ctx: MTMD context handle.

    Returns:
        The boolean reported by the native ``mtmd_support_vision``.
    """
    ...
187204
188205
# MTMD_API bool mtmd_support_audio(const mtmd_context * ctx);
@ctypes_function(
    "mtmd_support_audio",
    [mtmd_context_p_ctypes],
    c_bool,
)
def mtmd_support_audio(ctx: mtmd_context_p, /) -> bool:
    """Check whether MTMD supports audio.

    Args:
        ctx: MTMD context handle.

    Returns:
        The boolean reported by the native ``mtmd_support_audio``.
    """
    ...
194211
195212
196- # MTMD_API int mtmd_get_audio_sample_rate(mtmd_context * ctx);
213+ # MTMD_API int mtmd_get_audio_sample_rate(const mtmd_context * ctx);
197214@ctypes_function ("mtmd_get_audio_sample_rate" , [mtmd_context_p_ctypes ], c_int )
198215def mtmd_get_audio_sample_rate (ctx : mtmd_context_p , / ) -> int :
199216 """Get the audio sample rate in Hz. Returns -1 if audio is not supported."""
@@ -418,14 +435,16 @@ def mtmd_image_tokens_get_n_tokens(image_tokens: mtmd_image_tokens_p, /) -> int:
418435 ...
419436
420437
# DEPRECATED(MTMD_API size_t mtmd_image_tokens_get_nx(const mtmd_image_tokens * image_tokens),
#            "use mtmd_image_tokens_get_decoder_pos() instead");
@ctypes_function(
    "mtmd_image_tokens_get_nx",
    [mtmd_image_tokens_p_ctypes],
    c_size_t,
)
def mtmd_image_tokens_get_nx(image_tokens: mtmd_image_tokens_p, /) -> int:
    """Get the image token grid width.

    Deprecated in the C header: use ``mtmd_image_tokens_get_decoder_pos()``
    instead.

    Args:
        image_tokens: Image-tokens handle to query.

    Returns:
        The ``size_t`` value returned by the native function, as an ``int``.
    """
    ...
426444
427445
428- # MTMD_API size_t mtmd_image_tokens_get_ny(const mtmd_image_tokens * image_tokens);
446+ # DEPRECATED(MTMD_API size_t mtmd_image_tokens_get_ny(const mtmd_image_tokens * image_tokens),
447+ # "use mtmd_image_tokens_get_decoder_pos() instead");
429448@ctypes_function ("mtmd_image_tokens_get_ny" , [mtmd_image_tokens_p_ctypes ], c_size_t )
430449def mtmd_image_tokens_get_ny (image_tokens : mtmd_image_tokens_p , / ) -> int :
431450 """Get the image token grid height."""
@@ -450,6 +469,23 @@ def mtmd_image_tokens_get_n_pos(image_tokens: mtmd_image_tokens_p, /) -> int:
450469 ...
451470
452471
# MTMD_API struct mtmd_decoder_pos mtmd_image_tokens_get_decoder_pos(
#     const mtmd_image_tokens * image_tokens, llama_pos pos_0, size_t i);
@ctypes_function(
    "mtmd_image_tokens_get_decoder_pos",
    [mtmd_image_tokens_p_ctypes, llama_cpp.llama_pos, c_size_t],
    mtmd_decoder_pos,
)
def mtmd_image_tokens_get_decoder_pos(
    image_tokens: mtmd_image_tokens_p,
    # Widened to accept a plain int, matching how ``i`` is annotated below.
    pos_0: Union[llama_cpp.llama_pos, int],
    i: Union[c_size_t, int],
    /,
) -> mtmd_decoder_pos:
    """Get decoder attention position for an image embedding token.

    Args:
        image_tokens: Image-tokens handle to query.
        pos_0: ``llama_pos`` value forwarded to the native call.
            NOTE(review): presumably the position of the first image token --
            confirm against mtmd.h.
        i: ``size_t`` index forwarded to the native call.
            NOTE(review): presumably the index of the embedding token --
            confirm against mtmd.h.

    Returns:
        The ``mtmd_decoder_pos`` struct returned by value from the native
        function.
    """
    ...
487+
488+
453489# MTMD_API int32_t mtmd_encode(mtmd_context * ctx, const mtmd_image_tokens * image_tokens);
454490@ctypes_function (
455491 "mtmd_encode" ,
@@ -534,6 +570,23 @@ def mtmd_helper_get_n_pos(chunks: mtmd_input_chunks_p, /) -> int:
534570 ...
535571
536572
# MTMD_API void mtmd_helper_image_get_decoder_pos(
#     const mtmd_image_tokens * image, llama_pos pos_0, struct mtmd_decoder_pos * out_pos);
@ctypes_function(
    "mtmd_helper_image_get_decoder_pos",
    [mtmd_image_tokens_p_ctypes, llama_cpp.llama_pos, POINTER(mtmd_decoder_pos)],
    None,
)
def mtmd_helper_image_get_decoder_pos(
    image: mtmd_image_tokens_p,
    # Widened to accept a plain int, as other integer parameters in this
    # module do (e.g. ``Union[c_size_t, int]``).
    pos_0: Union[llama_cpp.llama_pos, int],
    out_pos: "_Pointer[mtmd_decoder_pos]",
    /,
) -> None:
    """Fill decoder attention positions for all image embedding tokens.

    Args:
        image: Image-tokens handle to query.
        pos_0: ``llama_pos`` value forwarded to the native call.
            NOTE(review): presumably the position of the first image token --
            confirm against the mtmd helper header.
        out_pos: Pointer to caller-owned ``mtmd_decoder_pos`` storage that the
            native function writes into.
            NOTE(review): the required number of entries is not visible here
            (presumably one per image embedding token) -- confirm before
            sizing the buffer.

    Returns:
        None; the native function is declared ``void`` and reports results
        only through ``out_pos``.
    """
    ...
588+
589+
537590# MTMD_API int32_t mtmd_helper_eval_chunks(mtmd_context * ctx,
538591# struct llama_context * lctx,
539592# const mtmd_input_chunks * chunks,
0 commit comments