Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
56 changes: 51 additions & 5 deletions Docs/LLMSYSTEM.md
Original file line number Diff line number Diff line change
Expand Up @@ -662,20 +662,56 @@ This macro system ensures dynamic, contextual content that adapts to your curren

The LLMEngine provides several events for real-time updates:

### Structured Streaming Events (Recommended)

The new channel-aware events provide richer information and cleanly separate different types of inference content:

```csharp
// Full prompt generation
LLMEngine.OnFullPromptReady += (sender, prompt) =>
// Channel-aware streaming — receives typed segments (Text, Thinking, ToolCall, etc.)
LLMEngine.OnInferenceSegment += (sender, segment) =>
{
Console.WriteLine($"Generated prompt: {prompt}");
switch (segment.Channel)
{
case InferenceChannel.Text:
Console.Write(segment.Text); // Normal visible response text
break;
case InferenceChannel.Thinking:
// Chain-of-thought / thinking block content (hidden from users)
break;
case InferenceChannel.ToolCall when segment.IsComplete:
Console.WriteLine("LLM requested a tool call");
break;
}
};

// Streaming text generation
// Structured completion — receives the full result with separated channels
LLMEngine.OnInferenceCompleted += (sender, result) =>
{
Console.WriteLine($"\nResponse: {result.Response}");

if (result.ThinkingContent != null)
Console.WriteLine($"[Thinking: {result.ThinkingContent}]");

Console.WriteLine($"Finish reason: {result.FinishReason}");

// Log only the visible response to history
LLMEngine.History.LogMessage(AuthorRole.Assistant, result.Response,
LLMEngine.User, LLMEngine.Bot);
};
```

### Legacy Streaming Events

The following events are still supported for backward compatibility but are deprecated. Migrate to the structured events above for new code.

```csharp
// [Obsolete] Streaming text generation — fires for every token
LLMEngine.OnInferenceStreamed += (sender, token) =>
{
Console.Write(token); // Real-time text output
};

// Generation completion
// [Obsolete] Generation completion — returns the full raw response string
LLMEngine.OnInferenceEnded += (sender, fullResponse) =>
{
Console.WriteLine($"\nGeneration complete: {fullResponse}");
Expand All @@ -684,6 +720,16 @@ LLMEngine.OnInferenceEnded += (sender, fullResponse) =>
LLMEngine.History.LogMessage(AuthorRole.Assistant, fullResponse,
LLMEngine.User, LLMEngine.Bot);
};
```

### Other Events

```csharp
// Full prompt generation
LLMEngine.OnFullPromptReady += (sender, prompt) =>
{
Console.WriteLine($"Generated prompt: {prompt}");
};

// Quick inference (non-streaming) completion
LLMEngine.OnQuickInferenceEnded += (sender, response) =>
Expand Down
120 changes: 120 additions & 0 deletions LLM/InferenceStream.cs
Original file line number Diff line number Diff line change
@@ -0,0 +1,120 @@
using System;
using System.Collections.Generic;

namespace LetheAISharp.LLM
{
/// <summary>
/// Identifies the content channel of an inference segment.
/// </summary>
public enum InferenceChannel
{
/// <summary> Normal visible text response. </summary>
Text,
/// <summary> Chain-of-thought / thinking block content. </summary>
Thinking,
/// <summary> The LLM is requesting a tool/function call. </summary>
ToolCall,
/// <summary> Result being fed back after tool execution. </summary>
ToolResult,
/// <summary> Error or system-level message. </summary>
System
}

/// <summary>
/// A single typed chunk emitted during streaming inference.
/// </summary>
public class InferenceSegment
{
/// <summary> What kind of content this segment carries. </summary>
public InferenceChannel Channel { get; init; }

/// <summary> The text delta (populated for Text and Thinking channels). </summary>
public string? Text { get; init; }

/// <summary> Tool call data (populated for ToolCall channel). </summary>
public ToolCallInfo? ToolCall { get; init; }

/// <summary> Tool result data (populated for ToolResult channel). </summary>
public ToolResultInfo? ToolResult { get; init; }

/// <summary> Whether this is the final chunk in its channel. </summary>
public bool IsComplete { get; init; }
}

/// <summary>
/// Data for an LLM-requested tool/function call.
/// </summary>
public class ToolCallInfo
{
/// <summary> Unique identifier for this call. </summary>
public string CallId { get; init; } = string.Empty;

/// <summary> Name of the function the LLM wants to invoke. </summary>
public string FunctionName { get; init; } = string.Empty;

/// <summary> Raw JSON arguments string. </summary>
public string ArgumentsJson { get; init; } = string.Empty;
}

/// <summary>
/// Data for the result returned after a tool call.
/// </summary>
public class ToolResultInfo
{
/// <summary> Identifier matching the originating <see cref="ToolCallInfo.CallId"/>. </summary>
public string CallId { get; init; } = string.Empty;

/// <summary> Name of the function that was invoked. </summary>
public string FunctionName { get; init; } = string.Empty;

/// <summary> Whether the tool execution succeeded. </summary>
public bool Success { get; init; }

/// <summary> Raw JSON result string. </summary>
public string ResultJson { get; init; } = string.Empty;

/// <summary> Error description if <see cref="Success"/> is false. </summary>
public string? Error { get; init; }
}

/// <summary>
/// The final structured result of a complete inference cycle.
/// </summary>
public class InferenceResult
{
/// <summary> The final visible text response (thinking blocks removed). </summary>
public string Response { get; init; } = string.Empty;

/// <summary> The thinking/CoT block content, if any. </summary>
public string? ThinkingContent { get; init; }

/// <summary> All tool calls made during this inference cycle. </summary>
public List<ToolCallRecord> ToolCalls { get; init; } = [];

/// <summary> The finish reason reported by the backend (e.g. "stop", "length", "tool_calls"). </summary>
public string? FinishReason { get; init; }
}

/// <summary>
/// A complete record of a single tool call and its result.
/// </summary>
public class ToolCallRecord
{
/// <summary> Unique identifier for this call. </summary>
public string CallId { get; init; } = string.Empty;

/// <summary> Name of the function that was invoked. </summary>
public string FunctionName { get; init; } = string.Empty;

/// <summary> Raw JSON arguments string. </summary>
public string ArgumentsJson { get; init; } = string.Empty;

/// <summary> Raw JSON result string. </summary>
public string ResultJson { get; init; } = string.Empty;

/// <summary> Whether the tool execution succeeded. </summary>
public bool Success { get; init; }

/// <summary> How long the tool execution took. </summary>
public System.TimeSpan Duration { get; init; }
}
}
60 changes: 60 additions & 0 deletions LLM/LLMEngine.cs
Original file line number Diff line number Diff line change
Expand Up @@ -73,14 +73,21 @@ public static int MaxContextLength {
/// <summary> Called when this library has generated the full prompt, returns full prompt </summary>
public static event EventHandler<string>? OnFullPromptReady;
/// <summary> Called during inference each time the LLM outputs a new token, returns the generated token </summary>
[Obsolete("Use OnInferenceSegment for channel-aware streaming. This event only receives Text channel content.")]
public static event EventHandler<string>? OnInferenceStreamed;
/// <summary> Called once the inference has ended, returns the full string </summary>
[Obsolete("Use OnInferenceCompleted for structured results including thinking content and tool calls.")]
public static event EventHandler<string>? OnInferenceEnded;
/// <summary> Called when the system changes states (no init, busy, ready) </summary>
public static event EventHandler<SystemStatus>? OnStatusChanged;
/// <summary> Called when the bot persona is changed, returns the new bot (sender is always null) </summary>
public static event EventHandler<BasePersona>? OnBotChanged;

/// <summary> Called during inference with typed, channel-tagged segments. Provides richer information than OnInferenceStreamed. </summary>
public static event EventHandler<InferenceSegment>? OnInferenceSegment;
/// <summary> Called when a complete inference cycle finishes. Provides structured results including thinking content and tool call records. </summary>
public static event EventHandler<InferenceResult>? OnInferenceCompleted;

/// <summary> Set to true if the backend supports text-to-speech </summary>
public static bool SupportsTTS => Client?.SupportsTTS ?? false;

Expand All @@ -97,9 +104,13 @@ public static int MaxContextLength {

private static void RaiseOnFullPromptReady(string fullprompt) => OnFullPromptReady?.Invoke(Bot, fullprompt);
private static void RaiseOnStatusChange(SystemStatus newStatus) => OnStatusChanged?.Invoke(Bot, newStatus);
#pragma warning disable CS0618 // backward-compat raise helpers for obsolete events
private static void RaiseOnInferenceStreamed(string addedString) => OnInferenceStreamed?.Invoke(Bot, addedString);
private static void RaiseOnInferenceEnded(string fullString) => OnInferenceEnded?.Invoke(Bot, fullString);
#pragma warning restore CS0618
private static void RaiseOnQuickInferenceEnded(string fullprompt) => OnQuickInferenceEnded?.Invoke(Bot, fullprompt);
private static void RaiseInferenceSegment(InferenceSegment segment) => OnInferenceSegment?.Invoke(Bot, segment);
private static void RaiseInferenceCompleted(InferenceResult result) => OnInferenceCompleted?.Invoke(Bot, result);

/// <summary> List of loaded plugins </summary>
public static List<IContextPlugin> ContextPlugins { get; set; } = [];
Expand Down Expand Up @@ -161,6 +172,9 @@ public static InstructFormat Instruct {

private static SystemStatus status = SystemStatus.NotInit;
private static string StreamingTextProgress = string.Empty;
private static InferenceChannel _currentChannel = InferenceChannel.Text;
private static readonly StringBuilder _thinkingBuffer = new();
private static readonly StringBuilder _textBuffer = new();
private static InstructFormat instruct = new()
{
AddNamesToPrompt = false,
Expand Down Expand Up @@ -396,6 +410,7 @@ public static async Task RerollLastMessage()
if (Status == SystemStatus.Busy)
return;
Status = SystemStatus.Busy;
ResetStreamingState();
StreamingTextProgress = Instruct.GetThinkPrefill();
if (Instruct.PrefillThinking && !string.IsNullOrEmpty(Instruct.ThinkingStart))
{
Expand Down Expand Up @@ -474,6 +489,7 @@ public static async Task SimpleQueryStreaming(object chatlog, CancellationToken

using var _ = await AcquireModelSlotAsync(ctx).ConfigureAwait(false);
Status = SystemStatus.Busy;
ResetStreamingState();
await Client.GenerateTextStreaming(chatlog).ConfigureAwait(false);
}

Expand Down Expand Up @@ -660,14 +676,57 @@ private static void Client_StreamingMessageReceived(object? sender, LLMTokenStre
}
Status = SystemStatus.Ready;
RaiseOnInferenceEnded(response);

// Build structured result for new event
var thinkingContent = _thinkingBuffer.Length > 0 ? _thinkingBuffer.ToString().Trim() : null;
var textResponse = Instruct.IsThinkFormat ? response.RemoveThinkingBlocks() : response;
var inferenceResult = new InferenceResult
{
Response = textResponse,
ThinkingContent = thinkingContent,
ToolCalls = [],
FinishReason = e.FinishReason
};
if (e.FinishReason == "tool_calls")
{
RaiseInferenceSegment(new InferenceSegment { Channel = InferenceChannel.ToolCall, IsComplete = true });
}
RaiseInferenceCompleted(inferenceResult);
}
else
{
StreamingTextProgress += e.Token;

// Detect channel transitions based on thinking delimiters
if (Instruct.IsThinkFormat)
{
_currentChannel = Instruct.IsThinkingPrompt(StreamingTextProgress)
? InferenceChannel.Thinking
: InferenceChannel.Text;
}

// Route token to the appropriate content buffer
if (_currentChannel == InferenceChannel.Thinking)
_thinkingBuffer.Append(e.Token);
else
_textBuffer.Append(e.Token);

RaiseInferenceSegment(new InferenceSegment { Channel = _currentChannel, Text = e.Token, IsComplete = false });
RaiseOnInferenceStreamed(e.Token);
}
}

/// <summary>
/// Resets all per-generation streaming state. Must be called before each new generation.
/// </summary>
private static void ResetStreamingState()
{
_currentChannel = InferenceChannel.Text;
_thinkingBuffer.Clear();
_textBuffer.Clear();
StreamingTextProgress = string.Empty;
}

/// <summary>
/// Change the current bot persona.
/// </summary>
Expand Down Expand Up @@ -887,6 +946,7 @@ private static async Task StartGeneration(SingleMessage message)

var genparams = await GenerateFullPrompt(message, pluginmessage).ConfigureAwait(false);

ResetStreamingState();
StreamingTextProgress = Instruct.GetThinkPrefill();
if (Instruct.PrefillThinking && !string.IsNullOrEmpty(Instruct.ThinkingStart))
{
Expand Down