feat(AI): 两后端归一的 GenerateStats(prefill/decode 实测统计)

Co-Authored-By: Claude Fable 5 <noreply@anthropic.com>
2026-06-10 06:42:59 +08:00
parent 8c8599e77d
commit 070e016f81
3 changed files with 51 additions and 4 deletions
--- a/康康/AI/GenerateStats.swift
+++ b/康康/AI/GenerateStats.swift
@@ -0,0 +1,19 @@
 import Foundation
 /// Performance statistics for a single generation run, normalized across the two backends (MNN / MLX).
 /// MNN values are taken from LlmContext (prefill_us / decode_us); MLX values from GenerateCompletionInfo.
 struct GenerateStats: Sendable, Equatable {
    /// Token count used as the numerator of `prefillTokensPerSecond`.
    var promptTokens: Int
    /// Token count used as the numerator of `decodeTokensPerSecond`.
    var genTokens: Int
    /// Prefill (reading in the prompt) duration, in seconds.
    var prefillSeconds: Double
    /// Decode (token-by-token generation) duration, in seconds.
    var decodeSeconds: Double

    /// Prefill throughput in tokens per second; `0` when `prefillSeconds` is not positive.
    var prefillTokensPerSecond: Double {
        Self.throughput(of: promptTokens, over: prefillSeconds)
    }

    /// Decode throughput in tokens per second; `0` when `decodeSeconds` is not positive.
    var decodeTokensPerSecond: Double {
        Self.throughput(of: genTokens, over: decodeSeconds)
    }

    /// Divides `tokenCount` by `seconds`, yielding `0` for any duration that is not
    /// strictly greater than zero (this also covers NaN, since `NaN > 0` is false).
    private static func throughput(of tokenCount: Int, over seconds: Double) -> Double {
        guard seconds > 0 else { return 0 }
        return Double(tokenCount) / seconds
    }
 }
--- a/康康/AI/LLMSession.swift
+++ b/康康/AI/LLMSession.swift
@@ -8,6 +8,11 @@ import MLXLMCommon
 actor LLMSession {
    let container: ModelContainer
    /// Statistics of the most recent generation (taken from the `.info` completion event
    /// at the end of the stream; used for performance self-checks).
    private(set) var lastStats: GenerateStats?
    /// Stores `s` as `lastStats`, replacing any previously recorded value.
    private func record(_ s: GenerateStats) { lastStats = s }
    /// Creates a session that keeps a reference to the given model container.
    /// - Parameter container: The `ModelContainer` stored in `self.container`.
    init(container: ModelContainer) {
        self.container = container
    }
@@ -78,9 +83,14 @@ actor LLMSession {
                                    let rate = elapsed > 0 ? Double(produced) / elapsed : 0
                                    continuation.yield(TokenChunk(text: text, decodeRate: rate))
-                                case .info:
+                                case .info(let info):
                                    // 生成完成统计,是流的最后一个事件
-                                    break
+                                    await self.record(GenerateStats(
                                        promptTokens: info.promptTokenCount,
                                        genTokens: info.generationTokenCount,
                                        prefillSeconds: info.promptTime,
                                        decodeSeconds: info.generateTime
                                    ))
                                case .toolCall:
                                    // 纯文本生成不会触发,switch 穷举
--- a/康康/AI/MNNBackend.swift
+++ b/康康/AI/MNNBackend.swift
@@ -9,6 +9,11 @@ import Foundation
 actor MNNBackend {
    private var bridge: MNNLLMBridge?
    /// Statistics of the most recent generation (for AIRuntime to pick up after the
    /// stream ends; used for performance self-checks).
    private(set) var lastStats: GenerateStats?
    /// Stores `s` as `lastStats`, replacing any previously recorded value.
    private func record(_ s: GenerateStats) { lastStats = s }
    /// `true` only when a bridge is present and reports itself loaded; `false` when `bridge` is nil.
    var isLoaded: Bool { bridge?.isLoaded ?? false }
    /// 从 MNN 模型目录加载(目录含 MNN llm 的 config.json + llm.mnn + 权重 + tokenizer)。
@@ -35,10 +40,17 @@ actor MNNBackend {
        return AsyncThrowingStream { continuation in
            let meter = MNNRateMeter()
            let task = Task.detached(priority: .userInitiated) {
-                _ = box.value.generateText(prompt, maxTokens: Int32(maxTokens)) { piece in
+                let stats = box.value.generateText(prompt, maxTokens: Int32(maxTokens)) { piece in
                    let rate = meter.tick()
                    continuation.yield(TokenChunk(text: piece, decodeRate: rate))
                }
                // ObjC 统计对象先抽成 Sendable 的 GenerateStats 再跨 actor 记录。
                await self.record(GenerateStats(
                    promptTokens: Int(stats.promptTokens),
                    genTokens: Int(stats.genTokens),
                    prefillSeconds: stats.prefillMs / 1000.0,
                    decodeSeconds: stats.decodeMs / 1000.0
                ))
                continuation.finish()
            }
            continuation.onTermination = { _ in
@@ -58,9 +70,15 @@ actor MNNBackend {
            Task.detached(priority: .userInitiated) {
                let sink = MNNTextSink()
                do {
-                    _ = try box.value.analyzeImages(paths, prompt: prompt, maxTokens: Int32(maxTokens)) { piece in
+                    let stats = try box.value.analyzeImages(paths, prompt: prompt, maxTokens: Int32(maxTokens)) { piece in
                        sink.append(piece)
                    }
                    await self.record(GenerateStats(
                        promptTokens: Int(stats.promptTokens),
                        genTokens: Int(stats.genTokens),
                        prefillSeconds: stats.prefillMs / 1000.0,
                        decodeSeconds: stats.decodeMs / 1000.0
                    ))
                    cont.resume(returning: sink.text)
                } catch {
                    cont.resume(throwing: AIRuntimeError.inferenceFailed(error.localizedDescription))