diff --git a/康康/AI/GenerateStats.swift b/康康/AI/GenerateStats.swift
new file mode 100644
index 0000000..b4cadd2
--- /dev/null
+++ b/康康/AI/GenerateStats.swift
@@ -0,0 +1,19 @@
+import Foundation
+
+/// Performance statistics for a single generation, normalized across both backends (MNN / MLX).
+/// MNN values come from LlmContext (prefill_us / decode_us); MLX values come from GenerateCompletionInfo.
+struct GenerateStats: Sendable, Equatable {
+    var promptTokens: Int
+    var genTokens: Int
+    /// Prefill (prompt ingestion) time, in seconds.
+    var prefillSeconds: Double
+    /// Decode (token-by-token generation) time, in seconds.
+    var decodeSeconds: Double
+
+    var prefillTokensPerSecond: Double {
+        prefillSeconds > 0 ? Double(promptTokens) / prefillSeconds : 0
+    }
+    var decodeTokensPerSecond: Double {
+        decodeSeconds > 0 ? Double(genTokens) / decodeSeconds : 0
+    }
+}
diff --git a/康康/AI/LLMSession.swift b/康康/AI/LLMSession.swift
index c88d947..842f09e 100644
--- a/康康/AI/LLMSession.swift
+++ b/康康/AI/LLMSession.swift
@@ -8,6 +8,11 @@ import MLXLMCommon
 actor LLMSession {
     let container: ModelContainer
 
+    /// Stats of the most recent generation (taken from the trailing .info completion event of the stream; used for performance self-checks).
+    private(set) var lastStats: GenerateStats?
+
+    private func record(_ s: GenerateStats) { lastStats = s }
+
     init(container: ModelContainer) {
         self.container = container
     }
@@ -78,9 +83,14 @@ actor LLMSession {
                         let rate = elapsed > 0 ? Double(produced) / elapsed : 0
                         continuation.yield(TokenChunk(text: text, decodeRate: rate))
 
-                    case .info:
+                    case .info(let info):
                         // 生成完成统计,是流的最后一个事件
-                        break
+                        await self.record(GenerateStats(
+                            promptTokens: info.promptTokenCount,
+                            genTokens: info.generationTokenCount,
+                            prefillSeconds: info.promptTime,
+                            decodeSeconds: info.generateTime
+                        ))
 
                     case .toolCall:
                         // 纯文本生成不会触发,switch 穷举
diff --git a/康康/AI/MNNBackend.swift b/康康/AI/MNNBackend.swift
index 4357fbe..bee2acf 100644
--- a/康康/AI/MNNBackend.swift
+++ b/康康/AI/MNNBackend.swift
@@ -9,6 +9,11 @@ import Foundation
 actor MNNBackend {
     private var bridge: MNNLLMBridge?
 
+    /// Stats of the most recent generation (for AIRuntime to pick up after the stream ends; used for performance self-checks).
+    private(set) var lastStats: GenerateStats?
+
+    private func record(_ s: GenerateStats) { lastStats = s }
+
     var isLoaded: Bool { bridge?.isLoaded ?? false }
 
     /// 从 MNN 模型目录加载(目录含 MNN llm 的 config.json + llm.mnn + 权重 + tokenizer)。
@@ -35,10 +40,17 @@ actor MNNBackend {
         return AsyncThrowingStream { continuation in
             let meter = MNNRateMeter()
             let task = Task.detached(priority: .userInitiated) {
-                _ = box.value.generateText(prompt, maxTokens: Int32(maxTokens)) { piece in
+                let stats = box.value.generateText(prompt, maxTokens: Int32(maxTokens)) { piece in
                     let rate = meter.tick()
                     continuation.yield(TokenChunk(text: piece, decodeRate: rate))
                 }
+                // Extract the ObjC stats object into a Sendable GenerateStats first, then record it across the actor boundary.
+                await self.record(GenerateStats(
+                    promptTokens: Int(stats.promptTokens),
+                    genTokens: Int(stats.genTokens),
+                    prefillSeconds: stats.prefillMs / 1000.0,
+                    decodeSeconds: stats.decodeMs / 1000.0
+                ))
                 continuation.finish()
             }
             continuation.onTermination = { _ in
@@ -58,9 +70,15 @@ actor MNNBackend {
             Task.detached(priority: .userInitiated) {
                 let sink = MNNTextSink()
                 do {
-                    _ = try box.value.analyzeImages(paths, prompt: prompt, maxTokens: Int32(maxTokens)) { piece in
+                    let stats = try box.value.analyzeImages(paths, prompt: prompt, maxTokens: Int32(maxTokens)) { piece in
                         sink.append(piece)
                     }
+                    await self.record(GenerateStats(
+                        promptTokens: Int(stats.promptTokens),
+                        genTokens: Int(stats.genTokens),
+                        prefillSeconds: stats.prefillMs / 1000.0,
+                        decodeSeconds: stats.decodeMs / 1000.0
+                    ))
                     cont.resume(returning: sink.text)
                 } catch {
                     cont.resume(throwing: AIRuntimeError.inferenceFailed(error.localizedDescription))