kangkang/体己/AI/LLMSession.swift

import Foundation
import MLX
import MLXLLM
import MLXLMCommon

/// 封装 MLX 语言模型的流式生成,actor 保证单线程访问。
/// 基于 mlx-swift-examples 2.29.1(commit 9bff95ca)的 API。
actor LLMSession {
    let container: ModelContainer

    init(container: ModelContainer) {
        self.container = container
    }

    /// 从本地目录加载模型(包含 config.json + weights + tokenizer)。
    static func load(folderURL: URL) async throws -> LLMSession {
        #if targetEnvironment(simulator)
        // MLX 的 iOS Simulator GPU stream 初始化会在部分 Metal backend 路径中 abort。
        // 模拟器只用于功能调试,强制走 CPU;真机仍保留默认 GPU/ANE 相关路径。
        Device.setDefault(device: .cpu)
        #endif

        let configuration = ModelConfiguration(directory: folderURL)
        let container = try await LLMModelFactory.shared.loadContainer(
            configuration: configuration
        )
        return LLMSession(container: container)
    }

    /// 流式生成。返回的 AsyncThrowingStream 被取消时,内部 Task 也会取消。
    /// - Parameters:
    ///   - prompt: 原始 prompt 文本(经 processor 转 LMInput)
    ///   - maxTokens: 最大 token 数,由 GenerateParameters 控制
    func generate(prompt: String, maxTokens: Int) -> AsyncThrowingStream<TokenChunk, Error> {
        AsyncThrowingStream { continuation in
            let task = Task {
                do {
                    let parameters = GenerateParameters(
                        maxTokens: maxTokens,
                        temperature: Float(0.6),
                        topP: Float(0.9)
                    )

                    try await container.perform { (context: ModelContext) in
                        let userInput = UserInput(prompt: prompt)
                        let lmInput = try await context.processor.prepare(input: userInput)

                        let start = Date()
                        var produced = 0

                        for await event in try MLXLMCommon.generate(
                            input: lmInput,
                            parameters: parameters,
                            context: context
                        ) {
                            if Task.isCancelled { break }

                            switch event {
                            case .chunk(let text):
                                produced += 1
                                let elapsed = Date().timeIntervalSince(start)
                                let rate = elapsed > 0 ? Double(produced) / elapsed : 0
                                continuation.yield(TokenChunk(text: text, decodeRate: rate))

                            case .info:
                                // 生成完成统计,是流的最后一个事件
                                break

                            case .toolCall:
                                // 纯文本生成不会触发,switch 穷举
                                break
                            }
                        }
                        // 注:研究笔记里曾建议尾部 MLX.GPU.synchronize() 以确保
                        // GPU 操作全部完成。但 AsyncStream 已经 yield 真实解码后的
                        // 文字,GPU 是否完全空闲不影响数据正确性。去掉此调用同时省
                        // 一份 transitive import MLX 的依赖,简化 SPM 链接。
                    }
                    continuation.finish()
                } catch {
                    continuation.finish(throwing: error)
                }
            }
            continuation.onTermination = { _ in task.cancel() }
        }
    }
}