From ad1b045e12b7d8bb3fd77db26a902c689fded494 Mon Sep 17 00:00:00 2001
From: link2026 <brownbear2026@163.com>
Date: Mon, 25 May 2026 16:03:04 +0800
Subject: [PATCH] =?UTF-8?q?feat(ai):=20LLMSession=20=E6=8E=A5=20MLX-Swift,?=
 =?UTF-8?q?=E8=B7=91=20Qwen3-1.7B=20=E6=B5=81=E5=BC=8F=E7=94=9F=E6=88=90?=
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

按 W2 plan Task 6 + docs/superpowers/notes/2026-05-25-mlx-api-corrections.md
落地 LLM 推理底座:

- actor LLMSession 包装 MLXLLM.ModelContainer
- load(folderURL:) 用 ModelConfiguration(directory:) + LLMModelFactory.shared.loadContainer
- generate(prompt:maxTokens:) 返回 AsyncThrowingStream<TokenChunk, Error>
- 内部 container.perform { (context: ModelContext) in ... } 拿到模型上下文
- UserInput → processor.prepare → MLXLMCommon.generate(顶层函数, AsyncStream)
- Generation switch 穷举 3 个 case(chunk / info / toolCall)
- maxTokens 通过 GenerateParameters 传递,温度 0.6 topP 0.9
- 取消传播:continuation.onTermination 同步 task.cancel()
- 每 chunk yield 时计算 tok/s decodeRate

API 基线:mlx-swift-examples tag 2.29.1, commit 9bff95ca。

需用户手动:
1. Xcode 把 LLMSession.swift 拖入 体己 target (AI group)
2. ⌘B 验证 AIRuntime 不再报 "Cannot find LLMSession"
3. 把 ~/tiji-models/Qwen3-1.7B-4bit/ 拷到模拟器沙盒 Application Support/Models/
4. Task 7 (DebugAIRunner) 才能跑通

Co-Authored-By: Claude Opus 4.7 (1M context) <noreply@anthropic.com>
---
 体己/AI/LLMSession.swift | 78 ++++++++++++++++++++++++++++++++++++++++
 1 file changed, 78 insertions(+)
 create mode 100644 体己/AI/LLMSession.swift

diff --git a/体己/AI/LLMSession.swift b/体己/AI/LLMSession.swift
new file mode 100644
index 0000000..20b6dd5
--- /dev/null
+++ b/体己/AI/LLMSession.swift
@@ -0,0 +1,78 @@
+import Foundation
+import MLX
+import MLXLLM
+import MLXLMCommon
+
+/// Wraps streaming text generation on top of an MLX `ModelContainer`; declared as an actor so access to it is serialized.
+/// Written against the mlx-swift-examples 2.29.1 API (commit 9bff95ca).
+actor LLMSession {
+    let container: ModelContainer
+
+    init(container: ModelContainer) {
+        self.container = container
+    }
+
+    /// Loads a model from a local directory (expected to contain config.json + weights + tokenizer) and wraps it in a session.
+    static func load(folderURL: URL) async throws -> LLMSession {
+        let configuration = ModelConfiguration(directory: folderURL)
+        let container = try await LLMModelFactory.shared.loadContainer(
+            configuration: configuration
+        )
+        return LLMSession(container: container)
+    }
+
+    /// Streams generated text. When the returned stream terminates (e.g. the consumer cancels), the internal Task is cancelled as well.
+    /// - Parameters:
+    ///   - prompt: Raw prompt text (turned into an LMInput by the context's processor).
+    ///   - maxTokens: Maximum number of tokens, forwarded through GenerateParameters.
+    func generate(prompt: String, maxTokens: Int) -> AsyncThrowingStream<TokenChunk, Error> {
+        AsyncThrowingStream { continuation in
+            let task = Task {
+                do {
+                    let parameters = GenerateParameters(
+                        maxTokens: maxTokens,
+                        temperature: Float(0.6),
+                        topP: Float(0.9)
+                    )
+
+                    try await container.perform { (context: ModelContext) in
+                        let userInput = UserInput(prompt: prompt)
+                        let lmInput = try await context.processor.prepare(input: userInput)
+
+                        let start = Date()  // timer starts before generate(), so the rate also covers time spent before the first chunk
+                        var produced = 0  // number of .chunk events seen; used as the numerator of decodeRate
+
+                        for await event in try MLXLMCommon.generate(
+                            input: lmInput,
+                            parameters: parameters,
+                            context: context
+                        ) {
+                            if Task.isCancelled { break }  // stop consuming events once cancellation is observed
+
+                            switch event {
+                            case .chunk(let text):
+                                produced += 1
+                                let elapsed = Date().timeIntervalSince(start)
+                                let rate = elapsed > 0 ? Double(produced) / elapsed : 0
+                                continuation.yield(TokenChunk(text: text, decodeRate: rate))
+
+                            case .info:
+                                // Completion statistics (expected to be the stream's last event); intentionally ignored.
+                                break
+
+                            case .toolCall:
+                                // Not expected for plain-text generation; listed so the switch stays exhaustive.
+                                break
+                            }
+                        }
+                        MLX.GPU.synchronize()
+                    }
+                    continuation.finish()  // reached on normal completion and after a cancellation break
+                } catch {
+                    continuation.finish(throwing: error)
+                }
+            }
+            continuation.onTermination = { _ in task.cancel() }  // propagate stream termination to the worker Task
+        }
+    }
+}