Files
kangkang/体己/AI/LLMSession.swift
link2026 ad1b045e12 feat(ai): LLMSession 接 MLX-Swift,跑 Qwen3-1.7B 流式生成
按 W2 plan Task 6 + docs/superpowers/notes/2026-05-25-mlx-api-corrections.md
落地 LLM 推理底座:

- actor LLMSession 包装 MLXLLM.ModelContainer
- load(folderURL:) 用 ModelConfiguration(directory:) + LLMModelFactory.shared.loadContainer
- generate(prompt:maxTokens:) 返回 AsyncThrowingStream<TokenChunk, Error>
- 内部 container.perform { (context: ModelContext) in ... } 拿到模型上下文
- UserInput → processor.prepare → MLXLMCommon.generate(顶层函数, AsyncStream)
- Generation switch 穷举 3 个 case(chunk / info / toolCall)
- maxTokens 通过 GenerateParameters 传递,温度 0.6 topP 0.9
- 取消传播:continuation.onTermination 同步 task.cancel()
- 每 chunk yield 时计算 tok/s decodeRate

API 基线:mlx-swift-examples tag 2.29.1, commit 9bff95ca。

需用户手动:
1. Xcode 把 LLMSession.swift 拖入 体己 target (AI group)
2. ⌘B 验证 AIRuntime 不再报 "Cannot find LLMSession"
3. 把 ~/tiji-models/Qwen3-1.7B-4bit/ 拷到模拟器沙盒 Application Support/Models/
4. Task 7 (DebugAIRunner) 才能跑通

Co-Authored-By: Claude Opus 4.7 (1M context) <noreply@anthropic.com>
2026-05-25 16:03:04 +08:00

79 lines
3.0 KiB
Swift

import Foundation
import MLX
import MLXLLM
import MLXLMCommon
/// Actor wrapper around an MLX `ModelContainer` that exposes streaming text generation.
///
/// API baseline noted by the original author: mlx-swift-examples 2.29.1 (commit 9bff95ca).
actor LLMSession {
    /// The loaded model container; all inference goes through `container.perform`.
    let container: ModelContainer

    /// Wraps an already-loaded model container.
    init(container: ModelContainer) {
        self.container = container
    }

    /// Loads a model from a local directory and wraps it in a session.
    ///
    /// - Parameter folderURL: Directory passed to `ModelConfiguration(directory:)`
    ///   (expected to hold config.json, weights and tokenizer files).
    /// - Returns: A session backed by the loaded container.
    /// - Throws: Any error thrown by `LLMModelFactory.shared.loadContainer`.
    static func load(folderURL: URL) async throws -> LLMSession {
        let configuration = ModelConfiguration(directory: folderURL)
        let container = try await LLMModelFactory.shared.loadContainer(
            configuration: configuration
        )
        return LLMSession(container: container)
    }

    /// Streams generated text for `prompt` as an `AsyncThrowingStream`.
    ///
    /// Generation runs in an unstructured `Task`. When the stream terminates (for
    /// example, the consumer stops iterating), that task is cancelled; the loop
    /// notices on the next event and the stream finishes without an error.
    ///
    /// - Parameters:
    ///   - prompt: Prompt text; converted to model input by the context's processor.
    ///   - maxTokens: Token limit forwarded through `GenerateParameters`.
    /// - Returns: A stream of `TokenChunk` values, each carrying the chunk text and
    ///   the running rate (chunks yielded so far divided by elapsed seconds).
    func generate(prompt: String, maxTokens: Int) -> AsyncThrowingStream<TokenChunk, Error> {
        AsyncThrowingStream { continuation in
            let task = Task {
                do {
                    let parameters = GenerateParameters(
                        maxTokens: maxTokens,
                        temperature: Float(0.6),
                        topP: Float(0.9)
                    )
                    try await container.perform { (context: ModelContext) in
                        let userInput = UserInput(prompt: prompt)
                        let lmInput = try await context.processor.prepare(input: userInput)

                        // Monotonic clock: unlike `Date()`, system uptime is not affected by
                        // wall-clock adjustments (NTP, manual time changes) mid-generation.
                        let start = ProcessInfo.processInfo.systemUptime
                        // Counts `.chunk` events; presumably about one per token — confirm
                        // against the MLXLMCommon detokenizer before treating this as tok/s.
                        var produced = 0

                        for await event in try MLXLMCommon.generate(
                            input: lmInput,
                            parameters: parameters,
                            context: context
                        ) {
                            if Task.isCancelled { break }
                            switch event {
                            case .chunk(let text):
                                produced += 1
                                let elapsed = ProcessInfo.processInfo.systemUptime - start
                                // Guard against a zero interval on the very first chunk.
                                let rate = elapsed > 0 ? Double(produced) / elapsed : 0
                                continuation.yield(TokenChunk(text: text, decodeRate: rate))
                            case .info:
                                // Completion info is not forwarded to the stream.
                                break
                            case .toolCall:
                                // Tool calls are ignored; listed so the switch stays exhaustive.
                                break
                            }
                        }
                        // NOTE(review): presumably waits for outstanding GPU work before
                        // leaving the container's context — confirm against MLX docs.
                        MLX.GPU.synchronize()
                    }
                    continuation.finish()
                } catch {
                    continuation.finish(throwing: error)
                }
            }
            // Propagate stream termination to the producer task.
            continuation.onTermination = { _ in task.cancel() }
        }
    }
}