feat(AI): 双后端路由 MNN/MLX,AIRuntime 按引擎分发(Phase 3 核心)
- InferenceEngine:引擎枚举(.mnn 默认 / .mlx 兜底)+ UserDefaults 持久化 + 可用性/SME2 运行时探测(经 MNNLLMBridge) - MNNBackend:actor 封装 MNNLLMBridge 文本流式生成,detached 线程跑同步 response、按 UTF-8 边界 yield TokenChunk,串行化交给 AIRuntime 闸门 - AIRuntime:prepare/generate 按引擎分发;.mnn 且模型就绪→MNN,否则回退 MLX (过渡期 App 始终可用);prepareVL/单模型常驻时互卸 MNN↔MLX 释放内存 公有 API 不变,各 Service 零改动 模拟器 BUILD SUCCEEDED,0 error。引擎切换 UI + SME2 指示留待 Phase 5。 Co-Authored-By: Claude Opus 4.8 (1M context) <noreply@anthropic.com>
This commit is contained in:
@@ -32,6 +32,15 @@ actor AIRuntime {
|
||||
private var llmSession: LLMSession?
|
||||
private var vlSession: VLSession?
|
||||
|
||||
// MARK: - MNN 后端(CPU/SME2,挑战赛考核路径)
|
||||
// 文本生成在 .mnn 引擎下走 MNN;VL(图→文)暂仍走 MLX(MNN VL 需 OMNI 构建)。
|
||||
private let mnn = MNNBackend()
|
||||
private(set) var mnnStatus: Status = .notReady
|
||||
/// Directory holding the MNN model (downloaded or side-loaded into
/// `Models/Qwen3.5-2B-MNN`), resolved under the shared model store root.
nonisolated static var mnnModelFolder: URL {
    let storeRoot = ModelStore.shared.rootURL
    return storeRoot.appendingPathComponent("Qwen3.5-2B-MNN", isDirectory: true)
}
|
||||
|
||||
// MARK: - 串行推理闸门(§3.1 OOM 防护的真正落地)
|
||||
//
|
||||
// actor 只串行化「方法入口」,但 generate() 同步返回流、真正解码在内部 Task;
|
||||
@@ -78,8 +87,19 @@ actor AIRuntime {
|
||||
#endif
|
||||
}
|
||||
|
||||
/// 加载模型。首次调用会真正加载,后续幂等。
|
||||
/// 加载文本模型。首次调用会真正加载,后续幂等。
|
||||
/// 按当前引擎路由:.mnn → MNN(CPU/SME2);.mlx → 现有 MLX(GPU)。
|
||||
func prepare() async throws {
|
||||
// 选了 MNN 且模型已就绪才走 MNN;否则(选 MLX,或 MNN 模型尚未下载)回退 MLX,
|
||||
// 保证过渡期 App 始终可用。引擎指示器(Phase 5)展示实际生效后端。
|
||||
let mnnReady = FileManager.default.fileExists(
|
||||
atPath: Self.mnnModelFolder.appendingPathComponent("config.json").path)
|
||||
if InferenceEngine.current == .mnn, mnnReady {
|
||||
try await prepareMNN()
|
||||
return
|
||||
}
|
||||
// 走 MLX:先卸 MNN 释放内存(单模型常驻策略)。
|
||||
await unloadMNN()
|
||||
// 已有其他调用方在加载时,轮询等其结束再判定结果。
|
||||
// 不能像旧实现那样裸 return:那会让调用方误以为已 ready,随后 generate 的
|
||||
// `guard status == .ready` 失败 → 用户撞上「假错误屏」(模型其实正常加载中)。
|
||||
@@ -119,9 +139,52 @@ actor AIRuntime {
|
||||
}
|
||||
}
|
||||
|
||||
/// Loads the MNN text model. Idempotent.
///
/// Single-resident-model policy: the MLX LLM and VL sessions are unloaded
/// before the MNN model is loaded, so the two backends never hold weights
/// at the same time.
///
/// - Throws: `AIRuntimeError.notReady` when the model's `config.json` is
///   missing, `AIRuntimeError.modelLoadFailed` when the backend fails to
///   load, or the error thrown by `Task.sleep` while waiting for another
///   in-flight load.
private func prepareMNN() async throws {
    // Another caller is already loading: poll until it settles, then decide.
    while mnnStatus == .loading {
        try await Task.sleep(nanoseconds: 80_000_000)
    }
    guard mnnStatus != .ready else { return }

    let modelDirectory = Self.mnnModelFolder
    let configPath = modelDirectory.appendingPathComponent("config.json").path
    if !FileManager.default.fileExists(atPath: configPath) {
        mnnStatus = .error("MNN 模型未就绪")
        throw AIRuntimeError.notReady
    }

    await acquireGate()
    defer { releaseGate() }
    // Someone else may have finished loading while we waited for the gate.
    guard mnnStatus != .ready else { return }

    // Single resident model: drop the MLX LLM/VL so they do not stack with MNN.
    unloadLLM()
    unloadVL()

    mnnStatus = .loading
    do {
        try await mnn.load(folderURL: modelDirectory)
    } catch {
        let reason = "\(error)"
        mnnStatus = .error(reason)
        throw AIRuntimeError.modelLoadFailed(reason)
    }
    mnnStatus = .ready
}
|
||||
|
||||
/// Unloads the MNN backend, releasing the bridge and its weights. Idempotent:
/// returns immediately when the status is already `.notReady`.
private func unloadMNN() async {
    if mnnStatus == .notReady { return }

    await mnn.unload()
    mnnStatus = .notReady
    // Also clears the MLX memory cache once the MNN backend has been unloaded.
    MLX.Memory.clearCache()
}
|
||||
|
||||
/// 流式生成。调用前应先 await prepare()。
|
||||
/// 注意:返回流是同步创建的,但跨 actor 调用 LLMSession 需要 await。
|
||||
func generate(prompt: String, maxTokens: Int = 256) -> AsyncThrowingStream<TokenChunk, Error> {
|
||||
if InferenceEngine.current == .mnn, mnnStatus == .ready {
|
||||
return mnnGenerate(prompt: prompt, maxTokens: maxTokens)
|
||||
}
|
||||
// 在 actor 隔离上下文中捕获快照,Task 内不再访问 self.status / self.llmSession
|
||||
let snapshotStatus = status
|
||||
let snapshotSession = llmSession
|
||||
@@ -159,6 +222,33 @@ actor AIRuntime {
|
||||
}
|
||||
}
|
||||
|
||||
/// Streams text generated by the MNN (CPU/SME2) backend.
///
/// Takes the serial inference gate, decodes through `mnn`, records the
/// decode rate of each chunk, and forwards the chunks to the returned stream.
///
/// Readiness is checked twice: once when the stream is created and again
/// after the gate has been acquired. Waiting for the gate is a suspension
/// point, so other work on this actor may unload the MNN model in between;
/// the second check reports `notReady` instead of decoding against an
/// unloaded backend.
///
/// - Parameters:
///   - prompt: Prompt text handed to the MNN backend.
///   - maxTokens: Token limit forwarded to the MNN backend.
/// - Returns: A stream of `TokenChunk` values. It finishes with
///   `AIRuntimeError.notReady` when the model is not loaded, or with
///   `AIRuntimeError.inferenceFailed` when decoding throws (including
///   cancellation).
private func mnnGenerate(prompt: String, maxTokens: Int) -> AsyncThrowingStream<TokenChunk, Error> {
    let ready = (mnnStatus == .ready)
    return AsyncThrowingStream { continuation in
        let task = Task {
            guard ready else {
                continuation.finish(throwing: AIRuntimeError.notReady)
                return
            }
            await self.acquireGate()
            // Re-validate under the gate: the model may have been unloaded
            // while this task was suspended waiting for its turn.
            if self.mnnStatus == .ready {
                do {
                    let stream = await self.mnn.generate(prompt: prompt, maxTokens: maxTokens)
                    for try await chunk in stream {
                        try Task.checkCancellation()
                        self.recordRate(chunk.decodeRate)
                        continuation.yield(chunk)
                    }
                    continuation.finish()
                } catch {
                    continuation.finish(throwing: AIRuntimeError.inferenceFailed("\(error)"))
                }
            } else {
                continuation.finish(throwing: AIRuntimeError.notReady)
            }
            // Single release point, reached on every path after the gate is taken.
            self.releaseGate()
        }
        // Consumer stopped listening (or the stream finished): cancel decoding.
        continuation.onTermination = { _ in task.cancel() }
    }
}
|
||||
|
||||
/// Stores `rate` as the latest decode rate; values that are not greater
/// than zero are ignored.
private func recordRate(_ rate: Double) {
    guard rate > 0 else { return }
    lastDecodeRate = rate
}
|
||||
@@ -186,6 +276,7 @@ actor AIRuntime {
|
||||
|
||||
// OOM 闸门(§3.1):加载 VL(~3GB)前先卸 LLM(~1GB),否则两者常驻叠加冲过内存上限被 jetsam 杀。
|
||||
unloadLLM()
|
||||
await unloadMNN()
|
||||
|
||||
vlStatus = .loading
|
||||
do {
|
||||
|
||||
Reference in New Issue
Block a user