feat(AI): 双后端路由 MNN/MLX,AIRuntime 按引擎分发(Phase 3 核心)
- InferenceEngine:引擎枚举(.mnn 默认 / .mlx 兜底)+ UserDefaults 持久化 + 可用性/SME2 运行时探测(经 MNNLLMBridge) - MNNBackend:actor 封装 MNNLLMBridge 文本流式生成,detached 线程跑同步 response、按 UTF-8 边界 yield TokenChunk,串行化交给 AIRuntime 闸门 - AIRuntime:prepare/generate 按引擎分发;.mnn 且模型就绪→MNN,否则回退 MLX (过渡期 App 始终可用);prepareVL/单模型常驻时互卸 MNN↔MLX 释放内存 公有 API 不变,各 Service 零改动 模拟器 BUILD SUCCEEDED,0 error。引擎切换 UI + SME2 指示留待 Phase 5。 Co-Authored-By: Claude Opus 4.8 (1M context) <noreply@anthropic.com>
This commit is contained in:
@@ -32,6 +32,15 @@ actor AIRuntime {
|
||||
private var llmSession: LLMSession?
|
||||
private var vlSession: VLSession?
|
||||
|
||||
// MARK: - MNN 后端(CPU/SME2,挑战赛考核路径)
|
||||
// 文本生成在 .mnn 引擎下走 MNN;VL(图→文)暂仍走 MLX(MNN VL 需 OMNI 构建)。
|
||||
private let mnn = MNNBackend()
|
||||
private(set) var mnnStatus: Status = .notReady
|
||||
/// Directory that holds the MNN model: the `Qwen3.5-2B-MNN` sub-directory of
/// `ModelStore.shared.rootURL`. Per the original note, the model is downloaded
/// or side-loaded into this location.
nonisolated static var mnnModelFolder: URL {
    let modelsRoot = ModelStore.shared.rootURL
    return modelsRoot.appendingPathComponent("Qwen3.5-2B-MNN", isDirectory: true)
}
|
||||
|
||||
// MARK: - 串行推理闸门(§3.1 OOM 防护的真正落地)
|
||||
//
|
||||
// actor 只串行化「方法入口」,但 generate() 同步返回流、真正解码在内部 Task;
|
||||
@@ -78,8 +87,19 @@ actor AIRuntime {
|
||||
#endif
|
||||
}
|
||||
|
||||
/// 加载模型。首次调用会真正加载,后续幂等。
|
||||
/// 加载文本模型。首次调用会真正加载,后续幂等。
|
||||
/// 按当前引擎路由:.mnn → MNN(CPU/SME2);.mlx → 现有 MLX(GPU)。
|
||||
func prepare() async throws {
|
||||
// 选了 MNN 且模型已就绪才走 MNN;否则(选 MLX,或 MNN 模型尚未下载)回退 MLX,
|
||||
// 保证过渡期 App 始终可用。引擎指示器(Phase 5)展示实际生效后端。
|
||||
let mnnReady = FileManager.default.fileExists(
|
||||
atPath: Self.mnnModelFolder.appendingPathComponent("config.json").path)
|
||||
if InferenceEngine.current == .mnn, mnnReady {
|
||||
try await prepareMNN()
|
||||
return
|
||||
}
|
||||
// 走 MLX:先卸 MNN 释放内存(单模型常驻策略)。
|
||||
await unloadMNN()
|
||||
// 已有其他调用方在加载时,轮询等其结束再判定结果。
|
||||
// 不能像旧实现那样裸 return:那会让调用方误以为已 ready,随后 generate 的
|
||||
// `guard status == .ready` 失败 → 用户撞上「假错误屏」(模型其实正常加载中)。
|
||||
@@ -119,9 +139,52 @@ actor AIRuntime {
|
||||
}
|
||||
}
|
||||
|
||||
/// Loads the MNN text model; returns immediately when it is already loaded.
///
/// Only one model stays resident: the MLX LLM and VL sessions are unloaded
/// before the MNN backend is asked to load.
///
/// - Throws: `AIRuntimeError.notReady` when `config.json` is missing from the
///   model folder, `AIRuntimeError.modelLoadFailed` when the backend fails to
///   load, or the error thrown by `Task.sleep` while waiting on another load.
private func prepareMNN() async throws {
    // Another caller is mid-load: poll every 80 ms until that load settles.
    let pollInterval: UInt64 = 80_000_000
    while mnnStatus == .loading {
        try await Task.sleep(nanoseconds: pollInterval)
    }
    guard mnnStatus != .ready else { return }

    let modelFolder = Self.mnnModelFolder
    let configPath = modelFolder.appendingPathComponent("config.json").path
    if !FileManager.default.fileExists(atPath: configPath) {
        mnnStatus = .error("MNN 模型未就绪")
        throw AIRuntimeError.notReady
    }

    await acquireGate()
    defer { releaseGate() }
    // Re-check after the suspension: another caller may have finished loading
    // while this one was waiting for the gate.
    guard mnnStatus != .ready else { return }

    // Single-resident-model policy: drop the MLX sessions first.
    unloadLLM()
    unloadVL()

    mnnStatus = .loading
    do {
        try await mnn.load(folderURL: modelFolder)
    } catch {
        let reason = "\(error)"
        mnnStatus = .error(reason)
        throw AIRuntimeError.modelLoadFailed(reason)
    }
    mnnStatus = .ready
}
|
||||
|
||||
/// Unloads the MNN backend and resets `mnnStatus` to `.notReady`.
/// Does nothing when the status is already `.notReady`.
private func unloadMNN() async {
    if mnnStatus == .notReady { return }
    await mnn.unload()
    mnnStatus = .notReady
    // The MLX memory cache is cleared here as well.
    MLX.Memory.clearCache()
}
|
||||
|
||||
/// 流式生成。调用前应先 await prepare()。
|
||||
/// 注意:返回流是同步创建的,但跨 actor 调用 LLMSession 需要 await。
|
||||
func generate(prompt: String, maxTokens: Int = 256) -> AsyncThrowingStream<TokenChunk, Error> {
|
||||
if InferenceEngine.current == .mnn, mnnStatus == .ready {
|
||||
return mnnGenerate(prompt: prompt, maxTokens: maxTokens)
|
||||
}
|
||||
// 在 actor 隔离上下文中捕获快照,Task 内不再访问 self.status / self.llmSession
|
||||
let snapshotStatus = status
|
||||
let snapshotSession = llmSession
|
||||
@@ -159,6 +222,33 @@ actor AIRuntime {
|
||||
}
|
||||
}
|
||||
|
||||
/// Streams text generated by the MNN backend.
///
/// The work runs in a `Task` that acquires the gate, forwards every chunk from
/// `mnn.generate` to the returned stream while recording its decode rate, and
/// releases the gate when the loop ends. Terminating the returned stream
/// cancels that task.
///
/// - Parameters:
///   - prompt: Prompt text passed to the backend unchanged.
///   - maxTokens: Token limit passed to the backend unchanged.
/// - Returns: A stream that fails with `AIRuntimeError.notReady` when the model
///   was not ready at call time, or `AIRuntimeError.inferenceFailed` when the
///   backend stream (or the cancellation check) throws.
private func mnnGenerate(prompt: String, maxTokens: Int) -> AsyncThrowingStream<TokenChunk, Error> {
    // Snapshot taken synchronously, before the task below starts running.
    let isReady = mnnStatus == .ready
    return AsyncThrowingStream { continuation in
        let worker = Task {
            if !isReady {
                continuation.finish(throwing: AIRuntimeError.notReady)
                return
            }
            await self.acquireGate()
            do {
                let chunks = await self.mnn.generate(prompt: prompt, maxTokens: maxTokens)
                for try await chunk in chunks {
                    try Task.checkCancellation()
                    self.recordRate(chunk.decodeRate)
                    continuation.yield(chunk)
                }
                continuation.finish()
            } catch {
                continuation.finish(throwing: AIRuntimeError.inferenceFailed("\(error)"))
            }
            // NOTE(review): the gate is released as soon as this loop ends; whether
            // the backend has fully stopped by then after a cancel is not visible
            // from here — confirm.
            self.releaseGate()
        }
        continuation.onTermination = { _ in worker.cancel() }
    }
}
|
||||
|
||||
/// Remembers `rate` as `lastDecodeRate`; values that are not strictly positive
/// are ignored.
private func recordRate(_ rate: Double) {
    guard rate > 0 else { return }
    lastDecodeRate = rate
}
|
||||
@@ -186,6 +276,7 @@ actor AIRuntime {
|
||||
|
||||
// OOM 闸门(§3.1):加载 VL(~3GB)前先卸 LLM(~1GB),否则两者常驻叠加冲过内存上限被 jetsam 杀。
|
||||
unloadLLM()
|
||||
await unloadMNN()
|
||||
|
||||
vlStatus = .loading
|
||||
do {
|
||||
|
||||
41
康康/AI/InferenceEngine.swift
Normal file
41
康康/AI/InferenceEngine.swift
Normal file
@@ -0,0 +1,41 @@
|
||||
import Foundation
|
||||
|
||||
/// On-device inference engine selection.
/// - mnn: MNN on CPU/SME2; the default choice when nothing valid is stored.
/// - mlx: MLX on GPU; always reported as available and used as the fallback.
nonisolated enum InferenceEngine: String, CaseIterable, Sendable {
    case mnn
    case mlx

    /// Human-readable label for the engine.
    var displayName: String {
        switch self {
        case .mnn: "MNN · CPU/SME2"
        case .mlx: "MLX · GPU"
        }
    }

    /// Whether this engine can be used. `.mlx` is always available; `.mnn`
    /// defers to `MNNLLMBridge.isAvailable()`.
    var isAvailable: Bool {
        switch self {
        case .mnn: MNNLLMBridge.isAvailable()
        case .mlx: true
        }
    }

    // MARK: - Persistence (UserDefaults)

    private static let key = "kk.inferenceEngine"

    /// The selected engine, persisted in `UserDefaults.standard`.
    ///
    /// Reading falls back to `.mnn` when nothing valid is stored, and then to
    /// `.mlx` when the chosen engine is not available. Writing stores the raw
    /// value without any availability check.
    static var current: InferenceEngine {
        get {
            var preferred = InferenceEngine.mnn
            if let stored = UserDefaults.standard.string(forKey: key),
               let decoded = InferenceEngine(rawValue: stored) {
                preferred = decoded
            }
            guard preferred.isAvailable else { return .mlx }
            return preferred
        }
        set {
            UserDefaults.standard.set(newValue.rawValue, forKey: key)
        }
    }

    /// Runtime probe forwarded to `MNNLLMBridge.cpuSupportsSME2()`.
    /// Per the original note it is meant for showing acceleration status in the UI.
    static var cpuSupportsSME2: Bool {
        MNNLLMBridge.cpuSupportsSME2()
    }
}
|
||||
72
康康/AI/MNNBackend.swift
Normal file
72
康康/AI/MNNBackend.swift
Normal file
@@ -0,0 +1,72 @@
|
||||
import Foundation
|
||||
|
||||
/// MNN inference backend wrapping `MNNLLMBridge` text generation.
///
/// The actor owns the bridge instance. Per the original notes, serialization
/// across calls is expected to come from the upstream `AIRuntime` gate, and
/// image-to-text is not supported by the current build, so `analyze` always throws.
actor MNNBackend {
    private var bridge: MNNLLMBridge?

    /// Whether a bridge exists and reports itself as loaded.
    var isLoaded: Bool { bridge?.isLoaded ?? false }

    /// Creates the bridge from `config.json` inside `folderURL`.
    ///
    /// - Throws: `AIRuntimeError.notReady` when `config.json` does not exist,
    ///   `AIRuntimeError.modelLoadFailed` when the bridge initializer returns `nil`.
    func load(folderURL: URL) throws {
        let configPath = folderURL.appendingPathComponent("config.json").path
        guard FileManager.default.fileExists(atPath: configPath) else {
            throw AIRuntimeError.notReady
        }
        guard let b = MNNLLMBridge(configPath: configPath) else {
            throw AIRuntimeError.modelLoadFailed("MNN createLLM/load 失败")
        }
        bridge = b
    }

    /// Drops the actor's reference to the bridge.
    func unload() { bridge = nil }

    /// Streams generated text as `TokenChunk`s.
    ///
    /// `generateText` is called synchronously inside a detached task; every piece
    /// it reports is yielded together with the running rate from `MNNRateMeter`.
    /// Terminating the stream calls `cancel()` on the bridge and cancels the task.
    ///
    /// - Parameters:
    ///   - prompt: Prompt text handed to the bridge.
    ///   - maxTokens: Token limit; clamped into the `Int32` range the bridge takes.
    /// - Returns: A stream that fails with `AIRuntimeError.notReady` when no
    ///   bridge is loaded.
    func generate(prompt: String, maxTokens: Int) -> AsyncThrowingStream<TokenChunk, Error> {
        guard let bridge else {
            return AsyncThrowingStream { $0.finish(throwing: AIRuntimeError.notReady) }
        }
        let box = MNNUncheckedBox(bridge)
        // `Int32(_:)` traps on out-of-range values; clamp so an oversized limit
        // cannot crash the app.
        let tokenLimit = Int32(clamping: maxTokens)
        return AsyncThrowingStream { continuation in
            let meter = MNNRateMeter()
            let task = Task.detached(priority: .userInitiated) {
                // The consumer may already be gone before this task gets to run;
                // do not start a blocking generation nobody will read.
                guard !Task.isCancelled else {
                    continuation.finish()
                    return
                }
                // NOTE(review): the result of `generateText` is discarded, so a
                // failure reported through it ends the stream normally — confirm
                // the bridge's return contract.
                _ = box.value.generateText(prompt, maxTokens: tokenLimit) { piece in
                    let rate = meter.tick()
                    continuation.yield(TokenChunk(text: piece, decodeRate: rate))
                }
                continuation.finish()
            }
            continuation.onTermination = { _ in
                box.value.cancel()
                task.cancel()
            }
        }
    }

    /// Image-to-text entry point. Always throws `AIRuntimeError.inferenceFailed`;
    /// per the original note the caller is expected to fall back to the MLX VL path.
    func analyze(imageURLs: [URL], prompt: String, maxTokens: Int) throws -> String {
        throw AIRuntimeError.inferenceFailed("MNN 当前构建不支持 VL(需 OMNI)")
    }
}
|
||||
|
||||
/// Wrapper that lets a non-`Sendable` value be captured by a detached task.
/// The `@unchecked Sendable` conformance is not verified by the compiler; per
/// the original note, safety relies on the `AIRuntime` gate allowing only one
/// generation at a time.
private nonisolated struct MNNUncheckedBox<Wrapped>: @unchecked Sendable {
    let value: Wrapped

    init(_ wrapped: Wrapped) {
        value = wrapped
    }
}
|
||||
|
||||
/// Running rate meter: each `tick()` counts one produced piece and returns
/// pieces per second since the meter was created.
///
/// Elapsed time is measured with `ProcessInfo.systemUptime`, a monotonic clock,
/// so wall-clock adjustments cannot distort the rate. The counter is not
/// synchronized; per the original note, `tick()` is expected to be called from
/// a single thread.
private nonisolated final class MNNRateMeter: @unchecked Sendable {
    private let startUptime = ProcessInfo.processInfo.systemUptime
    private var produced = 0

    /// Registers one produced piece.
    /// - Returns: Pieces per second so far, or `0` when no time has elapsed.
    func tick() -> Double {
        produced += 1
        let elapsed = ProcessInfo.processInfo.systemUptime - startUptime
        return elapsed > 0 ? Double(produced) / elapsed : 0
    }
}
|
||||
Reference in New Issue
Block a user