feat(AI): 双后端路由 MNN/MLX,AIRuntime 按引擎分发(Phase 3 核心)

- InferenceEngine:引擎枚举(.mnn 默认 / .mlx 兜底)+ UserDefaults 持久化
  + 可用性/SME2 运行时探测(经 MNNLLMBridge)
- MNNBackend:actor 封装 MNNLLMBridge 文本流式生成,detached 线程跑同步
  response、按 UTF-8 边界 yield TokenChunk,串行化交给 AIRuntime 闸门
- AIRuntime:prepare/generate 按引擎分发;.mnn 且模型就绪→MNN,否则回退 MLX
  (过渡期 App 始终可用);prepareVL/单模型常驻时互卸 MNN↔MLX 释放内存
  公有 API 不变,各 Service 零改动

模拟器 BUILD SUCCEEDED,0 error。引擎切换 UI + SME2 指示留待 Phase 5。

Co-Authored-By: Claude Opus 4.8 (1M context) <noreply@anthropic.com>
This commit is contained in:
link2026
2026-06-08 18:58:27 +08:00
parent afc6a79dd7
commit f6c0ba7077
3 changed files with 205 additions and 1 deletions

View File

@@ -32,6 +32,15 @@ actor AIRuntime {
private var llmSession: LLMSession? private var llmSession: LLMSession?
private var vlSession: VLSession? private var vlSession: VLSession?
// MARK: - MNN backend (CPU/SME2)
// Text generation goes through `mnn` when the `.mnn` engine is selected and loaded; vision-language (VL) requests stay on MLX, because this MNN build has no VL support.
private let mnn = MNNBackend()
private(set) var mnnStatus: Status = .notReady
/// Folder expected to contain the MNN model: `Qwen3.5-2B-MNN` under `ModelStore.shared.rootURL`.
nonisolated static var mnnModelFolder: URL {
ModelStore.shared.rootURL.appendingPathComponent("Qwen3.5-2B-MNN", isDirectory: true)
}
// MARK: - (§3.1 OOM ) // MARK: - (§3.1 OOM )
// //
// actor , generate() Task; // actor , generate() Task;
@@ -78,8 +87,19 @@ actor AIRuntime {
#endif #endif
} }
/// , /// ,
/// Engine dispatch: `.mnn` uses MNN (CPU/SME2); `.mlx` uses MLX (GPU).
func prepare() async throws { func prepare() async throws {
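// Use MNN when it is the selected engine and its model is on disk; otherwise (MLX selected, or the MNN model is missing) fall back to MLX,
// so the app stays usable during the transition (the engine-switch UI is planned for Phase 5).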
let mnnReady = FileManager.default.fileExists(
atPath: Self.mnnModelFolder.appendingPathComponent("config.json").path)
if InferenceEngine.current == .mnn, mnnReady {
try await prepareMNN()
return
}
// MLX path: unload MNN first so only one model stays resident (frees memory).
await unloadMNN()
// , // ,
// return: ready, generate // return: ready, generate
// `guard status == .ready` () // `guard status == .ready` ()
@@ -119,9 +139,52 @@ actor AIRuntime {
} }
} }
/// Loads the MNN text model, evicting the MLX LLM/VL sessions first so that only one model is resident.
///
/// Polls while another load is in flight, returns immediately if the model is already ready,
/// and performs the actual load while holding the runtime gate.
///
/// - Throws: `AIRuntimeError.notReady` when `config.json` is missing from `mnnModelFolder`;
///   `AIRuntimeError.modelLoadFailed` when the backend fails to load;
///   whatever `Task.sleep` throws while waiting for a concurrent load.
private func prepareMNN() async throws {
    // Someone else is mid-load: poll every 80 ms until the status settles.
    while mnnStatus == .loading {
        try await Task.sleep(nanoseconds: 80_000_000)
    }
    guard mnnStatus != .ready else { return }

    let modelDirectory = Self.mnnModelFolder
    let configExists = FileManager.default.fileExists(
        atPath: modelDirectory.appendingPathComponent("config.json").path)
    if !configExists {
        mnnStatus = .error("MNN 模型未就绪")
        throw AIRuntimeError.notReady
    }

    await acquireGate()
    defer { releaseGate() }
    // The status may have changed while this call was suspended waiting for the gate.
    guard mnnStatus != .ready else { return }

    // Single resident model: drop the MLX LLM/VL before loading MNN.
    unloadLLM()
    unloadVL()
    mnnStatus = .loading
    do {
        try await mnn.load(folderURL: modelDirectory)
    } catch {
        mnnStatus = .error("\(error)")
        throw AIRuntimeError.modelLoadFailed("\(error)")
    }
    mnnStatus = .ready
}
/// Unloads the MNN backend and resets `mnnStatus` to `.notReady`.
///
/// Does nothing when the status is already `.notReady`. After unloading it also
/// clears the MLX memory cache.
private func unloadMNN() async {
    if mnnStatus == .notReady { return }
    await mnn.unload()
    mnnStatus = .notReady
    MLX.Memory.clearCache()
}
/// await prepare() /// await prepare()
/// :, actor LLMSession await /// :, actor LLMSession await
func generate(prompt: String, maxTokens: Int = 256) -> AsyncThrowingStream<TokenChunk, Error> { func generate(prompt: String, maxTokens: Int = 256) -> AsyncThrowingStream<TokenChunk, Error> {
if InferenceEngine.current == .mnn, mnnStatus == .ready {
return mnnGenerate(prompt: prompt, maxTokens: maxTokens)
}
// actor ,Task 访 self.status / self.llmSession // actor ,Task 访 self.status / self.llmSession
let snapshotStatus = status let snapshotStatus = status
let snapshotSession = llmSession let snapshotSession = llmSession
@@ -159,6 +222,33 @@ actor AIRuntime {
} }
} }
/// Streams text generated by the MNN backend (CPU/SME2), serialized through the runtime gate.
///
/// The readiness of MNN is snapshotted at call time. The returned stream fails with
/// `AIRuntimeError.notReady` if MNN was not ready then, and with
/// `AIRuntimeError.inferenceFailed` for any error raised while generating.
///
/// - Parameters:
///   - prompt: Prompt text forwarded to the backend.
///   - maxTokens: Upper bound on generated tokens, forwarded to the backend.
/// - Returns: A stream of `TokenChunk` values; terminating the stream cancels the producing task.
private func mnnGenerate(prompt: String, maxTokens: Int) -> AsyncThrowingStream<TokenChunk, Error> {
    let ready = (mnnStatus == .ready)
    return AsyncThrowingStream { continuation in
        let task = Task {
            guard ready else {
                continuation.finish(throwing: AIRuntimeError.notReady)
                return
            }
            await self.acquireGate()
            do {
                // The consumer may have gone away while this request was queued on the gate;
                // in that case do not start native inference at all.
                try Task.checkCancellation()
                let stream = await self.mnn.generate(prompt: prompt, maxTokens: maxTokens)
                for try await chunk in stream {
                    try Task.checkCancellation()
                    self.recordRate(chunk.decodeRate)
                    continuation.yield(chunk)
                }
                continuation.finish()
            } catch {
                continuation.finish(throwing: AIRuntimeError.inferenceFailed("\(error)"))
            }
            // Every path through the do/catch above falls through to here, so the gate is always released.
            self.releaseGate()
        }
        continuation.onTermination = { _ in task.cancel() }
    }
}
/// Remembers the latest decode rate; samples that are zero or negative are ignored.
private func recordRate(_ rate: Double) {
    if rate > 0 { lastDecodeRate = rate }
}
@@ -186,6 +276,7 @@ actor AIRuntime {
// OOM (§3.1): VL(~3GB) LLM(~1GB), jetsam // OOM (§3.1): VL(~3GB) LLM(~1GB), jetsam
unloadLLM() unloadLLM()
await unloadMNN()
vlStatus = .loading vlStatus = .loading
do { do {

View File

@@ -0,0 +1,41 @@
import Foundation
/// Inference engine selection.
/// - mnn: MNN running on the CPU (SME2 where available); the default choice.
/// - mlx: MLX running on the GPU; used as the fallback.
nonisolated enum InferenceEngine: String, CaseIterable, Sendable {
    case mnn
    case mlx

    /// Human-readable label for this engine.
    var displayName: String {
        switch self {
        case .mlx: "MLX · GPU"
        case .mnn: "MNN · CPU/SME2"
        }
    }

    /// Whether this engine can be used: MLX always reports `true`, MNN defers to `MNNLLMBridge.isAvailable()`.
    var isAvailable: Bool {
        switch self {
        case .mnn: MNNLLMBridge.isAvailable()
        case .mlx: true
        }
    }

    // MARK: - Persistence (UserDefaults)

    private static let key = "kk.inferenceEngine"

    /// The effective engine.
    ///
    /// Reads the stored choice (defaulting to `.mnn` when nothing valid is stored) and
    /// substitutes `.mlx` when that choice is not available. Setting stores the raw value.
    static var current: InferenceEngine {
        get {
            let stored = UserDefaults.standard.string(forKey: key)
            let preferred = stored.flatMap { InferenceEngine(rawValue: $0) } ?? .mnn
            guard preferred.isAvailable else { return .mlx }
            return preferred
        }
        set {
            UserDefaults.standard.set(newValue.rawValue, forKey: key)
        }
    }

    /// Whether the CPU reports SME2 support, as answered by `MNNLLMBridge.cpuSupportsSME2()`.
    static var cpuSupportsSME2: Bool {
        MNNLLMBridge.cpuSupportsSME2()
    }
}

View File

@@ -0,0 +1,72 @@
import Foundation
/// MNN inference backend (CPU / SME2) wrapping `MNNLLMBridge`.
///
/// This actor only owns the bridge and exposes load / unload / streaming generation;
/// it does not serialize generations itself.
/// NOTE(review): callers (e.g. `AIRuntime`) are presumably responsible for running one generation at a time; confirm.
///
/// Vision-language (VL) is not supported here: `analyze` always throws, so VL callers
/// have to use another backend.
actor MNNBackend {
    private var bridge: MNNLLMBridge?

    /// `true` when a bridge exists and reports itself loaded.
    var isLoaded: Bool { bridge?.isLoaded ?? false }

    /// Creates the bridge from `config.json` inside `folderURL`.
    ///
    /// - Parameter folderURL: Model folder; must contain `config.json`.
    /// - Throws: `AIRuntimeError.notReady` when `config.json` is missing;
    ///   `AIRuntimeError.modelLoadFailed` when the bridge initializer returns `nil`.
    func load(folderURL: URL) throws {
        let configPath = folderURL.appendingPathComponent("config.json").path
        guard FileManager.default.fileExists(atPath: configPath) else {
            throw AIRuntimeError.notReady
        }
        guard let b = MNNLLMBridge(configPath: configPath) else {
            throw AIRuntimeError.modelLoadFailed("MNN createLLM/load 失败")
        }
        bridge = b
    }

    /// Drops this actor's reference to the bridge.
    func unload() { bridge = nil }

    /// Streams generated text for `prompt`.
    ///
    /// `bridge.generateText` is a synchronous call, so it runs on a detached task; each piece it
    /// reports is yielded as a `TokenChunk` carrying the running rate from `MNNRateMeter`.
    ///
    /// - Parameters:
    ///   - prompt: Prompt text passed to the bridge.
    ///   - maxTokens: Token limit; values outside the `Int32` range are clamped instead of trapping.
    /// - Returns: A stream that fails immediately with `AIRuntimeError.notReady` when no bridge is loaded.
    func generate(prompt: String, maxTokens: Int) -> AsyncThrowingStream<TokenChunk, Error> {
        guard let bridge else {
            return AsyncThrowingStream { $0.finish(throwing: AIRuntimeError.notReady) }
        }
        let box = MNNUncheckedBox(bridge)
        // `Int32(_:)` traps on overflow; clamp so an oversized limit cannot crash the app.
        let tokenLimit = Int32(clamping: maxTokens)
        return AsyncThrowingStream { continuation in
            let meter = MNNRateMeter()
            let task = Task.detached(priority: .userInitiated) {
                // NOTE(review): the return value of `generateText` is discarded — confirm it does not signal failure.
                _ = box.value.generateText(prompt, maxTokens: tokenLimit) { piece in
                    let rate = meter.tick()
                    continuation.yield(TokenChunk(text: piece, decodeRate: rate))
                }
                continuation.finish()
            }
            continuation.onTermination = { termination in
                // `finish()` is only called after `generateText` has returned, so on a normal
                // finish there is nothing left to interrupt. Only forward real cancellations,
                // so that no cancel request is sent to a bridge that is reused for the next generation.
                guard case .cancelled = termination else { return }
                box.value.cancel()
                task.cancel()
            }
        }
    }

    /// Image understanding (VL) is not available in this backend; always throws
    /// `AIRuntimeError.inferenceFailed`.
    func analyze(imageURLs: [URL], prompt: String, maxTokens: Int) throws -> String {
        throw AIRuntimeError.inferenceFailed("MNN 当前构建不支持 VL(需 OMNI)")
    }
}
/// Wraps a value so it can be captured by a `@Sendable` closure (e.g. a detached task)
/// even when its type is not `Sendable`.
///
/// The conformance is unchecked: this type adds no synchronization, so thread safety of
/// the wrapped value is entirely the caller's responsibility.
private nonisolated struct MNNUncheckedBox<T>: @unchecked Sendable {
    let value: T

    init(_ wrapped: T) {
        value = wrapped
    }
}
/// Running throughput meter: counts `tick()` calls and divides by the time elapsed since creation.
///
/// The rate is cumulative from the moment the meter is created, so it also includes any time
/// spent before the first tick. Elapsed time comes from the monotonic system uptime rather
/// than the wall clock, so clock adjustments cannot distort the reading.
///
/// Not synchronized (`@unchecked Sendable`): `tick()` must not be called concurrently.
private nonisolated final class MNNRateMeter: @unchecked Sendable {
    private let start = ProcessInfo.processInfo.systemUptime
    private var produced = 0

    /// Registers one produced unit and returns units per second since creation
    /// (`0` when no measurable time has elapsed).
    func tick() -> Double {
        produced += 1
        let elapsed = ProcessInfo.processInfo.systemUptime - start
        return elapsed > 0 ? Double(produced) / elapsed : 0
    }
}