Files
kangkang/康康/AI/AIRuntime.swift
link2026 d72a1fec17 ```
feat(AI): 添加MLX内存管理和AI模型互斥卸载机制

为防止应用因内存溢出被系统终止,在项目中添加了MLX框架依赖,
并在应用启动时配置GPU缓存限制,设置256MB缓存上限以避免内存过度使用。

同时实现了LLM和VL模型的互斥卸载机制,确保大模型不会同时常驻内存,
通过在加载一个模型前先卸载另一个模型来控制内存使用,防止jetsam OOM。

chore(project): 配置代码签名授权文件

refactor(localization): 调整本地化字符串并清理冗余条目

修正了提醒任务和建议相关的本地化文本,调整了多个UI字符串,
清理了过时和重复的本地化条目,更新了AI识别相关的新字符串资源。
```
2026-05-31 23:22:50 +08:00

195 lines
7.1 KiB
Swift

import Foundation
import MLX
enum AIRuntimeError: Error, LocalizedError {
case notReady
case modelLoadFailed(String)
case inferenceFailed(String)
var errorDescription: String? {
switch self {
case .notReady: return String(appLoc: "AI 模型尚未准备好")
case .modelLoadFailed(let m): return String(appLoc: "模型加载失败:\(m)")
case .inferenceFailed(let m): return String(appLoc: "推理失败:\(m)")
}
}
}
actor AIRuntime {
static let shared = AIRuntime()
enum Status: Sendable, Equatable {
case notReady
case loading
case ready
case error(String)
}
private(set) var status: Status = .notReady
private(set) var vlStatus: Status = .notReady
private(set) var lastDecodeRate: Double = 0
private var llmSession: LLMSession?
private var vlSession: VLSession?
private init() {}
/// App : MLX GPU , reuse cache
/// App ( CPU, Metal abort)
/// increased-memory-limit entitlement + LLM/VL , jetsam OOM
nonisolated static func configureMLXMemory() {
#if !targetEnvironment(simulator)
// 256MB cache : 3GB MB
MLX.GPU.set(cacheLimit: 256 * 1024 * 1024)
#endif
}
/// ,
func prepare() async throws {
switch status {
case .ready:
return
case .loading:
// ; prepare ,
// await prepare() status, / UI
// W3 prepare
return
case .error, .notReady:
break
}
guard ModelStore.shared.isReady(.llm) else {
status = .error("LLM 模型未就绪")
throw AIRuntimeError.notReady
}
// OOM (§3.1):LLM(~1GB) VL(~3GB), App jetsam
// LLM VL, ModelContainer + MLX
unloadVL()
status = .loading
do {
let session = try await LLMSession.load(
folderURL: ModelStore.shared.localURL(for: .llm)
)
self.llmSession = session
status = .ready
} catch {
status = .error("\(error)")
throw AIRuntimeError.modelLoadFailed("\(error)")
}
}
/// await prepare()
/// :, actor LLMSession await
func generate(prompt: String, maxTokens: Int = 256) -> AsyncThrowingStream<TokenChunk, Error> {
// actor ,Task 访 self.status / self.llmSession
let snapshotStatus = status
let snapshotSession = llmSession
return AsyncThrowingStream { continuation in
let task = Task {
guard snapshotStatus == .ready, let session = snapshotSession else {
continuation.finish(throwing: AIRuntimeError.notReady)
return
}
do {
// session.generate actor , await
let stream = await session.generate(prompt: prompt, maxTokens: maxTokens)
for try await chunk in stream {
// (UI)/, checkCancellation Task 退,
// session onTermination, MLX , GPU
try Task.checkCancellation()
// Task generate() , AIRuntime actor ;
// actor recordRate await
self.recordRate(chunk.decodeRate)
continuation.yield(chunk)
}
continuation.finish()
} catch {
continuation.finish(throwing: AIRuntimeError.inferenceFailed("\(error)"))
}
}
// / Task( LLMSession / HealthExportService )
continuation.onTermination = { _ in task.cancel() }
}
}
private func recordRate(_ rate: Double) {
if rate > 0 { lastDecodeRate = rate }
}
// MARK: - VL
/// VL , load
func prepareVL() async throws {
switch vlStatus {
case .ready, .loading:
return
case .error, .notReady:
break
}
guard ModelStore.shared.isReady(.vl) else {
vlStatus = .error("VL 模型未就绪")
throw AIRuntimeError.notReady
}
// OOM (§3.1): VL(~3GB) LLM(~1GB), jetsam
// App 退
unloadLLM()
vlStatus = .loading
do {
let session = try await VLSession.load(
folderURL: ModelStore.shared.localURL(for: .vl)
)
self.vlSession = session
vlStatus = .ready
} catch {
vlStatus = .error("\(error)")
throw AIRuntimeError.modelLoadFailed("\(error)")
}
}
// MARK: - (OOM )
/// LLM, ModelContainer MLX
/// : generate() , session ,;
/// /,
private func unloadLLM() {
guard llmSession != nil else { return }
llmSession = nil
status = .notReady
MLX.GPU.clearCache()
}
/// VL, ModelContainer MLX
private func unloadVL() {
guard vlSession != nil else { return }
vlSession = nil
vlStatus = .notReady
MLX.GPU.clearCache()
}
/// JSON ( VLPrompts.reportExtraction )
/// + 退(§3.2)
/// AIRuntime actor, LLM.generate() , OOM
func analyzeReport(imageURLs: [URL],
prompt: String,
maxTokens: Int = 512) async throws -> String {
guard vlStatus == .ready, let session = vlSession else {
throw AIRuntimeError.notReady
}
do {
return try await session.analyze(
imageURLs: imageURLs,
prompt: prompt,
maxTokens: maxTokens
)
} catch {
throw AIRuntimeError.inferenceFailed("\(error)")
}
}
}