feat(AI): 推理闸门双优先级 — 前台插队、后台按 token 让位;暴露统计与后端标签

Co-Authored-By: Claude Fable 5 <noreply@anthropic.com>
This commit is contained in:
link2026
2026-06-10 06:42:59 +08:00
parent 070e016f81
commit 8494e51823
2 changed files with 102 additions and 9 deletions

View File

@@ -15,6 +15,13 @@ enum AIRuntimeError: Error, LocalizedError {
} }
} }
/// Scheduling priority of a request waiting for the inference gate.
///
/// - `interactive`: foreground work; it is queued ahead of every waiting
///   background request.
/// - `background`: deferred work; it is queued last and gives the gate up
///   when an interactive request starts waiting behind it.
nonisolated enum InferencePriority: Sendable, Equatable {
    case interactive, background
}
actor AIRuntime { actor AIRuntime {
static let shared = AIRuntime() static let shared = AIRuntime()
@@ -29,6 +36,21 @@ actor AIRuntime {
private(set) var vlStatus: Status = .notReady private(set) var vlStatus: Status = .notReady
private(set) var lastDecodeRate: Double = 0 private(set) var lastDecodeRate: Double = 0
/// Statistics captured from the most recent generation that ran to completion; nil until one does.
private(set) var lastGenerateStats: GenerateStats?
/// Display label for the inference backend that is currently in effect.
///
/// Reports the MNN variant (SME2 or NEON) only when MNN is both the selected
/// engine and ready; in every other case reports the MLX variant, which is
/// CPU on the simulator and GPU on device.
var activeBackendLabel: String {
    guard InferenceEngine.current == .mnn, mnnStatus == .ready else {
        #if targetEnvironment(simulator)
        return "MLX · CPU(模拟器)"
        #else
        return "MLX · GPU"
        #endif
    }
    return InferenceEngine.cpuSupportsSME2 ? "MNN · SME2" : "MNN · NEON"
}
private var llmSession: LLMSession? private var llmSession: LLMSession?
private var vlSession: VLSession? private var vlSession: VLSession?
@@ -52,30 +74,56 @@ actor AIRuntime {
// actor (count = 1):( + ) // actor (count = 1):( + )
// await acquireGate(), releaseGate()actor // await acquireGate(), releaseGate()actor
// gateBusy / gateWaiters // gateBusy / gateWaiters
/// One suspended caller parked in the gate's wait queue.
private struct GateWaiter {
/// Priority the caller asked for; decides its queue position and becomes the holder priority on hand-off.
let priority: InferencePriority
/// Continuation resumed when the gate is handed to this caller.
let cont: CheckedContinuation<Void, Never>
}
private var gateBusy = false private var gateBusy = false
private var gateWaiters: [CheckedContinuation<Void, Never>] = [] private var gateHolderPriority: InferencePriority = .interactive
private var preemptRequested = false
private var gateWaiters: [GateWaiter] = []
private func acquireGate() async { /// interactive background ; FIFO,
/// Computes where a new waiter of the given priority belongs in the wait queue.
///
/// Background requests always go to the back. Interactive requests go in front
/// of the first waiting background request, which keeps FIFO order among
/// interactive requests themselves.
///
/// - Parameters:
///   - priority: Priority of the request about to be enqueued.
///   - waiting: Priorities of the requests already queued, front first.
/// - Returns: The index at which to insert the new waiter (0...waiting.count).
nonisolated static func gateInsertionIndex(of priority: InferencePriority,
                                           in waiting: [InferencePriority]) -> Int {
    switch priority {
    case .background:
        return waiting.endIndex
    case .interactive:
        let firstBackground = waiting.firstIndex(where: { $0 == .background })
        return firstBackground ?? waiting.endIndex
    }
}
/// Acquires the single-slot inference gate, suspending the caller until it owns the gate.
///
/// If the gate is free the caller takes it immediately and is recorded as the holder.
/// Otherwise the caller is queued at the position chosen by `gateInsertionIndex`
/// and suspended until `releaseGate()` resumes it.
///
/// - Parameter priority: Priority of the request; defaults to `.interactive`.
private func acquireGate(_ priority: InferencePriority = .interactive) async {
if !gateBusy { if !gateBusy {
gateBusy = true gateBusy = true
gateHolderPriority = priority
return return
} }
// Preemption: an interactive request arriving while a background request holds the gate
// raises the flag; the background generation loop checks it per chunk and throws CancellationError.
if priority == .interactive, gateHolderPriority == .background {
preemptRequested = true
}
await withCheckedContinuation { (cont: CheckedContinuation<Void, Never>) in await withCheckedContinuation { (cont: CheckedContinuation<Void, Never>) in
gateWaiters.append(cont) let idx = Self.gateInsertionIndex(of: priority, in: gateWaiters.map(\.priority))
gateWaiters.insert(GateWaiter(priority: priority, cont: cont), at: idx)
} }
// Resumed by releaseGate, which hands the gate over directly (gateBusy stays true).
} }
/// Releases the inference gate.
///
/// With no waiters the gate becomes free. Otherwise ownership passes straight to
/// the waiter at the front of the queue, whose priority becomes the holder priority.
private func releaseGate() {
// The holder the preemption request was aimed at is leaving, so clear the flag.
preemptRequested = false
if gateWaiters.isEmpty { if gateWaiters.isEmpty {
gateBusy = false gateBusy = false
} else { } else {
// Hand the gate directly to the first waiter; gateBusy stays true, so a newcomer cannot take the fast path in between.
let next = gateWaiters.removeFirst() let next = gateWaiters.removeFirst()
next.resume() gateHolderPriority = next.priority
next.cont.resume()
} }
} }
/// Tells a generation loop whether it must stop and give up the gate.
///
/// Only background requests ever yield, and only while an interactive request
/// has asked for preemption.
///
/// - Parameter priority: Priority of the request currently generating.
/// - Returns: `true` when the caller is background and a preemption request is pending.
private func shouldPreempt(_ priority: InferencePriority) -> Bool {
    guard priority == .background else { return false }
    return preemptRequested
}
private init() {} private init() {}
/// App : MLX GPU , reuse cache /// App : MLX GPU , reuse cache
@@ -180,9 +228,12 @@ actor AIRuntime {
/// await prepare() /// await prepare()
/// :, actor LLMSession await /// :, actor LLMSession await
func generate(prompt: String, maxTokens: Int = 256) -> AsyncThrowingStream<TokenChunk, Error> { /// priority = .background token (CancellationError )
func generate(prompt: String,
maxTokens: Int = 256,
priority: InferencePriority = .interactive) -> AsyncThrowingStream<TokenChunk, Error> {
if InferenceEngine.current == .mnn, mnnStatus == .ready { if InferenceEngine.current == .mnn, mnnStatus == .ready {
return mnnGenerate(prompt: prompt, maxTokens: maxTokens) return mnnGenerate(prompt: prompt, maxTokens: maxTokens, priority: priority)
} }
// actor ,Task 访 self.status / self.llmSession // actor ,Task 访 self.status / self.llmSession
let snapshotStatus = status let snapshotStatus = status
@@ -195,7 +246,7 @@ actor AIRuntime {
return return
} }
// : LLM VL / , // : LLM VL / ,
await self.acquireGate() await self.acquireGate(priority)
do { do {
// session.generate actor , await // session.generate actor , await
let stream = await session.generate(prompt: prompt, maxTokens: maxTokens) let stream = await session.generate(prompt: prompt, maxTokens: maxTokens)
@@ -203,12 +254,18 @@ actor AIRuntime {
// (UI)/, checkCancellation Task 退, // (UI)/, checkCancellation Task 退,
// session onTermination, MLX , GPU // session onTermination, MLX , GPU
try Task.checkCancellation() try Task.checkCancellation()
// :, token 退
if self.shouldPreempt(priority) { throw CancellationError() }
// Task generate() , AIRuntime actor ; // Task generate() , AIRuntime actor ;
// actor recordRate await // actor recordRate await
self.recordRate(chunk.decodeRate) self.recordRate(chunk.decodeRate)
continuation.yield(chunk) continuation.yield(chunk)
} }
self.lastGenerateStats = await session.lastStats
continuation.finish() continuation.finish()
} catch is CancellationError {
// / CancellationError ,
continuation.finish(throwing: CancellationError())
} catch { } catch {
continuation.finish(throwing: AIRuntimeError.inferenceFailed("\(error)")) continuation.finish(throwing: AIRuntimeError.inferenceFailed("\(error)"))
} }
@@ -222,7 +279,9 @@ actor AIRuntime {
} }
/// MNN(CPU/SME2) MLX : /// MNN(CPU/SME2) MLX :
private func mnnGenerate(prompt: String, maxTokens: Int) -> AsyncThrowingStream<TokenChunk, Error> { private func mnnGenerate(prompt: String,
maxTokens: Int,
priority: InferencePriority) -> AsyncThrowingStream<TokenChunk, Error> {
let ready = (mnnStatus == .ready) let ready = (mnnStatus == .ready)
return AsyncThrowingStream { continuation in return AsyncThrowingStream { continuation in
let task = Task { let task = Task {
@@ -230,15 +289,21 @@ actor AIRuntime {
continuation.finish(throwing: AIRuntimeError.notReady) continuation.finish(throwing: AIRuntimeError.notReady)
return return
} }
await self.acquireGate() await self.acquireGate(priority)
do { do {
let stream = await self.mnn.generate(prompt: prompt, maxTokens: maxTokens) let stream = await self.mnn.generate(prompt: prompt, maxTokens: maxTokens)
for try await chunk in stream { for try await chunk in stream {
try Task.checkCancellation() try Task.checkCancellation()
// :, token 退
//( MNNBackend.onTermination bridge.cancel())
if self.shouldPreempt(priority) { throw CancellationError() }
self.recordRate(chunk.decodeRate) self.recordRate(chunk.decodeRate)
continuation.yield(chunk) continuation.yield(chunk)
} }
self.lastGenerateStats = await self.mnn.lastStats
continuation.finish() continuation.finish()
} catch is CancellationError {
continuation.finish(throwing: CancellationError())
} catch { } catch {
continuation.finish(throwing: AIRuntimeError.inferenceFailed("\(error)")) continuation.finish(throwing: AIRuntimeError.inferenceFailed("\(error)"))
} }

View File

@@ -0,0 +1,28 @@
import Testing
@testable import
/// Tests for the wait-queue ordering rule implemented by `AIRuntime.gateInsertionIndex`.
struct InferencePriorityTests {
    /// An interactive request lands in front of the first waiting background request.
    @Test func interactiveJumpsAheadOfBackground() {
        let queue: [InferencePriority] = [.interactive, .background, .background]
        #expect(AIRuntime.gateInsertionIndex(of: .interactive, in: queue) == 1)
    }

    /// Interactive requests stay in arrival order relative to each other.
    @Test func interactiveKeepsFIFOAmongInteractive() {
        let queue: [InferencePriority] = [.interactive, .interactive]
        #expect(AIRuntime.gateInsertionIndex(of: .interactive, in: queue) == 2)
    }

    /// A background request always goes to the back of the queue.
    @Test func backgroundAlwaysAppends() {
        let queue: [InferencePriority] = [.interactive, .background]
        #expect(AIRuntime.gateInsertionIndex(of: .background, in: queue) == 2)
    }

    /// With nothing waiting, either priority is inserted at index 0.
    @Test func emptyQueueInsertsAtZero() {
        for priority in [InferencePriority.interactive, .background] {
            #expect(AIRuntime.gateInsertionIndex(of: priority, in: []) == 0)
        }
    }
}