feat(AI): 集成MNN推理引擎替换MLX作为主AI运行时 - 引入MNN(alibaba) + Arm SME2 + CPU作为主AI运行时,支持A19/iPhone17的 SME2和A17的NEON加速 - 添加MLX Swift作为兜底GPU推理方案,实现双后端切换机制 - 使用单一Qwen3.5-2B多模态模型(1.2GB),替代原有的LLM+VL分离架构 - 实现InferenceEngine.current引擎选择逻辑,真机默认MNN,模拟器回退MLX - 更新AIAgent架构,通过MNNLLMBridge(ObjC++) → MNNBackend进行推理 - 修改队列机制防止并发推理导致OOM,使用信号量闸门控制显存占用 - 更新文档中的技术栈说明、模块边界和周次交付计划 ```
162 lines
6.5 KiB
Swift
162 lines
6.5 KiB
Swift
import Foundation
|
|
import Speech
|
|
import AVFoundation
|
|
|
|
/// On-device streaming speech-to-text (spec 2026-06-10-voice-diary).
///
/// Microphone buffers from `AVAudioEngine` are fed into an `SFSpeechAudioBufferRecognitionRequest`
/// with `requiresOnDeviceRecognition = true`, so recognition never leaves the device.
/// **No audio is ever written to disk.**
///
/// Lifecycle: `start(onPartial:)` begins recording and streams partial transcripts;
/// `stop()` ends recording and returns the final transcript; `abort()` tears down immediately.
///
/// Caller: DiaryQuickSheet. The project defaults to MainActor isolation, so this type is
/// MainActor-isolated. The audio tap and the recognition callback run on system threads; their
/// closures only touch locally captured values and hop back via `Task { @MainActor }`.
final class SpeechDictationService {

    /// Failures surfaced by `start(onPartial:)`.
    enum DictationError: Error, LocalizedError {
        /// No recognizer with on-device support is usable right now.
        case unavailable
        /// The audio session or engine could not be started; the payload is a human-readable reason.
        case audioEngineStartFailed(String)

        var errorDescription: String? {
            switch self {
            case .unavailable:
                return String(appLoc: "本机不支持端侧语音识别")
            case .audioEngineStartFailed(let m):
                return String(appLoc: "录音启动失败:\(m)")
            }
        }
    }

    /// Joins already-typed text `prefix` with a newly dictated fragment `partial`.
    ///
    /// Pure function, decoupled from the recording lifecycle so it can be unit-tested.
    /// Rules: an empty fragment keeps the prefix; an empty prefix yields the fragment; a prefix
    /// that already ends in whitespace (space/newline) is concatenated directly; otherwise a single
    /// space is inserted so the two parts do not run together (and no extra space follows a newline).
    ///
    /// - Parameters:
    ///   - prefix: Text that existed before dictation.
    ///   - partial: The dictated fragment to append.
    /// - Returns: The merged text.
    static func merge(prefix: String, partial: String) -> String {
        guard !partial.isEmpty else { return prefix }
        guard let tail = prefix.last else { return partial }
        return tail.isWhitespace ? prefix + partial : prefix + " " + partial
    }

    /// Builds a recognizer that supports on-device recognition.
    ///
    /// The system locale is preferred; if it lacks on-device support, zh-CN is used as a fallback
    /// (so a demo device set to English still works).
    ///
    /// - Returns: The first matching recognizer, or `nil` when neither locale qualifies.
    private static func makeRecognizer() -> SFSpeechRecognizer? {
        for locale in [Locale.current, Locale(identifier: "zh-CN")] {
            if let recognizer = SFSpeechRecognizer(locale: locale),
               recognizer.supportsOnDeviceRecognition {
                return recognizer
            }
        }
        return nil
    }

    /// Whether this device supports on-device recognition. When `false` (simulator / older
    /// hardware) the UI hides the mic entry point and degrades silently.
    static var isAvailable: Bool { makeRecognizer() != nil }

    /// How long `stop()` waits for the recognizer's final result before settling for the latest partial.
    private static let finalResultTimeout: TimeInterval = 1.5
    /// Polling interval used by `stop()` while waiting for the final result (100 ms).
    private static let pollIntervalNanoseconds: UInt64 = 100_000_000

    private let audioEngine = AVAudioEngine()
    private var request: SFSpeechAudioBufferRecognitionRequest?
    private var task: SFSpeechRecognitionTask?
    /// Refreshed by every recognition callback of the live session.
    private var latestText = ""
    /// Set once the live session reported `isFinal` or an error. `stop()` prefers the final
    /// result and falls back to the latest partial.
    private var didFinish = false
    /// Token identifying the live recognition session. It is bumped whenever a session is
    /// discarded, so late callbacks and a suspended `stop()` can tell they are stale.
    private var generation = 0

    private(set) var isRecording = false

    /// Requests speech-recognition and microphone permission together.
    ///
    /// - Returns: `true` only when both are granted. The microphone prompt is skipped when
    ///   speech recognition is not authorized.
    func requestAuthorization() async -> Bool {
        let speechStatus: SFSpeechRecognizerAuthorizationStatus = await withCheckedContinuation { continuation in
            SFSpeechRecognizer.requestAuthorization { status in
                continuation.resume(returning: status)
            }
        }
        guard speechStatus == .authorized else { return false }
        return await AVAudioApplication.requestRecordPermission()
    }

    /// Starts recording and streaming recognition. Does nothing if already recording.
    ///
    /// - Parameter onPartial: Called on the main actor with the latest transcript (live captions).
    ///   It is not called for a session that has already been stopped or aborted.
    /// - Throws: `DictationError.unavailable` when no on-device recognizer is usable;
    ///   `DictationError.audioEngineStartFailed` when the audio session, input format or engine
    ///   cannot be brought up.
    func start(onPartial: @escaping (String) -> Void) throws {
        guard !isRecording else { return }
        guard let recognizer = Self.makeRecognizer(), recognizer.isAvailable else {
            throw DictationError.unavailable
        }

        let session = AVAudioSession.sharedInstance()
        do {
            try session.setCategory(.record, mode: .measurement, options: .duckOthers)
            try session.setActive(true, options: .notifyOthersOnDeactivation)
        } catch {
            throw DictationError.audioEngineStartFailed(error.localizedDescription)
        }

        let input = audioEngine.inputNode
        let format = input.outputFormat(forBus: 0)
        // With no usable input route the node reports 0 Hz / 0 channels, and installing a tap
        // with such a format raises an uncatchable exception. Fail gracefully instead.
        guard format.sampleRate > 0, format.channelCount > 0 else {
            deactivateSession()
            // NOTE(review): new user-facing string; add it to the string catalog.
            throw DictationError.audioEngineStartFailed(String(appLoc: "没有可用的音频输入"))
        }

        // A stop() may still be suspended waiting for the previous session's final result.
        // Retire that session so it cannot tear down or overwrite the one about to start.
        discardRecognition()

        let request = SFSpeechAudioBufferRecognitionRequest()
        request.requiresOnDeviceRecognition = true // Hard rule: recognition never leaves the device.
        request.shouldReportPartialResults = true
        request.addsPunctuation = true

        // The tap runs on the audio thread: it only touches the captured request, never self.
        input.installTap(onBus: 0, bufferSize: 1024, format: format) { buffer, _ in
            request.append(buffer)
        }
        audioEngine.prepare()
        do {
            try audioEngine.start()
        } catch {
            input.removeTap(onBus: 0)
            deactivateSession()
            throw DictationError.audioEngineStartFailed(error.localizedDescription)
        }

        // Publish session state only once the engine is running, so a failed start leaves nothing behind.
        self.request = request
        latestText = ""
        didFinish = false
        let liveGeneration = generation

        task = recognizer.recognitionTask(with: request) { [weak self] result, error in
            // System thread: copy out plain values here, then hop to the main actor.
            let transcript = result?.bestTranscription.formattedString
            let finished = result?.isFinal == true || error != nil
            Task { @MainActor in
                // Drop callbacks that belong to a session which was stopped, aborted or replaced.
                guard let self, self.generation == liveGeneration else { return }
                if let transcript {
                    self.latestText = transcript
                    onPartial(transcript)
                }
                if finished { self.didFinish = true }
            }
        }
        isRecording = true
    }

    /// Stops recording and waits for the final recognition result.
    ///
    /// The wait lasts at most 1.5 s; on timeout, or if the calling task is cancelled, the latest
    /// partial is used. A partial obtained before a mid-stream recognition error is returned as
    /// well (spec error table: it still enters the tidy-up flow).
    ///
    /// - Returns: The final transcript, or `""` when not recording.
    func stop() async -> String {
        guard isRecording else { return "" }
        isRecording = false

        let finishingGeneration = generation
        haltCapture()

        // Snapshot kept in case start() replaces this session while we are suspended below.
        var transcript = latestText
        let deadline = Date().addingTimeInterval(Self.finalResultTimeout)
        while generation == finishingGeneration, !didFinish, Date() < deadline {
            do {
                try await Task.sleep(nanoseconds: Self.pollIntervalNanoseconds)
            } catch {
                // Cancelled: sleep would now throw immediately on every pass, so stop waiting
                // rather than spinning on the main actor until the deadline.
                break
            }
            if generation == finishingGeneration { transcript = latestText }
        }

        // A newer session took over while we waited: it now owns the engine, the audio session
        // and the shared state, so hand back our snapshot and touch nothing.
        guard generation == finishingGeneration else { return transcript }

        discardRecognition()
        deactivateSession()
        return latestText
    }

    /// Cleanup for when the user dismisses the sheet directly: the result is not needed, so
    /// everything is stopped immediately. Does nothing if not recording.
    func abort() {
        guard isRecording else { return }
        isRecording = false
        haltCapture()
        discardRecognition()
        deactivateSession()
    }

    /// Stops the engine, removes the microphone tap and marks the end of audio for the live request.
    private func haltCapture() {
        audioEngine.stop()
        audioEngine.inputNode.removeTap(onBus: 0)
        request?.endAudio()
    }

    /// Cancels and forgets the live recognition task/request and invalidates its callbacks.
    private func discardRecognition() {
        generation += 1
        task?.cancel()
        task = nil
        request = nil
    }

    /// Best-effort deactivation of the shared audio session; failures are deliberately ignored.
    private func deactivateSession() {
        try? AVAudioSession.sharedInstance().setActive(false, options: .notifyOthersOnDeactivation)
    }
}
|