```
feat(AI): 集成MNN推理引擎替换MLX作为主AI运行时

- 引入MNN(Alibaba) + Arm SME2 + CPU作为主AI运行时,支持A19/iPhone17的SME2和A17的NEON加速
- 添加MLX Swift作为兜底GPU推理方案,实现双后端切换机制
- 使用单一Qwen3.5-2B多模态模型(1.2GB),替代原有的LLM+VL分离架构
- 实现InferenceEngine.current引擎选择逻辑,真机默认MNN,模拟器回退MLX
- 更新AIAgent架构,通过MNNLLMBridge(ObjC++) → MNNBackend进行推理
- 修改队列机制防止并发推理导致OOM,使用信号量闸门控制显存占用
- 更新文档中的技术栈说明、模块边界和周次交付计划
```
This commit is contained in:
@@ -33,7 +33,9 @@ struct ParsedReport: Sendable {
|
||||
var isEmpty: Bool { indicators.isEmpty }
|
||||
|
||||
/// 占位空结果,失败回退时给 UI。
|
||||
static func empty(date: Date = .now) -> ParsedReport {
|
||||
/// nonisolated:本工程默认 MainActor 隔离,而 CaptureService(actor)里的 extractReportMeta
|
||||
/// 需要在 actor 上下文构造空结果 —— 纯值工厂,标 nonisolated 才能跨隔离调用(Swift 6)。
|
||||
nonisolated static func empty(date: Date = .now) -> ParsedReport {
|
||||
ParsedReport(
|
||||
title: "",
|
||||
typeRaw: ReportType.other.rawValue,
|
||||
@@ -78,6 +80,40 @@ actor CaptureService {
|
||||
try await runVL(on: assets)
|
||||
}
|
||||
|
||||
/// Lightweight meta extraction for report archiving: **only the original images are kept; indicators are not recognized item by item**.
|
||||
/// Per the original author's note, per-item multimodal recognition on the 2B model is slow and OOM-prone (a jetsam kill is what users report as a "freeze"),
|
||||
/// so the archive path is: Vision OCR (on-device, noted as <1s/page) → text LLM extracts only {title,type,date,institution} (~50 tokens).
|
||||
/// Fault-tolerant throughout: empty OCR / model not ready / parse failure all return (empty meta, recognized: false); this never throws and never blocks saving the original images (§3.2).
|
||||
/// The returned `indicators` is always empty — archiving does not create indicators.
|
||||
func extractReportMeta(assets: [FileVault.SavedAsset]) async -> (meta: ParsedReport, recognized: Bool) {
|
||||
let urls = assets.map { FileVault.shared.rootURL.appendingPathComponent($0.relativePath) }
|
||||
let ocr = await Self.ocrReference(for: urls)
|
||||
guard !ocr.trimmingCharacters(in: .whitespacesAndNewlines).isEmpty else {
|
||||
return (.empty(), false)
|
||||
|
||||
}
|
||||
do {
|
||||
try await AIRuntime.shared.prepare() // text LLM (lightweight); per the original note, the OOM gate already handles unloading the VL model
|
||||
} catch {
|
||||
return (.empty(), false)
|
||||
|
||||
}
|
||||
var collected = ""
|
||||
do {
|
||||
// The meta output is tiny, so 256 tokens is plenty — far below the 2048 used for per-item recognition; the original author calls this the key to not hanging.
|
||||
let stream = await AIRuntime.shared.generate(prompt: VLPrompts.reportMetaFromText(ocr),
|
||||
maxTokens: 256)
|
||||
for try await chunk in stream { collected += chunk.text }
|
||||
} catch {
|
||||
return (.empty(), false)
|
||||
|
||||
}
|
||||
let cleaned = CaptureService.stripThink(collected)
|
||||
guard var parsed = try? CaptureService.parseReportJSON(cleaned, pageCount: assets.count) else {
|
||||
return (.empty(), false)
|
||||
|
||||
}
|
||||
// Archiving stores only meta + original images; discard any indicators the model may have included.
|
||||
parsed.indicators = []
|
||||
return (parsed, true)
|
||||
|
||||
}
|
||||
|
||||
/// 「拍照识别」OCR 链路:把 Vision OCR 出的纯文本交给 LLM(Qwen3-1.7B)结构化抽指标。
|
||||
/// 不建 Report、不留图;失败抛 `CaptureError`,UI 回退手动录入(§3.2)。
|
||||
/// 调用方(MainActor)先做 OCR,再把文本传进来——OCR 不需进 actor,也避免 UIImage 跨 actor。
|
||||
@@ -169,8 +205,17 @@ actor CaptureService {
|
||||
private static func ocrReference(for urls: [URL]) async -> String {
|
||||
var pages: [String] = []
|
||||
for (idx, url) in urls.prefix(4).enumerated() {
|
||||
guard let src = CGImageSourceCreateWithURL(url as CFURL, nil),
|
||||
let cg = CGImageSourceCreateImageAtIndex(src, 0, nil) else { continue }
|
||||
guard let src = CGImageSourceCreateWithURL(url as CFURL, nil) else { continue }
|
||||
// OCR 不需要全分辨率:一张 4000px 体检照全量解码 ≈48MB,正赶在 VL 推理前叠加,
|
||||
// 易触发 jetsam。降到 ≤3000px 既省内存又加速 Vision,医检报告字号此分辨率仍清晰;
|
||||
// 且原图仍完整交给 VL 自行读取,OCR 仅当数字「抄写员」辅助,降采样不影响最终可用信息。
|
||||
let thumbOptions: [CFString: Any] = [
|
||||
kCGImageSourceCreateThumbnailFromImageAlways: true,
|
||||
kCGImageSourceCreateThumbnailWithTransform: true,
|
||||
kCGImageSourceShouldCacheImmediately: true,
|
||||
kCGImageSourceThumbnailMaxPixelSize: 3000
|
||||
]
|
||||
guard let cg = CGImageSourceCreateThumbnailAtIndex(src, 0, thumbOptions as CFDictionary) else { continue }
|
||||
guard let text = try? await OCRService.recognizeText(in: cg),
|
||||
!text.trimmingCharacters(in: .whitespacesAndNewlines).isEmpty else { continue }
|
||||
pages.append(urls.count > 1 ? "【第 \(idx + 1) 页】\n\(text)" : text)
|
||||
|
||||
Reference in New Issue
Block a user