feat(Capture): 报告识别注入 Vision OCR 参考文本,提升 2B 多模态数字准确率
Co-Authored-By: Claude Fable 5 <noreply@anthropic.com>
This commit is contained in:
@@ -1,5 +1,6 @@
|
||||
import Foundation
|
||||
import UIKit
|
||||
import ImageIO
|
||||
import SwiftData
|
||||
|
||||
/// VL 解析结果(已结构化,可直接喂 SwiftData 模型构造)。
|
||||
@@ -142,11 +143,14 @@ actor CaptureService {
|
||||
throw CaptureError.modelNotReady
|
||||
}
|
||||
let urls = assets.map { FileVault.shared.rootURL.appendingPathComponent($0.relativePath) }
|
||||
// OCR 参考(Vision 本地,<1s/页):给 2B 多模态当数字「抄写员」,降低小字误读。
|
||||
// 任何失败都静默回退为空串,绝不阻断识别主流程(§3.2)。
|
||||
let ocr = await Self.ocrReference(for: urls)
|
||||
let raw: String
|
||||
do {
|
||||
raw = try await AIRuntime.shared.analyzeReport(
|
||||
imageURLs: urls,
|
||||
prompt: VLPrompts.reportExtraction()
|
||||
prompt: VLPrompts.reportExtraction(ocrText: ocr)
|
||||
)
|
||||
} catch {
|
||||
throw CaptureError.inferenceFailed("\(error)")
|
||||
@@ -160,6 +164,20 @@ actor CaptureService {
|
||||
}
|
||||
}
|
||||
|
||||
/// Builds an OCR reference text by running text recognition on each report image page.
///
/// At most the first 4 URLs are processed. Pages that cannot be decoded, fail recognition,
/// or yield only whitespace are skipped; if nothing usable remains the result is `""`.
/// Images are decoded through ImageIO straight into `CGImage` (no `UIImage`), so that no
/// non-Sendable reference has to be passed across actors.
///
/// - Parameter urls: File URLs of the report images, in page order.
/// - Returns: The recognized text of all usable pages joined by `"\n"`. When more than one
///   URL was supplied, each page is prefixed with a `【第 N 页】` header.
private static func ocrReference(for urls: [URL]) async -> String {
    // `CGImageSourceCreateImageAtIndex` returns the raw pixel buffer and ignores the EXIF
    // orientation tag. The OCR call below receives only the CGImage (no orientation), so a
    // rotated photo would be recognized sideways. Asking ImageIO for a "thumbnail" with the
    // transform applied and no max pixel size yields an upright rendition whose size is not
    // capped.
    let uprightOptions: [CFString: Any] = [
        kCGImageSourceCreateThumbnailFromImageAlways: true,
        kCGImageSourceCreateThumbnailWithTransform: true
    ]

    var pages: [String] = []
    for (idx, url) in urls.prefix(4).enumerated() {
        // Prefer the orientation-corrected image; fall back to the raw decode so a failure
        // of the transform path never makes things worse than before.
        guard let src = CGImageSourceCreateWithURL(url as CFURL, nil),
              let cg = CGImageSourceCreateThumbnailAtIndex(src, 0, uprightOptions as CFDictionary)
                  ?? CGImageSourceCreateImageAtIndex(src, 0, nil) else { continue }

        // OCR is best-effort: any error (deliberately) degrades to "skip this page" instead of
        // failing the caller.
        guard let text = try? await OCRService.recognizeText(in: cg),
              !text.trimmingCharacters(in: .whitespacesAndNewlines).isEmpty else { continue }

        // `idx` is the position in the original URL list, so page numbers stay correct even
        // when earlier pages were skipped.
        pages.append(urls.count > 1 ? "【第 \(idx + 1) 页】\n\(text)" : text)
    }
    return pages.joined(separator: "\n")
}
|
||||
|
||||
// MARK: - JSON parse(static + 纯函数 → 方便单测)
|
||||
|
||||
/// 从 VL 输出里抠出第一段合法 JSON 对象并解析。
|
||||
|
||||
Reference in New Issue
Block a user