feat(Capture): 报告识别注入 Vision OCR 参考文本,提升 2B 多模态数字准确率

Co-Authored-By: Claude Fable 5 <noreply@anthropic.com>
2026-06-10 07:12:48 +08:00
parent 0dd60d6021
commit 77139f5e32
3 changed files with 80 additions and 4 deletions
--- a/康康/Services/CaptureService.swift
+++ b/康康/Services/CaptureService.swift
@@ -1,5 +1,6 @@
 import Foundation
 import UIKit
+import ImageIO
 import SwiftData

 /// VL 解析结果(已结构化,可直接喂 SwiftData 模型构造)。
@@ -142,11 +143,14 @@ actor CaptureService {
            throw CaptureError.modelNotReady
        }
        let urls = assets.map { FileVault.shared.rootURL.appendingPathComponent($0.relativePath) }
+        // OCR 参考(Vision 本地,<1s/页):给 2B 多模态当数字「抄写员」,降低小字误读。
+        // 任何失败都静默回退为空串,绝不阻断识别主流程(§3.2)。
+        let ocr = await Self.ocrReference(for: urls)
        let raw: String
        do {
            raw = try await AIRuntime.shared.analyzeReport(
                imageURLs: urls,
-                prompt: VLPrompts.reportExtraction()
+                prompt: VLPrompts.reportExtraction(ocrText: ocr)
            )
        } catch {
            throw CaptureError.inferenceFailed("\(error)")
@@ -160,6 +164,20 @@ actor CaptureService {
        }
    }

+    /// OCRs the Vault report images page by page and joins the text into one reference string. At most 4 pages; failures or blank text yield "".
+    /// Gets the CGImage straight from ImageIO (not via UIImage, to avoid passing a non-Sendable reference across actors).
+    private static func ocrReference(for urls: [URL]) async -> String {
+        var pages: [String] = []
+        for (idx, url) in urls.prefix(4).enumerated() {
+            guard let src = CGImageSourceCreateWithURL(url as CFURL, nil),
+                  let cg = CGImageSourceCreateImageAtIndex(src, 0, nil) else { continue }
+            guard let text = try? await OCRService.recognizeText(in: cg), // an OCR error skips this page instead of propagating
+                  !text.trimmingCharacters(in: .whitespacesAndNewlines).isEmpty else { continue }
+            pages.append(urls.count > 1 ? "【第 \(idx + 1) 页】\n\(text)" : text) // page header only when more than one URL was passed in
+        }
+        return pages.joined(separator: "\n")
+    }
+
    // MARK: - JSON parse(static + 纯函数 → 方便单测)

    /// 从 VL 输出里抠出第一段合法 JSON 对象并解析。