feat(Capture): 报告识别注入 Vision OCR 参考文本,提升 2B 多模态数字准确率

Co-Authored-By: Claude Fable 5 <noreply@anthropic.com>
This commit is contained in:
link2026
2026-06-10 07:12:48 +08:00
parent 0dd60d6021
commit 77139f5e32
3 changed files with 80 additions and 4 deletions

View File

@@ -31,12 +31,38 @@ nonisolated enum VLPrompts {
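/// Prompt for the VL model's report extraction, built from a few-shot template.
/// Today's date is injected into the prompt before it is returned.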
static func reportExtraction(today: Date = .now) -> String {
/// Builds the report-extraction prompt from `reportExtractionTemplate`.
///
/// - Parameters:
///   - today: Date substituted for the `{{TODAY}}` placeholder, formatted as `yyyy-MM-dd`
///     with the `en_US_POSIX` locale.
///   - ocrText: Vision OCR text for the same report. When empty, the `{{OCR_SECTION}}`
///     placeholder is replaced with an empty string, so the prompt carries no OCR section.
///     Otherwise the text is passed through `clipOCR` and placed under a fixed header line.
/// - Returns: The template with its placeholders substituted.
static func reportExtraction(today: Date = .now, ocrText: String = "") -> String {
// Fixed POSIX locale so the formatted date does not depend on the user's locale settings.
let f = DateFormatter()
f.locale = Locale(identifier: "en_US_POSIX")
f.dateFormat = "yyyy-MM-dd"
let todayStr = f.string(from: today)
// NOTE(review): this single-placeholder return looks like the pre-change line kept by the
// diff view; as written it would return before the OCR section below is built — confirm
// against the real file.
return reportExtractionTemplate.replacingOccurrences(of: "{{TODAY}}", with: todayStr)
// Build the optional OCR reference section (empty when no OCR text was supplied).
let ocrSection: String
if ocrText.isEmpty {
ocrSection = ""
} else {
ocrSection = """
OCR 参考文本(系统对同一报告做文字识别的结果,可能有错字、串行或漏行;版面与表格结构以图片为准,但数值、小数点以 OCR 文字更可靠):
\(clipOCR(ocrText))
"""
}
// Substitute both placeholders in the template.
return reportExtractionTemplate
.replacingOccurrences(of: "{{TODAY}}", with: todayStr)
.replacingOccurrences(of: "{{OCR_SECTION}}", with: ocrSection)
}
/// Clips OCR text to a character budget so the prompt stays small (the original note
/// mentions the 2B model).
///
/// Text of at most `limit` characters is returned unchanged. Longer text is cut to its
/// first `limit` characters, trimmed back to the last line break inside that prefix (when
/// there is one) so a line is not cut in half, and followed by a truncation marker line.
///
/// - Parameters:
///   - text: OCR text to clip.
///   - limit: Maximum number of characters kept before the marker. Negative values are
///     treated as 0 instead of trapping.
/// - Returns: `text` itself, or the clipped prefix followed by the truncation marker.
static func clipOCR(_ text: String, limit: Int = 1800) -> String {
    // `prefix(_:)` has a precondition that the length is non-negative, so clamp first.
    let budget = max(limit, 0)
    guard text.count > budget else { return text }

    let truncationNote = "\n(后续内容过长已截断)"
    let clipped = text.prefix(budget)
    // Prefer ending on a complete line: drop the partial line after the last newline.
    if let lastNewline = clipped.lastIndex(of: "\n") {
        return String(clipped[..<lastNewline]) + truncationNote
    }
    return String(clipped) + truncationNote
}
private static let reportExtractionTemplate: String = #"""
@@ -84,7 +110,7 @@ JSON schema(严格):
输入: 一份春季体检,3 项可读
输出:
{"title":"","type":"checkup","report_date":"2026-04-12","institution":"","page_count":1,"summary":"","indicators":[{"name":"","value":"3.84","unit":"mmol/L","range":"< 3.40","status":"high","source_page":1,"source_box":[0.12,0.31,0.76,0.07]},{"name":"","value":"32","unit":"U/L","range":"9 - 50","status":"normal","source_page":1,"source_box":[0.12,0.39,0.76,0.07]},{"name":"","value":"5.2","unit":"mmol/L","range":"3.9 - 6.1","status":"normal","source_page":1,"source_box":[0.12,0.47,0.76,0.07]}]}
{{OCR_SECTION}}
现在请识别图片并输出 JSON:
"""#

View File

@@ -1,5 +1,6 @@
import Foundation
import UIKit
import ImageIO
import SwiftData
/// VL (, SwiftData )
@@ -142,11 +143,14 @@ actor CaptureService {
throw CaptureError.modelNotReady
}
let urls = assets.map { FileVault.shared.rootURL.appendingPathComponent($0.relativePath) }
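// Run OCR first (Vision, <1s/page) to give the 2B model a text reference for the numbers;
// if OCR yields nothing, the prompt falls back to image-only recognition (§3.2).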
let ocr = await Self.ocrReference(for: urls)
let raw: String
do {
raw = try await AIRuntime.shared.analyzeReport(
imageURLs: urls,
prompt: VLPrompts.reportExtraction()
prompt: VLPrompts.reportExtraction(ocrText: ocr)
)
} catch {
throw CaptureError.inferenceFailed("\(error)")
@@ -160,6 +164,20 @@ actor CaptureService {
}
}
/// Best-effort OCR over at most the first four image URLs.
///
/// Each image is decoded through ImageIO straight into a `CGImage` (not via `UIImage`; the
/// original note cites actor/Sendable concerns). Pages that cannot be decoded, fail
/// recognition, or yield only whitespace are skipped. When more than one URL was passed,
/// each page's text is prefixed with a page-number header line.
///
/// - Parameter urls: File URLs of the report images.
/// - Returns: The recognized page texts joined by newlines, or "" when nothing was recognized.
private static func ocrReference(for urls: [URL]) async -> String {
    let labelPages = urls.count > 1
    var recognized: [String] = []
    for (idx, url) in urls.prefix(4).enumerated() {
        // Decode the first image in the file; skip the page on any decode failure.
        guard let source = CGImageSourceCreateWithURL(url as CFURL, nil) else { continue }
        guard let image = CGImageSourceCreateImageAtIndex(source, 0, nil) else { continue }
        // Recognition errors are deliberately swallowed: OCR is only a supplementary hint.
        guard let text = try? await OCRService.recognizeText(in: image) else { continue }
        if text.trimmingCharacters(in: .whitespacesAndNewlines).isEmpty { continue }
        recognized.append(labelPages ? "【第 \(idx + 1) 页】\n\(text)" : text)
    }
    return recognized.joined(separator: "\n")
}
// MARK: - JSON parse(static + 便)
/// VL JSON