feat(Capture): 报告识别注入 Vision OCR 参考文本,提升 2B 多模态数字准确率

Co-Authored-By: Claude Fable 5 <noreply@anthropic.com>
This commit is contained in:
link2026
2026-06-10 07:12:48 +08:00
parent 0dd60d6021
commit 77139f5e32
3 changed files with 80 additions and 4 deletions

View File

@@ -31,12 +31,38 @@ nonisolated enum VLPrompts {
/// Builds the report-extraction prompt sent to the VL model.
///
/// Fills the two placeholders of `reportExtractionTemplate`: `{{TODAY}}` is replaced
/// with `today` formatted as `yyyy-MM-dd`, and `{{OCR_SECTION}}` is replaced with
/// either an OCR reference section or an empty string.
///
/// - Parameters:
///   - today: Date substituted for `{{TODAY}}`; defaults to the current date.
///   - ocrText: Text recognized from the same report. When it is empty or contains
///     only whitespace the OCR section is omitted, so the prompt never carries a
///     header with nothing under it. Long text is shortened by `clipOCR(_:limit:)`.
/// - Returns: The template with both placeholders substituted.
static func reportExtraction(today: Date = .now, ocrText: String = "") -> String {
    // Fixed-format date: pin the POSIX locale so the user's region settings
    // cannot change how the date is rendered in the prompt.
    let formatter = DateFormatter()
    formatter.locale = Locale(identifier: "en_US_POSIX")
    formatter.dateFormat = "yyyy-MM-dd"
    let todayStr = formatter.string(from: today)

    let ocrSection: String
    if ocrText.trimmingCharacters(in: .whitespacesAndNewlines).isEmpty {
        ocrSection = ""
    } else {
        ocrSection = """
            OCR 参考文本(系统对同一报告做文字识别的结果,可能有错字、串行或漏行;版面与表格结构以图片为准,但数值、小数点以 OCR 文字更可靠):
            \(clipOCR(ocrText))
            """
    }

    // `{{TODAY}}` is substituted before the OCR text is inserted, so OCR text that
    // happens to contain a placeholder-looking token is left untouched.
    return reportExtractionTemplate
        .replacingOccurrences(of: "{{TODAY}}", with: todayStr)
        .replacingOccurrences(of: "{{OCR_SECTION}}", with: ocrSection)
}
/// Shortens OCR text so it does not crowd the prompt.
///
/// Text of at most `limit` characters is returned unchanged. Longer text is cut to
/// its first `limit` characters and then, when that prefix contains a line break,
/// trimmed back to the last one so no line is left half-cut. A truncation notice is
/// appended on top of `limit`, so the result can be slightly longer than `limit`.
///
/// - Parameters:
///   - text: OCR text to shorten.
///   - limit: Maximum number of characters kept from `text`. Negative values are
///     treated as 0.
/// - Returns: `text` itself, or the shortened text followed by the notice.
static func clipOCR(_ text: String, limit: Int = 1800) -> String {
    // `prefix(_:)` traps on a negative length, so clamp before using it.
    let budget = max(0, limit)
    guard text.count > budget else { return text }

    let head = text.prefix(budget)
    // `Character.isNewline` also matches "\r\n", which Swift treats as a single
    // Character that never compares equal to "\n".
    let kept = head.lastIndex(where: \.isNewline).map { head[..<$0] } ?? head
    return String(kept) + "\n(后续内容过长已截断)"
}
private static let reportExtractionTemplate: String = #""" private static let reportExtractionTemplate: String = #"""
@@ -84,7 +110,7 @@ JSON schema(严格):
输入: 一份春季体检,3 项可读 输入: 一份春季体检,3 项可读
输出: 输出:
{"title":"","type":"checkup","report_date":"2026-04-12","institution":"","page_count":1,"summary":"","indicators":[{"name":"","value":"3.84","unit":"mmol/L","range":"< 3.40","status":"high","source_page":1,"source_box":[0.12,0.31,0.76,0.07]},{"name":"","value":"32","unit":"U/L","range":"9 - 50","status":"normal","source_page":1,"source_box":[0.12,0.39,0.76,0.07]},{"name":"","value":"5.2","unit":"mmol/L","range":"3.9 - 6.1","status":"normal","source_page":1,"source_box":[0.12,0.47,0.76,0.07]}]} {"title":"","type":"checkup","report_date":"2026-04-12","institution":"","page_count":1,"summary":"","indicators":[{"name":"","value":"3.84","unit":"mmol/L","range":"< 3.40","status":"high","source_page":1,"source_box":[0.12,0.31,0.76,0.07]},{"name":"","value":"32","unit":"U/L","range":"9 - 50","status":"normal","source_page":1,"source_box":[0.12,0.39,0.76,0.07]},{"name":"","value":"5.2","unit":"mmol/L","range":"3.9 - 6.1","status":"normal","source_page":1,"source_box":[0.12,0.47,0.76,0.07]}]}
{{OCR_SECTION}}
现在请识别图片并输出 JSON: 现在请识别图片并输出 JSON:
"""# """#

View File

@@ -1,5 +1,6 @@
import Foundation import Foundation
import UIKit import UIKit
import ImageIO
import SwiftData import SwiftData
/// VL (, SwiftData ) /// VL (, SwiftData )
@@ -142,11 +143,14 @@ actor CaptureService {
throw CaptureError.modelNotReady throw CaptureError.modelNotReady
} }
let urls = assets.map { FileVault.shared.rootURL.appendingPathComponent($0.relativePath) } let urls = assets.map { FileVault.shared.rootURL.appendingPathComponent($0.relativePath) }
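// Run OCR first (Vision; the original note cites <1s per page) so the 2B model gets a text reference for digits.
// When OCR yields nothing, the prompt falls back to image-only recognition (§3.2).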
let ocr = await Self.ocrReference(for: urls)
let raw: String let raw: String
do { do {
raw = try await AIRuntime.shared.analyzeReport( raw = try await AIRuntime.shared.analyzeReport(
imageURLs: urls, imageURLs: urls,
prompt: VLPrompts.reportExtraction() prompt: VLPrompts.reportExtraction(ocrText: ocr)
) )
} catch { } catch {
throw CaptureError.inferenceFailed("\(error)") throw CaptureError.inferenceFailed("\(error)")
@@ -160,6 +164,20 @@ actor CaptureService {
} }
} }
/// Runs OCR over the vault images and returns the recognized text as one string.
///
/// Only the first 4 URLs are processed. An image that cannot be decoded, whose
/// recognition throws, or whose text is blank is skipped, so the result is `""`
/// when nothing usable was recognized. When more than one URL was passed, each
/// page's text is prefixed with a page-number header.
///
/// Images are decoded with ImageIO straight into `CGImage` rather than through
/// `UIImage` (the original note cites actor / Sendable concerns).
///
/// NOTE(review): `CGImageSourceCreateImageAtIndex` does not apply EXIF orientation —
/// confirm vault images are stored upright or that `OCRService` handles orientation.
private static func ocrReference(for urls: [URL]) async -> String {
    let labelPages = urls.count > 1
    var sections: [String] = []
    for pageIndex in urls.indices.prefix(4) {
        guard let source = CGImageSourceCreateWithURL(urls[pageIndex] as CFURL, nil),
              let image = CGImageSourceCreateImageAtIndex(source, 0, nil),
              let recognized = try? await OCRService.recognizeText(in: image)
        else { continue }
        if recognized.trimmingCharacters(in: .whitespacesAndNewlines).isEmpty { continue }
        if labelPages {
            sections.append("【第 \(pageIndex + 1) 页】\n\(recognized)")
        } else {
            sections.append(recognized)
        }
    }
    return sections.joined(separator: "\n")
}
// MARK: - JSON parse(static + 便) // MARK: - JSON parse(static + 便)
/// VL JSON /// VL JSON

View File

@@ -0,0 +1,32 @@
import Testing
@testable import
/// Tests for OCR reference injection in `VLPrompts.reportExtraction` and for
/// the truncation done by `VLPrompts.clipOCR`.
struct VLPromptsOCRTests {
    /// With no OCR text the prompt has neither the OCR header nor a leftover placeholder.
    @Test func emptyOCRKeepsPromptClean() {
        let prompt = VLPrompts.reportExtraction(ocrText: "")
        #expect(!prompt.contains("OCR 参考文本"))
        #expect(!prompt.contains("{{OCR_SECTION}}"))
        #expect(prompt.contains("现在请识别图片并输出 JSON"))
    }

    /// OCR text is injected, and it appears before the final instruction line.
    @Test func ocrTextIsInjectedBeforeFinalInstruction() throws {
        let prompt = VLPrompts.reportExtraction(ocrText: "尿酸 486 208-428 μmol/L")
        #expect(prompt.contains("OCR 参考文本"))
        // `#require` reports a missing substring as a test failure; a force unwrap
        // would crash the whole test process instead.
        let ocrRange = try #require(prompt.range(of: "尿酸 486"))
        let finalRange = try #require(prompt.range(of: "现在请识别图片并输出 JSON"))
        #expect(ocrRange.lowerBound < finalRange.lowerBound)
    }

    /// Text within the limit is returned unchanged.
    @Test func clipKeepsShortTextIntact() {
        #expect(VLPrompts.clipOCR("短文本") == "短文本")
    }

    /// Over-long text is cut between lines and ends with the truncation notice.
    @Test func clipCutsAtLineBoundary() {
        let line = "指标行 1.23 mmol/L"
        let notice = "(后续内容过长已截断)"
        let long = Array(repeating: line, count: 400).joined(separator: "\n")
        let clipped = VLPrompts.clipOCR(long, limit: 200)
        #expect(clipped.count < 260)
        #expect(clipped.hasSuffix(notice))
        // Every line kept ahead of the notice must be a whole line, not a fragment.
        let keptLines = clipped.dropLast(notice.count).split(separator: "\n")
        #expect(!keptLines.isEmpty)
        #expect(keptLines.allSatisfy { $0 == line })
    }
}