feat(Capture): 报告识别注入 Vision OCR 参考文本,提升 2B 多模态数字准确率
Co-Authored-By: Claude Fable 5 <noreply@anthropic.com>
This commit is contained in:
@@ -31,12 +31,38 @@ nonisolated enum VLPrompts {
|
||||
|
||||
/// Builds the report-extraction prompt sent to the VL model.
///
/// The VL model does not know what "today" is, and the few-shot examples in the
/// template hard-code a date, so the current date is injected explicitly; the model
/// needs it to pick the correct fallback when the report itself carries no date.
///
/// When `ocrText` has visible content, the Vision OCR result is injected as reference
/// text: Vision transcribes digits more reliably than the 2B multimodal model reads
/// dense small print, while layout and table structure still come from the image.
///
/// - Parameters:
///   - today: Date substituted for the `{{TODAY}}` placeholder, formatted `yyyy-MM-dd`.
///   - ocrText: OCR transcript of the same report. Empty or whitespace-only text
///     leaves the prompt without an OCR section. Longer text is clipped by `clipOCR`.
/// - Returns: The template with `{{TODAY}}` and `{{OCR_SECTION}}` substituted.
static func reportExtraction(today: Date = .now, ocrText: String = "") -> String {
    let formatter = DateFormatter()
    // A fixed POSIX locale keeps the output format independent of user locale settings.
    formatter.locale = Locale(identifier: "en_US_POSIX")
    formatter.dateFormat = "yyyy-MM-dd"
    let todayStr = formatter.string(from: today)

    let ocrSection: String
    // Whitespace-only OCR output carries no information; injecting it would leave a
    // reference header with nothing under it.
    if ocrText.trimmingCharacters(in: .whitespacesAndNewlines).isEmpty {
        ocrSection = ""
    } else {
        ocrSection = """


        OCR 参考文本(系统对同一报告做文字识别的结果,可能有错字、串行或漏行;版面与表格结构以图片为准,但数值、小数点以 OCR 文字更可靠):
        \(clipOCR(ocrText))

        """
    }

    // {{TODAY}} is substituted before the OCR text is spliced in, so a literal
    // "{{TODAY}}" inside the OCR transcript is never rewritten.
    return reportExtractionTemplate
        .replacingOccurrences(of: "{{TODAY}}", with: todayStr)
        .replacingOccurrences(of: "{{OCR_SECTION}}", with: ocrSection)
}
|
||||
|
||||
/// Clips OCR text so that only a bounded amount enters the prompt (the 2B model has
/// a limited context window).
///
/// Text of at most `limit` characters is returned unchanged. Longer text is cut to
/// its first `limit` characters and then back to the last complete line, and a
/// truncation notice is appended. If the kept prefix contains no newline, the prefix
/// itself is kept. The notice is added on top of `limit`.
///
/// - Parameters:
///   - text: Raw OCR text.
///   - limit: Maximum number of characters kept from `text`. Negative values are
///     treated as 0 instead of trapping.
/// - Returns: `text` itself, or the clipped prefix followed by the truncation notice.
static func clipOCR(_ text: String, limit: Int = 1800) -> String {
    let notice = "\n(后续内容过长已截断)"
    // prefix(_:) traps on a negative length, so clamp before measuring.
    let budget = max(0, limit)

    // A single bounded walk replaces `text.count`: nil means the text is shorter than
    // the budget, endIndex means it fits exactly. Either way nothing is clipped.
    guard let cut = text.index(text.startIndex, offsetBy: budget, limitedBy: text.endIndex),
          cut != text.endIndex else { return text }

    let head = text[..<cut]
    // Drop the trailing partial line so a row is never cut mid-value.
    if let lastNewline = head.lastIndex(of: "\n") {
        return String(head[..<lastNewline]) + notice
    }
    return String(head) + notice
}
|
||||
|
||||
private static let reportExtractionTemplate: String = #"""
|
||||
@@ -84,7 +110,7 @@ JSON schema(严格):
|
||||
输入: 一份春季体检,3 项可读
|
||||
输出:
|
||||
{"title":"春季年度体检","type":"checkup","report_date":"2026-04-12","institution":"协和医院","page_count":1,"summary":"血脂偏高、其他正常","indicators":[{"name":"低密度脂蛋白","value":"3.84","unit":"mmol/L","range":"< 3.40","status":"high","source_page":1,"source_box":[0.12,0.31,0.76,0.07]},{"name":"谷丙转氨酶","value":"32","unit":"U/L","range":"9 - 50","status":"normal","source_page":1,"source_box":[0.12,0.39,0.76,0.07]},{"name":"空腹血糖","value":"5.2","unit":"mmol/L","range":"3.9 - 6.1","status":"normal","source_page":1,"source_box":[0.12,0.47,0.76,0.07]}]}
|
||||
|
||||
{{OCR_SECTION}}
|
||||
现在请识别图片并输出 JSON:
|
||||
"""#
|
||||
|
||||
|
||||
@@ -1,5 +1,6 @@
|
||||
import Foundation
|
||||
import UIKit
|
||||
import ImageIO
|
||||
import SwiftData
|
||||
|
||||
/// VL 解析结果(已结构化,可直接喂 SwiftData 模型构造)。
|
||||
@@ -142,11 +143,14 @@ actor CaptureService {
|
||||
throw CaptureError.modelNotReady
|
||||
}
|
||||
let urls = assets.map { FileVault.shared.rootURL.appendingPathComponent($0.relativePath) }
|
||||
// OCR 参考(Vision 本地,<1s/页):给 2B 多模态当数字「抄写员」,降低小字误读。
|
||||
// 任何失败都静默回退为空串,绝不阻断识别主流程(§3.2)。
|
||||
let ocr = await Self.ocrReference(for: urls)
|
||||
let raw: String
|
||||
do {
|
||||
raw = try await AIRuntime.shared.analyzeReport(
|
||||
imageURLs: urls,
|
||||
prompt: VLPrompts.reportExtraction()
|
||||
prompt: VLPrompts.reportExtraction(ocrText: ocr)
|
||||
)
|
||||
} catch {
|
||||
throw CaptureError.inferenceFailed("\(error)")
|
||||
@@ -160,6 +164,20 @@ actor CaptureService {
|
||||
}
|
||||
}
|
||||
|
||||
/// Runs OCR page by page over the vault report images and stitches the results into
/// one reference text.
///
/// Only the first 4 pages are processed. A page whose image cannot be decoded, whose
/// OCR call throws, or whose text is blank is skipped. Returns "" when no page
/// produced text.
///
/// The CGImage is read straight through ImageIO rather than via UIImage, to avoid
/// passing a non-Sendable reference across actors.
///
/// NOTE(review): `CGImageSourceCreateImageAtIndex` does not apply EXIF orientation;
/// confirm that vault images are stored upright or that `OCRService` handles it.
private static func ocrReference(for urls: [URL]) async -> String {
    // Page headers are only useful when the report has more than one page.
    let labelPages = urls.count > 1
    var sections: [String] = []

    for (pageNumber, pageURL) in zip(1..., urls.prefix(4)) {
        guard let source = CGImageSourceCreateWithURL(pageURL as CFURL, nil) else { continue }
        guard let image = CGImageSourceCreateImageAtIndex(source, 0, nil) else { continue }
        // Best effort: an OCR failure on one page must not affect the others.
        guard let recognized = try? await OCRService.recognizeText(in: image) else { continue }
        if recognized.trimmingCharacters(in: .whitespacesAndNewlines).isEmpty { continue }

        sections.append(labelPages ? "【第 \(pageNumber) 页】\n\(recognized)" : recognized)
    }

    return sections.joined(separator: "\n")
}
|
||||
|
||||
// MARK: - JSON parse(static + 纯函数 → 方便单测)
|
||||
|
||||
/// 从 VL 输出里抠出第一段合法 JSON 对象并解析。
|
||||
|
||||
32
康康Tests/VLPromptsOCRTests.swift
Normal file
32
康康Tests/VLPromptsOCRTests.swift
Normal file
@@ -0,0 +1,32 @@
|
||||
import Testing
|
||||
@testable import 康康
|
||||
|
||||
/// Tests for OCR reference-text injection in `VLPrompts.reportExtraction` and for
/// the `VLPrompts.clipOCR` truncation helper.
struct VLPromptsOCRTests {

    /// With no OCR text the prompt has neither an OCR section nor a leftover placeholder.
    @Test func emptyOCRKeepsPromptClean() {
        let p = VLPrompts.reportExtraction(ocrText: "")
        #expect(!p.contains("OCR 参考文本"))
        #expect(!p.contains("{{OCR_SECTION}}"))
        #expect(p.contains("现在请识别图片并输出 JSON"))
    }

    /// OCR text is spliced in ahead of the final instruction line.
    @Test func ocrTextIsInjectedBeforeFinalInstruction() throws {
        let p = VLPrompts.reportExtraction(ocrText: "尿酸 486 208-428 μmol/L")
        #expect(p.contains("OCR 参考文本"))
        // #require reports a missing marker as a test failure; a force unwrap would
        // crash the whole test process instead.
        let ocrRange = try #require(p.range(of: "尿酸 486"))
        let endRange = try #require(p.range(of: "现在请识别图片并输出 JSON"))
        #expect(ocrRange.lowerBound < endRange.lowerBound)
    }

    /// Text within the limit is returned unchanged.
    @Test func clipKeepsShortTextIntact() {
        #expect(VLPrompts.clipOCR("短文本") == "短文本")
    }

    /// Over-long text is cut at a line boundary and ends with the truncation notice.
    @Test func clipCutsAtLineBoundary() {
        let line = "指标行 1.23 mmol/L"
        let long = Array(repeating: line, count: 400).joined(separator: "\n")
        let clipped = VLPrompts.clipOCR(long, limit: 200)
        #expect(clipped.count < 260)
        #expect(clipped.hasSuffix("(后续内容过长已截断)"))

        // Every retained line before the notice must be a complete input line.
        let keptLines = clipped.split(separator: "\n").dropLast()
        #expect(!keptLines.isEmpty)
        #expect(keptLines.allSatisfy { $0 == line })
    }
}
|
||||
Reference in New Issue
Block a user