feat(iOS): 更新MNN后端模型配置优化性能

将MNN主模型从Qwen3.5-4B(~2.64GiB)降级为Qwen3.5-2B(~1.1GiB),因为4B版本
实测运行过慢,影响用户体验。iPhone17+/SME2设备使用2B模型,保留MLX
兜底方案用于模拟器和备用场景,确保AI推理性能和存储效率的平衡。
```
This commit is contained in:
link2026
2026-06-09 22:20:07 +08:00
parent ca5a3fa38b
commit b79ae54b7b
40 changed files with 1327 additions and 452 deletions

View File

@@ -33,11 +33,11 @@ actor AIRuntime {
private var vlSession: VLSession?
// MARK: - MNN (CPU/SME2,)
// .mnn , VL() Qwen3.5-4B MNN ()
// .mnn , VL() Qwen3.5-2B MNN ()
// MNN,VL 退 MLX Qwen3-VL-4B
private let mnn = MNNBackend()
private(set) var mnnStatus: Status = .notReady
/// MNN (/ Models/Qwen3.5-4B-MNN)
/// MNN (/ Models/Qwen3.5-2B-MNN)
nonisolated static var mnnModelFolder: URL {
ModelStore.shared.localURL(for: .mnnLLM)
}
@@ -266,7 +266,7 @@ actor AIRuntime {
}
if vlStatus == .ready { return }
// MLX VL .llm Qwen3.5-4B (VLMModelFactory qwen3_5 ),
// MLX VL .llm Qwen3.5-2B (VLMModelFactory qwen3_5 ),
// Qwen3-VL-4B isComplete ,
guard ModelStore.shared.isComplete(for: .llm) else {
vlStatus = .error("VL 模型未就绪")
@@ -274,7 +274,7 @@ actor AIRuntime {
}
// :( LLM ), LLM + VL
// App 退
// App 退
await acquireGate()
defer { releaseGate() }
if vlStatus == .ready { return }

View File

@@ -26,16 +26,52 @@ nonisolated enum InferenceEngine: String, CaseIterable, Sendable {
private static let key = "kk.inferenceEngine"
/// /退 .mlx() .mnn
/// ( .auto)使
/// AIRuntime / MeView , .mnn .mlx
/// ,
static var current: InferenceEngine {
get {
let raw = UserDefaults.standard.string(forKey: key)
let chosen = raw.flatMap(InferenceEngine.init(rawValue:)) ?? .mnn
return chosen.isAvailable ? chosen : .mlx
}
set { UserDefaults.standard.set(newValue.rawValue, forKey: key) }
let resolved = preference.resolved
return resolved.isAvailable ? resolved : .mlx
}
/// :CPU SME2(A19/iPhone17+) UI
static var cpuSupportsSME2: Bool { MNNLLMBridge.cpuSupportsSME2() }
// MARK: - (auto / mnn / mlx)
/// .auto:
/// UserDefaults key "mnn"/"mlx"
static var preference: EnginePreference {
get {
let raw = UserDefaults.standard.string(forKey: key)
return raw.flatMap(EnginePreference.init(rawValue:)) ?? .auto
}
set { UserDefaults.standard.set(newValue.rawValue, forKey: key) }
}
}
/// , .auto
/// - auto: MNN(, SME2/NEON),
/// MNN ()退 MLX
nonisolated enum EnginePreference: String, CaseIterable, Sendable {
case auto
case mnn
case mlx
var displayName: String {
switch self {
case .auto: return "自动"
case .mnn: return InferenceEngine.mnn.displayName
case .mlx: return InferenceEngine.mlx.displayName
}
}
/// (, `InferenceEngine.current`)
var resolved: InferenceEngine {
switch self {
case .mnn: return .mnn
case .mlx: return .mlx
case .auto: return InferenceEngine.mnn.isAvailable ? .mnn : .mlx
}
}
}

View File

@@ -45,10 +45,16 @@ actor LLMSession {
let task = Task {
do {
try await Self.withDeviceOverride {
// : App "/JSON ", JSON
// 0.3 + topP 0.85 JSON ( MNN set_config )
// repetitionPenalty: + ,()
// ;1.1 + 64 token ( MNN penalty )
let parameters = GenerateParameters(
maxTokens: maxTokens,
temperature: Float(0.6),
topP: Float(0.9)
temperature: Float(0.3),
topP: Float(0.85),
repetitionPenalty: Float(1.1),
repetitionContextSize: 64
)
try await container.perform { (context: ModelContext) in

View File

@@ -127,6 +127,22 @@ private:
_cancel = false;
_llm = Llm::createLLM(std::string(configPath.UTF8String));
if (_llm == nullptr) return nil;
// load 前以 merge-patch 调三件事(只翻这几个叶子,保留 chat_template 等其余配置):
// ① enable_thinking=false:config.json 默认 true,模板会给每个 assistant 回合硬塞
// <think>\n 开启思考,吞掉 token 预算并污染 JSON(prompt 里的 /no_think 对此模板无效)。
// ② 降温:config.json 默认 temperature=1.0 对结构化 JSON 太高,随机性大→经常吐成非 JSON。
// 本 App 所有任务都是"直答/JSON",压到 0.3 + topP 0.85 让输出更确定、JSON 更稳。
// ③ 重复惩罚:MNN 默认 mixed_samplers 不含 "penalty"、penalty/ngram_factor=1.0(全关),
// 叠加低温 → 长文本(如「关键指标」列表)会陷入逐行复读死循环(收缩压 107 mmHg ×N)。
// 显式把 "penalty" 放进 mixed 链首,开 repetition penalty(1.1)+ n-gram 惩罚(ngram_factor 1.05):
// n-gram 命中整段重复时惩罚升到 max_penalty,直接掐断逐行复读。
_llm->set_config("{"
"\"jinja\":{\"context\":{\"enable_thinking\":false}},"
"\"sampler_type\":\"mixed\","
"\"mixed_samplers\":[\"penalty\",\"topK\",\"topP\",\"temperature\"],"
"\"temperature\":0.3,\"topP\":0.85,\"topK\":40,"
"\"penalty\":1.1,\"n_gram\":8,\"ngram_factor\":1.05"
"}");
_loaded = _llm->load();
if (!_loaded) { Llm::destroy(_llm); _llm = nullptr; return nil; }
return self;

View File

@@ -3,7 +3,7 @@ import Foundation
/// MNN(CPU / SME2), `MNNLLMBridge`
/// `LLMSession`/`VLSession` actor ; `AIRuntime`
///
/// () Qwen3.5-4B MNN :`generate` ,
/// () Qwen3.5-2B MNN :`generate` ,
/// `analyze` <img> Omni imread ( OMNI ,xcframework )
/// ,; MNN,VL 退 MLX( `AIRuntime`)
actor MNNBackend {

View File

@@ -18,18 +18,20 @@ nonisolated enum ModelManifest {
static func files(for kind: ModelKind) -> [ModelFile] {
switch kind {
case .llm:
// Qwen3.5-4B-4bit:,MLX (LLMModelFactory qwen3_5 )
// (VLMModelFactory qwen3_5) mlx-community/Qwen3.5-4B-4bit
// blob (HF API,2026-06 )(),
// README.md / .gitattributes
// Qwen3.5-2B-4bit:, LLMModelFactory qwen3_5
// mlx-community/Qwen3.5-2B-4bit blob (HF API,2026-06 )
// tokenizer vocab.json + tokenizer.json( merges.txt /
// special_tokens_map.json / added_tokens.json),chat_template .jinja
// (preprocessor / processor / video_preprocessor),
// ,
return [
ModelFile(path: "config.json", bytes: 3_366),
ModelFile(path: "model.safetensors", bytes: 3_034_300_695),
ModelFile(path: "model.safetensors.index.json", bytes: 101_944),
ModelFile(path: "config.json", bytes: 3_113),
ModelFile(path: "model.safetensors", bytes: 1_722_271_785),
ModelFile(path: "model.safetensors.index.json", bytes: 81_722),
ModelFile(path: "tokenizer.json", bytes: 19_989_343),
ModelFile(path: "tokenizer_config.json", bytes: 1_139),
ModelFile(path: "vocab.json", bytes: 6_722_759),
ModelFile(path: "chat_template.jinja", bytes: 7_756),
ModelFile(path: "chat_template.jinja", bytes: 7_755),
ModelFile(path: "preprocessor_config.json", bytes: 390),
ModelFile(path: "processor_config.json", bytes: 1_300),
ModelFile(path: "video_preprocessor_config.json", bytes: 385),
@@ -58,18 +60,17 @@ nonisolated enum ModelManifest {
ModelFile(path: "video_preprocessor_config.json", bytes: 817),
]
case .mnnLLM:
// taobao-mnn/Qwen3.5-4B-MNN MNN (HF API ,2026-06)
// taobao-mnn/Qwen3.5-2B-MNN MNN (HF API ,2026-06)
// :config.json(MNN llm )+ llm_config.json()+ llm.mnn()
// + llm.mnn.weight( ~2.45GB)+ tokenizer.txt + visual.mnn/visual.mnn.weight(,
// mllm,) README/.gitattributes dump
// + llm.mnn.weight( ~1.1GB)+ tokenizer.txt + visual.mnn(, mllm)
// README/.gitattributes dump(llm.mnn.json / export_args.json)
return [
ModelFile(path: "config.json", bytes: 652),
ModelFile(path: "llm_config.json", bytes: 8_693),
ModelFile(path: "llm.mnn", bytes: 3_651_096),
ModelFile(path: "llm.mnn.weight", bytes: 2_629_387_626),
ModelFile(path: "llm_config.json", bytes: 8_692),
ModelFile(path: "llm.mnn", bytes: 2_148_136),
ModelFile(path: "llm.mnn.weight", bytes: 1_176_647_702),
ModelFile(path: "tokenizer.txt", bytes: 6_465_727),
ModelFile(path: "visual.mnn", bytes: 488_096),
ModelFile(path: "visual.mnn.weight", bytes: 196_768_960),
]
}
}

View File

@@ -2,19 +2,19 @@ import Foundation
nonisolated enum ModelKind: String, CaseIterable {
/// Models/ / CDN
/// Qwen3.5-4B,:
/// - mnnLLM:MNN(CPU/SME2,)+,taobao-mnn ,
/// - llm:MLX(GPU),Qwen3.5-4B-4bit (, qwen3_5)
/// Qwen3.5-2B,:
/// - mnnLLM:MNN(CPU/SME2,)+,taobao-mnn iPhone17+(A19/SME2),
/// - llm:MLX(GPU),Qwen3.5-2B-4bit (, qwen3_5)
/// - vl:(MLX VL .llm ), switch,/
case llm = "Qwen3.5-4B-4bit"
case llm = "Qwen3.5-2B-4bit"
case vl = "Qwen3-VL-4B-Instruct-4bit"
case mnnLLM = "Qwen3.5-4B-MNN"
case mnnLLM = "Qwen3.5-2B-MNN"
var displayName: String {
switch self {
case .llm: return "Qwen3.5-4B (MLX)"
case .llm: return "Qwen3.5-2B (MLX)"
case .vl: return "Qwen3-VL-4B"
case .mnnLLM: return "Qwen3.5-4B (MNN/SME2)"
case .mnnLLM: return "Qwen3.5-2B (MNN/SME2)"
}
}
@@ -25,7 +25,7 @@ nonisolated enum ModelKind: String, CaseIterable {
var sentinelFilename: String { "config.json" }
/// : / /
/// Qwen3.5-4B(MNN,+,)
/// Qwen3.5-2B(MNN,+,iPhone17+ )
/// MLX .llm/.vl ,(),
/// · ,
static let userFacing: [ModelKind] = [.mnnLLM]

View File

@@ -88,9 +88,9 @@ JSON schema(严格):
现在请识别图片并输出 JSON:
"""#
// MARK: - ()
// MARK: - ()
/// :/****()
/// :/****()
/// indicators ,// , Report
static func regionExtraction(today: Date = .now) -> String {
let f = DateFormatter()