```

feat(iOS): 更新MNN后端模型配置，优化性能

将MNN主模型从Qwen3.5-4B(~2.64GiB)降级为Qwen3.5-2B(~1.1GiB)，因为4B版本实测运行过慢，影响用户体验。iPhone17+/SME2设备使用2B模型，保留MLX兜底方案用于模拟器和备用场景，确保AI推理性能和存储效率的平衡。
```
2026-06-09 22:20:07 +08:00
parent ca5a3fa38b
commit b79ae54b7b
40 changed files with 1327 additions and 452 deletions
--- a/康康/AI/MNN/MNNLLMBridge.mm
+++ b/康康/AI/MNN/MNNLLMBridge.mm
@@ -127,6 +127,22 @@ private:
    _cancel = false;
    _llm = Llm::createLLM(std::string(configPath.UTF8String));
    if (_llm == nullptr) return nil;
+    // load 前以 merge-patch 调三件事(只翻这几个叶子,保留 chat_template 等其余配置):
+    // ① enable_thinking=false:config.json 默认 true,模板会给每个 assistant 回合硬塞
+    //    <think>\n 开启思考,吞掉 token 预算并污染 JSON(prompt 里的 /no_think 对此模板无效)。
+    // ② 降温:config.json 默认 temperature=1.0 对结构化 JSON 太高,随机性大→经常吐成非 JSON。
+    //    本 App 所有任务都是"直答/JSON",压到 0.3 + topP 0.85 让输出更确定、JSON 更稳。
+    // ③ 重复惩罚:MNN 默认 mixed_samplers 不含 "penalty"、penalty/ngram_factor=1.0(全关),
+    //    叠加低温 → 长文本(如「关键指标」列表)会陷入逐行复读死循环(收缩压 107 mmHg ×N)。
+    //    显式把 "penalty" 放进 mixed 链首,开 repetition penalty(1.1)+ n-gram 惩罚(ngram_factor 1.05):
+    //    n-gram 命中整段重复时惩罚升到 max_penalty,直接掐断逐行复读。
+    _llm->set_config("{"
+                     "\"jinja\":{\"context\":{\"enable_thinking\":false}},"
+                     "\"sampler_type\":\"mixed\","
+                     "\"mixed_samplers\":[\"penalty\",\"topK\",\"topP\",\"temperature\"],"
+                     "\"temperature\":0.3,\"topP\":0.85,\"topK\":40,"
+                     "\"penalty\":1.1,\"n_gram\":8,\"ngram_factor\":1.05"
+                     "}");
    _loaded = _llm->load();
    if (!_loaded) { Llm::destroy(_llm); _llm = nullptr; return nil; }
    return self;