From fbd28015c203984c78a2ba907da3ef1e5570a331 Mon Sep 17 00:00:00 2001
From: Daniel Han <danielhanchen@gmail.com>
Date: Thu, 7 May 2026 10:39:44 +0000
Subject: [PATCH 1/4] Studio: pin GPU at 95% of free VRAM and warn on silent
 CPU fallback

Two related runtime-side fixes for unslothai/unsloth#5106 ("model
loaded fully on RAM instead of VRAM"):

1. GPU pin threshold bump 0.90 -> 0.95
-------------------------------------

``_select_gpus`` and the auto-ctx pin loop in ``load_model``
used a ``pool * 0.90`` threshold to decide whether the model fits on
GPU. Models that needed 91-94% of free VRAM were classified as "does
not fit", so Studio set ``gpu_indices = None`` and shipped
``--fit on`` to llama-server without ``-ngl``. The unsloth
llama.cpp fork's ``--fit on`` then ran with its default
``--fit-target 1024`` (1 GiB margin per device, an upstream default
inherited from ggml-org#18679). On a tight fit where compute
buffers + CUDA context push the projected free below the 1 GiB
target, the fork's fit logic shaves layer weights off the GPU --
slow inference for users whose models would have loaded comfortably
with ``-ngl -1``.

The classic reproducer from #5106 (noahterbest's log):

    GGUF size: 20.8 GB, est. KV cache: 0.1 GB, context: 4096,
    GPUs free: [(0, 22805)], selected: None, fit: True

20.8 GiB on an RTX 4090 with 22.27 GiB free is 94% utilization. The
model fits (1.4 GiB headroom), but the 0.90 threshold kicked it into
fit mode. Bumping to 0.95 keeps such models in the fits-on-GPU branch and
emits ``-ngl -1`` directly. The fork's ``--fit on`` still serves as
the safety net for the genuinely-too-large case.

The auto-ctx fallback also re-checks fit at 4096 before handing off
to ``--fit on``: a 20.8 GiB model with a 131072 native context fails
the auto loop at native ctx, falls back to ``min(4096, ctx)``, but
its weights + 4096 KV pin to the GPU comfortably. Without the
re-check we still emitted ``--fit on``.

``_fit_context_to_vram``'s 0.90 budget for context binary search is
intentionally left tighter than the pin fraction. That routine
chooses the slider value, where over-promising would OOM at runtime.
``_select_gpus`` decides whether to pin at all, where being
conservative pushes layers to CPU.

2. Belt-and-suspenders: warn on silent CPU fallback
---------------------------------------------------

After ``_wait_for_health`` succeeds, scan llama-server's stdout for
``model buffer size`` lines. If Studio detected GPUs and intended
GPU use but only CPU buffers were allocated, log a structured
warning citing #5106. Markers cover CUDA / ROCm / Metal / Vulkan /
OpenCL / SYCL backends. New ``_gpu_offload_active: Optional[bool]``
field surfaces the result for any future API consumer.

This catches runtime-load failures the install-time fix cannot
cover (cudart bundle pairing PR #5322 is the install-side
companion): user overriding ``--fit-target``, uncommon driver +
toolkit configurations, future regressions in the install path.

Tests: 10 new cases in studio/backend/tests/test_llama_cpp_context_fit.py:
* TestTightFitPinsToGPU x3: noahterbest's exact reproducer (auto and
  explicit ctx pins to GPU at 94%); guard against threshold over-
  broadening (genuine overflow still falls back to ``--fit on``).
* TestClassifyGpuOffload x7: CUDA / ROCm / Metal buffer markers
  return True; CPU-only buffer lines return False; absent buffer
  lines, no GPUs detected, or GPU use not intended return None (no
  warning).

25 context-fit tests pass (15 baseline + 10 new). 511 tests total
across the affected test files. No regressions.

Refs #5106
---
 studio/backend/core/inference/llama_cpp.py    | 147 ++++++++++++++--
 .../tests/test_llama_cpp_context_fit.py       | 159 +++++++++++++++++-
 2 files changed, 289 insertions(+), 17 deletions(-)

diff --git a/studio/backend/core/inference/llama_cpp.py b/studio/backend/core/inference/llama_cpp.py
index 8da836de38..2080f68f5a 100644
--- a/studio/backend/core/inference/llama_cpp.py
+++ b/studio/backend/core/inference/llama_cpp.py
@@ -433,6 +433,10 @@ def __init__(self):
         self._hf_variant: Optional[str] = None
         self._is_vision: bool = False
         self._healthy = False
+        # True/False once the post-load classifier sees evidence; None
+        # when there's no signal (no GPU detected, or llama-server's
+        # buffer-allocation log was absent / unparseable).
+        self._gpu_offload_active: Optional[bool] = None
         self._context_length: Optional[int] = None
         self._effective_context_length: Optional[int] = None
         self._max_context_length: Optional[int] = None
@@ -951,6 +955,20 @@ def _get_gpu_free_memory() -> list[tuple[int, int]]:
             logger.debug(f"torch GPU probe failed: {e}")
             return []
 
+    # Fraction of free GPU VRAM Studio is willing to pin a model into
+    # before it falls back to ``--fit on`` (and lets the unsloth llama.cpp
+    # fork's fit logic decide how many layers to offload). 0.95 leaves
+    # ~5% of free VRAM for the CUDA context, compute buffers, flash-attn
+    # workspace, and other per-launch overhead. 0.90 is too conservative
+    # at this layer: when a model needs ~92-94% of free VRAM (issue
+    # #5106 / Discord "RAM not VRAM"), Studio used to flip ``use_fit=True``
+    # without ``-ngl``, the fork's fit logic ran with its default
+    # ``--fit-target 1024`` (1 GiB margin) and pushed substantial layer
+    # weight off the GPU even though the model would have loaded
+    # comfortably. The fork's own fit logic still serves as the safety
+    # net for the genuinely-too-large case.
+    _GPU_PIN_VRAM_FRACTION = 0.95
+
     @staticmethod
     def _select_gpus(
         model_size_bytes: int,
@@ -959,11 +977,11 @@ def _select_gpus(
         """Pick GPU(s) for a model based on estimated VRAM and free memory.
 
         ``model_size_bytes`` should include both model weights and estimated
-        KV cache.  The 90% threshold provides headroom for compute buffers,
-        CUDA context, and other runtime overhead.
+        KV cache.  The ``_GPU_PIN_VRAM_FRACTION`` threshold provides headroom
+        for compute buffers, CUDA context, and other runtime overhead.
 
         Returns (gpu_indices, use_fit):
-          - ([1], False)       model fits on 1 GPU at 90% of free
+          - ([1], False)       model fits on 1 GPU at the headroom threshold
           - ([1, 2], False)    model needs 2 GPUs
           - (None, True)       model too large, let --fit handle it
         """
@@ -971,12 +989,13 @@ def _select_gpus(
             return None, True
 
         model_size_mib = model_size_bytes / (1024 * 1024)
+        usable_fraction = LlamaCppBackend._GPU_PIN_VRAM_FRACTION
 
         # Sort GPUs by free memory descending
         ranked = sorted(gpus, key = lambda g: g[1], reverse = True)
 
-        # Try fitting on 1 GPU (90% of free memory threshold)
-        if ranked[0][1] * 0.90 >= model_size_mib:
+        # Try fitting on 1 GPU at the usable-VRAM threshold.
+        if ranked[0][1] * usable_fraction >= model_size_mib:
             return [ranked[0][0]], False
 
         # Try fitting on N GPUs (accumulate free memory from most-free)
@@ -984,7 +1003,7 @@ def _select_gpus(
         selected = []
         for idx, free_mib in ranked:
             selected.append(idx)
-            cumulative += free_mib * 0.90
+            cumulative += free_mib * usable_fraction
             if cumulative >= model_size_mib:
                 return sorted(selected), False
 
@@ -1217,10 +1236,15 @@ def _fit_context_to_vram(
     ) -> int:
         """Return the largest context length that fits in GPU VRAM.
 
-        Uses 90% of available VRAM as the budget (matching _select_gpus
-        threshold -- 10% reserved for compute buffers, CUDA context,
-        scratch space, flash-attn workspace, etc.).
-        If the model weights alone don't fit, returns min_ctx unchanged.
+        Uses 90% of available VRAM as the budget for the context-fitting
+        binary search (10% reserved for compute buffers, CUDA context,
+        scratch space, flash-attn workspace, etc.). The ctx-fit budget is
+        intentionally tighter than ``_GPU_PIN_VRAM_FRACTION``: this routine
+        chooses the slider/auto context value, where over-promising would
+        OOM at runtime; ``_select_gpus`` decides whether to pin a GPU at
+        all, where being conservative pushes layers to CPU instead.
+        If the model weights alone don't fit, returns ``requested_ctx``
+        unchanged and lets ``--fit on`` flex ``-ngl`` at runtime.
 
         ``kv_on_gpu`` mirrors ``--kv-offload`` (default on). When False
         the KV cache lives in CPU RAM and doesn't compete with weights
@@ -1966,6 +1990,7 @@ def load_model(
             # still has valid state to publish.
             effective_ctx = n_ctx if n_ctx > 0 else (self._context_length or 0)
             max_available_ctx = self._context_length or effective_ctx
+            gpus: list[tuple[int, int]] = []
             try:
                 model_size = self._get_gguf_size_bytes(model_path)
                 gpus = self._get_gpu_free_memory()
@@ -2050,7 +2075,15 @@ def load_model(
                         # No silent shrink: effective_ctx stays == n_ctx.
                     else:
                         # Auto context: prefer fewer GPUs, cap context to fit.
+                        # Match _select_gpus's headroom threshold so a model
+                        # that fits at 91-95% of free VRAM still pins to GPU
+                        # rather than falling through to ``--fit on`` (issue
+                        # #5106). The ctx cap from ``_fit_context_to_vram``
+                        # uses a more conservative 90% budget so the slider
+                        # value we land on still leaves room for compute
+                        # buffers / CUDA context overhead.
                         ranked = sorted(gpus, key = lambda g: g[1], reverse = True)
+                        pin_fraction = self._GPU_PIN_VRAM_FRACTION
                         for n_gpus in range(1, len(ranked) + 1):
                             subset = ranked[:n_gpus]
                             pool_mib = sum(free for _, free in subset)
@@ -2065,18 +2098,40 @@ def load_model(
                                 capped, cache_type_kv, n_parallel = n_parallel
                             )
                             total_mib = (model_size + kv) / (1024 * 1024)
-                            if total_mib <= pool_mib * 0.90:
+                            if total_mib <= pool_mib * pin_fraction:
                                 effective_ctx = capped
                                 gpu_indices = sorted(idx for idx, _ in subset)
                                 use_fit = False
                                 break
                         else:
-                            # No subset can host the weights (weights alone
-                            # exceed 90% of every pool). Per spec, default
-                            # the UI-visible context to 4096 and let
-                            # --fit on flex -ngl so llama-server offloads
-                            # layers to CPU RAM.
+                            # No subset can host the weights at the native
+                            # context. Default the UI-visible context to
+                            # 4096 -- but before handing off to ``--fit on``,
+                            # re-check whether the model fits at the smaller
+                            # context (issue #5106). Without this re-check,
+                            # a 20.8 GiB model with a 131072 native context
+                            # on a 22.8 GiB GPU is correctly classified as
+                            # "doesn't fit at native ctx", but then we'd
+                            # ship ``--fit on`` without ``-ngl`` even though
+                            # the same model + a 4096 KV cache pins
+                            # comfortably on the GPU.
                             effective_ctx = min(4096, effective_ctx)
+                            if effective_ctx > 0:
+                                for n_gpus in range(1, len(ranked) + 1):
+                                    subset = ranked[:n_gpus]
+                                    pool_mib = sum(free for _, free in subset)
+                                    kv = self._estimate_kv_cache_bytes(
+                                        effective_ctx,
+                                        cache_type_kv,
+                                        n_parallel = n_parallel,
+                                    )
+                                    total_mib = (model_size + kv) / (1024 * 1024)
+                                    if total_mib <= pool_mib * pin_fraction:
+                                        gpu_indices = sorted(
+                                            idx for idx, _ in subset
+                                        )
+                                        use_fit = False
+                                        break
 
                 elif gpus:
                     # Can't estimate KV -- fall back to file-size-only check.
@@ -2500,12 +2555,72 @@ def load_model(
 
             self._healthy = True
 
+            # Diagnose silent CPU fallback: if Studio detected GPUs but
+            # llama-server allocated only CPU model buffers (no CUDA0,
+            # ROCm0, Metal, etc. buffer line in its startup log), the
+            # prebuilt binary couldn't load its GPU backend at runtime.
+            # On Windows this is the unslothai/unsloth#5106 symptom --
+            # cudart64_X.dll / cublas64_X.dll missing because the user
+            # has no system CUDA toolkit and Studio used to ship without
+            # the cudart bundle. Fixed at install time (paired runtime
+            # archive) + warned here as a belt-and-suspenders for any
+            # other runtime-load failure.
+            self._gpu_offload_active = self._classify_gpu_offload(
+                gpu_indices is not None or use_fit, gpus or []
+            )
+            if self._gpu_offload_active is False:
+                logger.warning(
+                    "llama-server appears to have loaded the model entirely "
+                    "on CPU even though Studio detected at least one GPU. "
+                    "This usually means the prebuilt binary's GPU backend "
+                    "failed to load -- on Windows, cudart64_X.dll / "
+                    "cublas64_X.dll could not be resolved. Reinstall the "
+                    "Studio llama.cpp prebuilt or install a matching CUDA "
+                    "toolkit (issue unslothai/unsloth#5106).",
+                )
+
             logger.info(
                 f"llama-server ready on port {self._port} "
                 f"for model '{model_identifier}'"
             )
             return True
 
+    def _classify_gpu_offload(
+        self,
+        expected_gpu: bool,
+        detected_gpus: list[tuple[int, int]],
+    ) -> Optional[bool]:
+        """Return True if llama-server allocated GPU model buffers, False
+        if it allocated only CPU buffers (silent CPU fallback), or None
+        when there is no signal to classify (no GPU detected, or the
+        startup log didn't include a model-buffer-size line).
+
+        ``expected_gpu`` mirrors Studio's intent: True when we either
+        pinned a GPU (-ngl) or asked the fork to fit on GPU. We only
+        warn when the user expected GPU AND we have probe evidence that
+        it didn't happen -- otherwise stay silent.
+        """
+        if not detected_gpus or not expected_gpu:
+            return None
+        # llama-server prints one ``... model buffer size = ... MiB``
+        # line per backend buffer the model lives in. Backend names
+        # include ``CUDA0`` / ``CUDA_Host`` for NVIDIA, ``ROCm0`` for
+        # AMD, ``Metal`` for Apple, ``Vulkan0`` for Vulkan. ``CPU``
+        # / ``CPU_Mapped`` / ``CPU_AARCH64`` indicate CPU allocations.
+        gpu_markers = ("CUDA", "ROCm", "Metal", "Vulkan", "OpenCL", "SYCL")
+        saw_buffer_line = False
+        saw_gpu_buffer = False
+        for line in self._stdout_lines:
+            if "model buffer size" not in line:
+                continue
+            saw_buffer_line = True
+            if any(marker in line for marker in gpu_markers):
+                saw_gpu_buffer = True
+                break
+        if not saw_buffer_line:
+            return None
+        return saw_gpu_buffer
+
     def unload_model(self) -> bool:
         """Terminate the llama-server subprocess and cancel any in-flight download."""
         self._cancel_event.set()
diff --git a/studio/backend/tests/test_llama_cpp_context_fit.py b/studio/backend/tests/test_llama_cpp_context_fit.py
index caa6397901..93ccf77156 100644
--- a/studio/backend/tests/test_llama_cpp_context_fit.py
+++ b/studio/backend/tests/test_llama_cpp_context_fit.py
@@ -192,6 +192,7 @@ def fake_estimate(n_ctx_, _type = None, **_kwargs):
         else:
             ranked = sorted(gpus, key = lambda g: g[1], reverse = True)
             matched = False
+            pin_fraction = LlamaCppBackend._GPU_PIN_VRAM_FRACTION
             for n_gpus in range(1, len(ranked) + 1):
                 subset = ranked[:n_gpus]
                 pool_mib = sum(free for _, free in subset)
@@ -203,7 +204,7 @@ def fake_estimate(n_ctx_, _type = None, **_kwargs):
                 )
                 kv = inst._estimate_kv_cache_bytes(capped, cache_type_kv)
                 total_mib = (model_size + kv) / (1024 * 1024)
-                if total_mib <= pool_mib * 0.90:
+                if total_mib <= pool_mib * pin_fraction:
                     effective_ctx = capped
                     gpu_indices = sorted(idx for idx, _ in subset)
                     use_fit = False
@@ -211,6 +212,20 @@ def fake_estimate(n_ctx_, _type = None, **_kwargs):
                     break
             if not matched:
                 effective_ctx = min(FALLBACK_CTX, effective_ctx)
+                # Mirror llama_cpp.py: after dropping ctx to FALLBACK_CTX,
+                # re-check whether the model fits with the smaller KV cache.
+                if effective_ctx > 0:
+                    for n_gpus in range(1, len(ranked) + 1):
+                        subset = ranked[:n_gpus]
+                        pool_mib = sum(free for _, free in subset)
+                        kv = inst._estimate_kv_cache_bytes(
+                            effective_ctx, cache_type_kv
+                        )
+                        total_mib = (model_size + kv) / (1024 * 1024)
+                        if total_mib <= pool_mib * pin_fraction:
+                            gpu_indices = sorted(idx for idx, _ in subset)
+                            use_fit = False
+                            break
     elif gpus:
         gpu_indices, use_fit = inst._select_gpus(model_size, gpus)
         if use_fit and not explicit_ctx:
@@ -378,6 +393,67 @@ def test_no_kv_metadata_fittable_auto(self):
         assert plan["gpu_indices"] == [0]
 
 
+# ---------------------------------------------------------------------------
+# Issue #5106 / Discord "RAM not VRAM" regression: a model whose weights
+# occupy ~92-94% of free VRAM must still pin to GPU. Before the threshold
+# bump from 0.90 to 0.95 this case fell through to ``--fit on`` without
+# ``-ngl``, and the unsloth llama.cpp fork's fit logic (default
+# ``--fit-target 1024``) ended up offloading layers to CPU even though the
+# model would have loaded comfortably with all layers on GPU.
+# ---------------------------------------------------------------------------
+
+
+class TestTightFitPinsToGPU:
+    """Models that fit at 91-95% of free VRAM must use the GPU."""
+
+    def test_rtx_4090_qwen_24gb_class(self):
+        # noahterbest's reproducer in unslothai/unsloth#5106:
+        #   "GGUF size: 20.8 GB, est. KV cache: 0.1 GB, context: 4096,
+        #    GPUs free: [(0, 22805)], selected: None, fit: True"
+        # With ctx=4096, the model + KV occupies ~94% of free VRAM. The
+        # remaining ~1.4 GiB headroom is enough for the CUDA context and
+        # compute buffers on a 4090, so Studio should pin the GPU and
+        # offload all layers via ``-ngl -1`` instead of relying on the
+        # fork's fit logic.
+        plan = _drive(
+            n_ctx = 0,
+            model_gib = 20.8,
+            gpus = [(0, 22_805)],
+            native_ctx = 131072,
+            kv_per_token_bytes = 25_000,
+        )
+        assert plan["use_fit"] is False
+        assert plan["gpu_indices"] == [0]
+
+    def test_explicit_ctx_at_94_pct_pins_to_gpu(self):
+        # Same shape as above, but the user explicitly chose a context
+        # length. The explicit-ctx branch goes through ``_select_gpus``
+        # which must agree with the auto-ctx branch on the headroom rule.
+        plan = _drive(
+            n_ctx = 4096,
+            model_gib = 20.8,
+            gpus = [(0, 22_805)],
+            native_ctx = 131072,
+            kv_per_token_bytes = 25_000,
+        )
+        assert plan["use_fit"] is False
+        assert plan["gpu_indices"] == [0]
+
+    def test_genuine_overflow_still_uses_fit(self):
+        # Above the 95% pin threshold the fork's ``--fit on`` is still
+        # the right answer; we don't want the threshold bump to mask a
+        # truly oversized model as "fits".
+        plan = _drive(
+            n_ctx = 4096,
+            model_gib = 23,
+            gpus = [(0, 22_000)],
+            native_ctx = 131072,
+            kv_per_token_bytes = 25_000,
+        )
+        assert plan["use_fit"] is True
+        assert plan["gpu_indices"] is None
+
+
 # ---------------------------------------------------------------------------
 # Platform-agnostic input shape
 # ---------------------------------------------------------------------------
@@ -391,3 +467,84 @@ def test_identical_decision_across_platforms(platform_tag):
     plan_a = _drive(n_ctx = 0, model_gib = 8, gpus = [(0, 24_000)])
     plan_b = _drive(n_ctx = 0, model_gib = 8, gpus = [(0, 24_000)])
     assert plan_a == plan_b, platform_tag
+
+
+# ---------------------------------------------------------------------------
+# _classify_gpu_offload: detect silent CPU fallback (issue #5106 / Discord
+# "RAM not VRAM"). When Studio detected GPUs and intended to use them, but
+# llama-server allocated only CPU model buffers, the prebuilt's GPU
+# backend failed to load -- usually missing cudart64_X.dll on Windows.
+# ---------------------------------------------------------------------------
+
+
+class TestClassifyGpuOffload:
+    def _backend(self, stdout_lines):
+        inst = LlamaCppBackend.__new__(LlamaCppBackend)
+        inst._stdout_lines = list(stdout_lines)
+        return inst
+
+    def test_cuda_buffer_present_returns_true(self):
+        inst = self._backend(
+            [
+                "load_tensors: offloaded 33/33 layers to GPU",
+                "load_tensors:        CUDA0 model buffer size = 21000.0 MiB",
+                "load_tensors:   CPU_Mapped model buffer size =     0.6 MiB",
+            ]
+        )
+        assert inst._classify_gpu_offload(True, [(0, 22805)]) is True
+
+    def test_cpu_only_buffer_returns_false(self):
+        # llama-server printed buffer lines but only CPU buffers --
+        # this is the silent CPU fallback symptom we want to catch.
+        inst = self._backend(
+            [
+                "load_tensors:   CPU_Mapped model buffer size = 21000.0 MiB",
+                "load_tensors:          CPU model buffer size =     0.6 MiB",
+            ]
+        )
+        assert inst._classify_gpu_offload(True, [(0, 22805)]) is False
+
+    def test_no_buffer_lines_returns_none(self):
+        # If we can't see buffer-allocation lines at all, don't guess.
+        inst = self._backend(
+            [
+                "INFO [main] starting server",
+                "load_tensors: file format = GGUF V3",
+            ]
+        )
+        assert inst._classify_gpu_offload(True, [(0, 22805)]) is None
+
+    def test_no_gpus_detected_returns_none(self):
+        # CPU-only systems are valid; suppress the warning entirely.
+        inst = self._backend(
+            [
+                "load_tensors:   CPU_Mapped model buffer size = 21000.0 MiB",
+            ]
+        )
+        assert inst._classify_gpu_offload(False, []) is None
+
+    def test_user_did_not_intend_gpu_returns_none(self):
+        # Studio called start_llama_server without expecting GPU use;
+        # don't warn.
+        inst = self._backend(
+            [
+                "load_tensors:   CPU_Mapped model buffer size = 21000.0 MiB",
+            ]
+        )
+        assert inst._classify_gpu_offload(False, [(0, 22805)]) is None
+
+    def test_rocm_buffer_marker_returns_true(self):
+        inst = self._backend(
+            [
+                "load_tensors:        ROCm0 model buffer size = 21000.0 MiB",
+            ]
+        )
+        assert inst._classify_gpu_offload(True, [(0, 22805)]) is True
+
+    def test_metal_buffer_marker_returns_true(self):
+        inst = self._backend(
+            [
+                "load_tensors:       Metal model buffer size = 8000.0 MiB",
+            ]
+        )
+        assert inst._classify_gpu_offload(True, [(0, 22805)]) is True

From 2e9bb84811c8e74d4386cbda5974e8602326ad42 Mon Sep 17 00:00:00 2001
From: "pre-commit-ci[bot]"
 <66853113+pre-commit-ci[bot]@users.noreply.github.com>
Date: Thu, 7 May 2026 10:40:43 +0000
Subject: [PATCH 2/4] [pre-commit.ci] auto fixes from pre-commit.com hooks

for more information, see https://pre-commit.ci
---
 studio/backend/core/inference/llama_cpp.py         | 4 +---
 studio/backend/tests/test_llama_cpp_context_fit.py | 4 +---
 2 files changed, 2 insertions(+), 6 deletions(-)

diff --git a/studio/backend/core/inference/llama_cpp.py b/studio/backend/core/inference/llama_cpp.py
index 2080f68f5a..99b8afcdb6 100644
--- a/studio/backend/core/inference/llama_cpp.py
+++ b/studio/backend/core/inference/llama_cpp.py
@@ -2127,9 +2127,7 @@ def load_model(
                                     )
                                     total_mib = (model_size + kv) / (1024 * 1024)
                                     if total_mib <= pool_mib * pin_fraction:
-                                        gpu_indices = sorted(
-                                            idx for idx, _ in subset
-                                        )
+                                        gpu_indices = sorted(idx for idx, _ in subset)
                                         use_fit = False
                                         break
 
diff --git a/studio/backend/tests/test_llama_cpp_context_fit.py b/studio/backend/tests/test_llama_cpp_context_fit.py
index 93ccf77156..0395a92da3 100644
--- a/studio/backend/tests/test_llama_cpp_context_fit.py
+++ b/studio/backend/tests/test_llama_cpp_context_fit.py
@@ -218,9 +218,7 @@ def fake_estimate(n_ctx_, _type = None, **_kwargs):
                     for n_gpus in range(1, len(ranked) + 1):
                         subset = ranked[:n_gpus]
                         pool_mib = sum(free for _, free in subset)
-                        kv = inst._estimate_kv_cache_bytes(
-                            effective_ctx, cache_type_kv
-                        )
+                        kv = inst._estimate_kv_cache_bytes(effective_ctx, cache_type_kv)
                         total_mib = (model_size + kv) / (1024 * 1024)
                         if total_mib <= pool_mib * pin_fraction:
                             gpu_indices = sorted(idx for idx, _ in subset)

From c9d1ccc3d1b7b0aea0cc0e4bd82329a77fd20e42 Mon Sep 17 00:00:00 2001
From: Daniel Han <danielhanchen@gmail.com>
Date: Thu, 7 May 2026 11:28:38 +0000
Subject: [PATCH 3/4] Trim comments to be more succinct

---
 studio/backend/core/inference/llama_cpp.py    | 93 +++++--------------
 .../tests/test_llama_cpp_context_fit.py       | 33 ++-----
 2 files changed, 32 insertions(+), 94 deletions(-)

diff --git a/studio/backend/core/inference/llama_cpp.py b/studio/backend/core/inference/llama_cpp.py
index 99b8afcdb6..f039fbf642 100644
--- a/studio/backend/core/inference/llama_cpp.py
+++ b/studio/backend/core/inference/llama_cpp.py
@@ -433,9 +433,7 @@ def __init__(self):
         self._hf_variant: Optional[str] = None
         self._is_vision: bool = False
         self._healthy = False
-        # True/False once the post-load classifier sees evidence; None
-        # when there's no signal (no GPU detected, or llama-server's
-        # buffer-allocation log was absent / unparseable).
+        # Set by _classify_gpu_offload after _wait_for_health.
         self._gpu_offload_active: Optional[bool] = None
         self._context_length: Optional[int] = None
         self._effective_context_length: Optional[int] = None
@@ -955,18 +953,11 @@ def _get_gpu_free_memory() -> list[tuple[int, int]]:
             logger.debug(f"torch GPU probe failed: {e}")
             return []
 
-    # Fraction of free GPU VRAM Studio is willing to pin a model into
-    # before it falls back to ``--fit on`` (and lets the unsloth llama.cpp
-    # fork's fit logic decide how many layers to offload). 0.95 leaves
-    # ~5% of free VRAM for the CUDA context, compute buffers, flash-attn
-    # workspace, and other per-launch overhead. 0.90 is too conservative
-    # at this layer: when a model needs ~92-94% of free VRAM (issue
-    # #5106 / Discord "RAM not VRAM"), Studio used to flip ``use_fit=True``
-    # without ``-ngl``, the fork's fit logic ran with its default
-    # ``--fit-target 1024`` (1 GiB margin) and pushed substantial layer
-    # weight off the GPU even though the model would have loaded
-    # comfortably. The fork's own fit logic still serves as the safety
-    # net for the genuinely-too-large case.
+    # Free-VRAM fraction at which Studio pins the GPU directly instead
+    # of deferring to ``--fit on``. 5% headroom covers CUDA context +
+    # compute buffers; 0.90 was too conservative and dropped 91-94%
+    # fits to CPU offload (#5106). The fork's --fit on still catches
+    # the truly-too-large case.
     _GPU_PIN_VRAM_FRACTION = 0.95
 
     @staticmethod
@@ -1236,15 +1227,11 @@ def _fit_context_to_vram(
     ) -> int:
         """Return the largest context length that fits in GPU VRAM.
 
-        Uses 90% of available VRAM as the budget for the context-fitting
-        binary search (10% reserved for compute buffers, CUDA context,
-        scratch space, flash-attn workspace, etc.). The ctx-fit budget is
-        intentionally tighter than ``_GPU_PIN_VRAM_FRACTION``: this routine
-        chooses the slider/auto context value, where over-promising would
-        OOM at runtime; ``_select_gpus`` decides whether to pin a GPU at
-        all, where being conservative pushes layers to CPU instead.
-        If the model weights alone don't fit, returns ``requested_ctx``
-        unchanged and lets ``--fit on`` flex ``-ngl`` at runtime.
+        Uses 90% of available VRAM as the ctx-fit budget. Tighter than
+        ``_GPU_PIN_VRAM_FRACTION`` on purpose: over-promising context
+        OOMs at runtime, while pinning conservatively just defers to
+        --fit on. If the weights alone don't fit, returns
+        ``requested_ctx`` unchanged.
 
         ``kv_on_gpu`` mirrors ``--kv-offload`` (default on). When False
         the KV cache lives in CPU RAM and doesn't compete with weights
@@ -2074,14 +2061,9 @@ def load_model(
                         gpu_indices, use_fit = self._select_gpus(requested_total, gpus)
                         # No silent shrink: effective_ctx stays == n_ctx.
                     else:
-                        # Auto context: prefer fewer GPUs, cap context to fit.
-                        # Match _select_gpus's headroom threshold so a model
-                        # that fits at 91-95% of free VRAM still pins to GPU
-                        # rather than falling through to ``--fit on`` (issue
-                        # #5106). The ctx cap from ``_fit_context_to_vram``
-                        # uses a more conservative 90% budget so the slider
-                        # value we land on still leaves room for compute
-                        # buffers / CUDA context overhead.
+                        # Auto context: prefer fewer GPUs, cap context
+                        # to fit. Same headroom threshold as
+                        # _select_gpus (#5106).
                         ranked = sorted(gpus, key = lambda g: g[1], reverse = True)
                         pin_fraction = self._GPU_PIN_VRAM_FRACTION
                         for n_gpus in range(1, len(ranked) + 1):
@@ -2104,17 +2086,10 @@ def load_model(
                                 use_fit = False
                                 break
                         else:
-                            # No subset can host the weights at the native
-                            # context. Default the UI-visible context to
-                            # 4096 -- but before handing off to ``--fit on``,
-                            # re-check whether the model fits at the smaller
-                            # context (issue #5106). Without this re-check,
-                            # a 20.8 GiB model with a 131072 native context
-                            # on a 22.8 GiB GPU is correctly classified as
-                            # "doesn't fit at native ctx", but then we'd
-                            # ship ``--fit on`` without ``-ngl`` even though
-                            # the same model + a 4096 KV cache pins
-                            # comfortably on the GPU.
+                            # Native ctx doesn't fit. Drop to 4096 and
+                            # re-check before deferring to --fit on:
+                            # a model that overflows at 131k may pin
+                            # comfortably with a 4096 KV cache (#5106).
                             effective_ctx = min(4096, effective_ctx)
                             if effective_ctx > 0:
                                 for n_gpus in range(1, len(ranked) + 1):
@@ -2553,16 +2528,7 @@ def load_model(
 
             self._healthy = True
 
-            # Diagnose silent CPU fallback: if Studio detected GPUs but
-            # llama-server allocated only CPU model buffers (no CUDA0,
-            # ROCm0, Metal, etc. buffer line in its startup log), the
-            # prebuilt binary couldn't load its GPU backend at runtime.
-            # On Windows this is the unslothai/unsloth#5106 symptom --
-            # cudart64_X.dll / cublas64_X.dll missing because the user
-            # has no system CUDA toolkit and Studio used to ship without
-            # the cudart bundle. Fixed at install time (paired runtime
-            # archive) + warned here as a belt-and-suspenders for any
-            # other runtime-load failure.
+            # Catch silent CPU fallback when GPU was intended (#5106).
             self._gpu_offload_active = self._classify_gpu_offload(
                 gpu_indices is not None or use_fit, gpus or []
             )
@@ -2588,23 +2554,14 @@ def _classify_gpu_offload(
         expected_gpu: bool,
         detected_gpus: list[tuple[int, int]],
     ) -> Optional[bool]:
-        """Return True if llama-server allocated GPU model buffers, False
-        if it allocated only CPU buffers (silent CPU fallback), or None
-        when there is no signal to classify (no GPU detected, or the
-        startup log didn't include a model-buffer-size line).
-
-        ``expected_gpu`` mirrors Studio's intent: True when we either
-        pinned a GPU (-ngl) or asked the fork to fit on GPU. We only
-        warn when the user expected GPU AND we have probe evidence that
-        it didn't happen -- otherwise stay silent.
-        """
+        """True if a GPU model buffer was allocated, False if only CPU
+        buffers landed despite GPU intent, None when there's no signal
+        (no GPU detected, GPU not expected, or no buffer-size lines)."""
         if not detected_gpus or not expected_gpu:
             return None
-        # llama-server prints one ``... model buffer size = ... MiB``
-        # line per backend buffer the model lives in. Backend names
-        # include ``CUDA0`` / ``CUDA_Host`` for NVIDIA, ``ROCm0`` for
-        # AMD, ``Metal`` for Apple, ``Vulkan0`` for Vulkan. ``CPU``
-        # / ``CPU_Mapped`` / ``CPU_AARCH64`` indicate CPU allocations.
+        # llama-server logs one ``... model buffer size = N MiB`` line
+        # per backend buffer; CUDA0 / ROCm0 / Metal / Vulkan0 /
+        # OpenCL0 / SYCL0 are GPU, CPU / CPU_Mapped are not.
         gpu_markers = ("CUDA", "ROCm", "Metal", "Vulkan", "OpenCL", "SYCL")
         saw_buffer_line = False
         saw_gpu_buffer = False
diff --git a/studio/backend/tests/test_llama_cpp_context_fit.py b/studio/backend/tests/test_llama_cpp_context_fit.py
index 0395a92da3..1ea76edd15 100644
--- a/studio/backend/tests/test_llama_cpp_context_fit.py
+++ b/studio/backend/tests/test_llama_cpp_context_fit.py
@@ -212,8 +212,7 @@ def fake_estimate(n_ctx_, _type = None, **_kwargs):
                     break
             if not matched:
                 effective_ctx = min(FALLBACK_CTX, effective_ctx)
-                # Mirror llama_cpp.py: after dropping ctx to FALLBACK_CTX,
-                # re-check whether the model fits with the smaller KV cache.
+                # Mirror llama_cpp.py: re-check fit at FALLBACK_CTX.
                 if effective_ctx > 0:
                     for n_gpus in range(1, len(ranked) + 1):
                         subset = ranked[:n_gpus]
@@ -392,12 +391,7 @@ def test_no_kv_metadata_fittable_auto(self):
 
 
 # ---------------------------------------------------------------------------
-# Issue #5106 / Discord "RAM not VRAM" regression: a model whose weights
-# occupy ~92-94% of free VRAM must still pin to GPU. Before the threshold
-# bump from 0.90 to 0.95 this case fell through to ``--fit on`` without
-# ``-ngl``, and the unsloth llama.cpp fork's fit logic (default
-# ``--fit-target 1024``) ended up offloading layers to CPU even though the
-# model would have loaded comfortably with all layers on GPU.
+# #5106 regression: 91-95% utilization must still pin GPU.
 # ---------------------------------------------------------------------------
 
 
@@ -405,14 +399,8 @@ class TestTightFitPinsToGPU:
     """Models that fit at 91-95% of free VRAM must use the GPU."""
 
     def test_rtx_4090_qwen_24gb_class(self):
-        # noahterbest's reproducer in unslothai/unsloth#5106:
-        #   "GGUF size: 20.8 GB, est. KV cache: 0.1 GB, context: 4096,
-        #    GPUs free: [(0, 22805)], selected: None, fit: True"
-        # With ctx=4096, the model + KV occupies ~94% of free VRAM. The
-        # remaining ~1.4 GiB headroom is enough for the CUDA context and
-        # compute buffers on a 4090, so Studio should pin the GPU and
-        # offload all layers via ``-ngl -1`` instead of relying on the
-        # fork's fit logic.
+        # noahterbest's #5106 log: 20.8 GB model on 22805 MiB free
+        # GPU, ctx=4096 -> ~94% utilization, ~1.4 GiB headroom.
         plan = _drive(
             n_ctx = 0,
             model_gib = 20.8,
@@ -424,9 +412,7 @@ def test_rtx_4090_qwen_24gb_class(self):
         assert plan["gpu_indices"] == [0]
 
     def test_explicit_ctx_at_94_pct_pins_to_gpu(self):
-        # Same shape as above, but the user explicitly chose a context
-        # length. The explicit-ctx branch goes through ``_select_gpus``
-        # which must agree with the auto-ctx branch on the headroom rule.
+        # Explicit-ctx branch must agree with auto-ctx on headroom.
         plan = _drive(
             n_ctx = 4096,
             model_gib = 20.8,
@@ -438,9 +424,7 @@ def test_explicit_ctx_at_94_pct_pins_to_gpu(self):
         assert plan["gpu_indices"] == [0]
 
     def test_genuine_overflow_still_uses_fit(self):
-        # Above the 95% pin threshold the fork's ``--fit on`` is still
-        # the right answer; we don't want the threshold bump to mask a
-        # truly oversized model as "fits".
+        # Beyond 95% must still defer to --fit on.
         plan = _drive(
             n_ctx = 4096,
             model_gib = 23,
@@ -468,10 +452,7 @@ def test_identical_decision_across_platforms(platform_tag):
 
 
 # ---------------------------------------------------------------------------
-# _classify_gpu_offload: detect silent CPU fallback (issue #5106 / Discord
-# "RAM not VRAM"). When Studio detected GPUs and intended to use them, but
-# llama-server allocated only CPU model buffers, the prebuilt's GPU
-# backend failed to load -- usually missing cudart64_X.dll on Windows.
+# _classify_gpu_offload: detect silent CPU fallback (#5106).
 # ---------------------------------------------------------------------------
 
 

From e70c1cd34976698d26bea9fb2b3765a10972ecd7 Mon Sep 17 00:00:00 2001
From: Daniel Han <danielhanchen@gmail.com>
Date: Sun, 10 May 2026 12:43:04 +0000
Subject: [PATCH 4/4] Scrub .github/workflows for staging push (matches staging
 base)

---
 .github/workflows/studio-backend-ci.yml      | 200 -------------------
 .github/workflows/studio-frontend-ci.yml     | 108 ----------
 .github/workflows/studio-inference-smoke.yml | 185 -----------------
 .github/workflows/studio-tauri-smoke.yml     | 105 ----------
 .github/workflows/wheel-smoke.yml            | 124 ------------
 5 files changed, 722 deletions(-)
 delete mode 100644 .github/workflows/studio-backend-ci.yml
 delete mode 100644 .github/workflows/studio-frontend-ci.yml
 delete mode 100644 .github/workflows/studio-inference-smoke.yml
 delete mode 100644 .github/workflows/studio-tauri-smoke.yml
 delete mode 100644 .github/workflows/wheel-smoke.yml

diff --git a/.github/workflows/studio-backend-ci.yml b/.github/workflows/studio-backend-ci.yml
deleted file mode 100644
index 5a858888e7..0000000000
--- a/.github/workflows/studio-backend-ci.yml
+++ /dev/null
@@ -1,200 +0,0 @@
-# SPDX-License-Identifier: AGPL-3.0-only
-# Copyright 2026-present the Unsloth AI Inc. team. All rights reserved.
-
-# Runs the existing studio/backend/tests/ suite (~860 tests, all CPU-friendly)
-# on every PR that touches the backend or unsloth library. Until this lands,
-# none of those tests run automatically. Verified locally on Python 3.13 with
-# the surgical exclusions below: 861 pass, 4 skipped.
-#
-# Exclusions:
-#   - tests/test_studio_api.py: end-to-end against a live model + GGUF download,
-#     too heavy for free runners. Run separately when GPU CI is available.
-#   - -k 'not llama_cpp_load_progress_live': spawns a real llama.cpp process,
-#     not appropriate for CPU-only runners.
-#
-# ruff is non-blocking initially; remove `|| true` once the backend lints clean.
-
-name: Backend CI
-
-on:
-  pull_request:
-    paths:
-      - 'studio/**'
-      - 'unsloth/**'
-      - 'unsloth_cli/**'
-      - 'tests/**'
-      - 'pyproject.toml'
-      - '.github/workflows/studio-backend-ci.yml'
-  push:
-    branches: [main, pip]
-
-concurrency:
-  group: ${{ github.workflow }}-${{ github.ref }}
-  cancel-in-progress: true
-
-jobs:
-  pytest:
-    name: (Python ${{ matrix.python }})
-    runs-on: ubuntu-latest
-    timeout-minutes: 15
-    strategy:
-      fail-fast: false
-      matrix:
-        python: ['3.10', '3.11', '3.12', '3.13']
-    steps:
-      - uses: actions/checkout@v4
-
-      - uses: actions/setup-python@v5
-        with:
-          python-version: '${{ matrix.python }}'
-          cache: 'pip'
-
-      - name: Install backend test dependencies (CPU only)
-        run: |
-          python -m pip install --upgrade pip
-          # Studio's declared backend deps:
-          pip install -r studio/backend/requirements/studio.txt
-          # Extras that studio.txt does not list but the import chain needs
-          # (python-multipart for FastAPI form/file uploads, sqlalchemy/cryptography
-          #  for the auth DB, yaml/jinja2 for utils.models.model_config, etc.):
-          pip install \
-            python-multipart aiofiles sqlalchemy cryptography \
-            pyyaml jinja2 mammoth unpdf requests \
-            'numpy<3' pytest pytest-asyncio httpx
-          # Torch CPU + transformers are required by a chunk of the backend test
-          # suite (gpu_selection, kv_cache_estimation, utils). CPU-only torch
-          # keeps the install ~250 MB / ~1 min on a clean runner.
-          pip install --index-url https://download.pytorch.org/whl/cpu 'torch>=2.4,<2.11'
-          pip install 'transformers>=4.51,<5.5'
-
-      - name: Backend tests
-        working-directory: studio/backend
-        # Locally validated against this dep set: 831 passed, 5 skipped, 35 deselected.
-        # Deselections (all environment-specific, would never pass on a GPU-less
-        # `ubuntu-latest` runner regardless of code correctness):
-        #   - llama_cpp_load_progress_live: spawns a real llama.cpp process
-        #   - TestGpuAutoSelection / TestPreSpawnGpuResolution / TestPerGpuFitGuardAllCounts:
-        #       require live transformers config introspection on real GPUs
-        #   - TestTransformersIntrospection: same
-        #   - test_returns_cuda_when_cuda_available / test_calls_cuda_cache_when_cuda:
-        #       assume CUDA-capable GPU
-        run: |
-          python -m pytest tests/ -q --tb=short \
-            --ignore=tests/test_studio_api.py \
-            -k 'not llama_cpp_load_progress_live and not TestGpuAutoSelection and not TestPreSpawnGpuResolution and not TestPerGpuFitGuardAllCounts and not TestTransformersIntrospection and not test_returns_cuda_when_cuda_available and not test_calls_cuda_cache_when_cuda'
-
-  repo-cpu-tests:
-    # Auto-discover everything under tests/ that is not GPU-bound by
-    # design. New tests added in covered directories are picked up
-    # without a workflow edit. Locally validated: 779 passed, 11
-    # skipped, 23 deselected. tests/conftest.py (mirroring unsloth-zoo
-    # PR #624) pre-loads unsloth_zoo.device_type and unsloth.device_type
-    # under a mocked torch.cuda.is_available so the unsloth import
-    # chain succeeds on CPU.
-    name: Repo tests (CPU)
-    runs-on: ubuntu-latest
-    timeout-minutes: 10
-    steps:
-      - uses: actions/checkout@v4
-
-      - uses: actions/setup-python@v5
-        with:
-          python-version: '3.12'
-          cache: 'pip'
-
-      - name: Install deps (shared shape with backend pytest job)
-        run: |
-          python -m pip install --upgrade pip
-          pip install -r studio/backend/requirements/studio.txt
-          pip install \
-            python-multipart aiofiles sqlalchemy cryptography \
-            pyyaml jinja2 mammoth unpdf requests typer \
-            'numpy<3' pytest pytest-asyncio httpx
-          # torchvision is needed because unsloth_zoo.vision_utils imports
-          # it at module scope and is reached via unsloth.models._utils.
-          pip install --index-url https://download.pytorch.org/whl/cpu \
-            'torch>=2.4,<2.11' 'torchvision<0.26'
-          pip install 'transformers>=4.51,<5.5'
-          # bitsandbytes is a hard import in unsloth/models/_utils.py.
-          # Recent versions ship a CPU build so it installs on a free
-          # Linux runner; the kernels still raise on use, but import
-          # succeeds and the package collects.
-          pip install 'bitsandbytes>=0.45'
-          # unsloth.device_type imports unsloth_zoo.utils.Version at module
-          # scope, so the conftest harness needs unsloth_zoo on the path
-          # even though it is an optional dep of unsloth.
-          pip install 'unsloth_zoo>=2026.5.1'
-          pip install -e . --no-deps
-
-      - name: Repo tests (CPU, auto-discovered)
-        env:
-          # tests/python/* import install_python_stack from studio/.
-          PYTHONPATH: ${{ github.workspace }}/studio
-          # Skip lazy compilation work the unsloth import chain wants to
-          # do at import time on a real GPU.
-          UNSLOTH_COMPILE_DISABLE: '1'
-        # --ignore: GPU-bound directories (qlora and saving need real
-        #   weights / GPU; tests/sh is a shell suite the next step
-        #   handles; tests/utils is a helpers folder, not tests).
-        # State-sensitive hardware-spoofing files are pulled out and run
-        # in isolation in the next step because they mutate
-        # hardware.py module globals (IS_ROCM / DEVICE) and pollute
-        # downstream tests.
-        # -m: honour markers already declared in tests/python/conftest.py
-        #   (`server` = needs studio venv, `e2e` = needs network).
-        # --deselect: two registry tests that hit huggingface_hub for
-        #   live model existence checks; they belong on a network job.
-        run: |
-          python -m pytest tests/ -q --tb=short \
-            --ignore=tests/qlora \
-            --ignore=tests/saving \
-            --ignore=tests/utils \
-            --ignore=tests/sh \
-            --ignore=tests/studio/test_hardware_dispatch_matrix.py \
-            --ignore=tests/studio/test_is_mlx_dispatch_gate.py \
-            -m 'not server and not e2e' \
-            --deselect tests/test_model_registry.py::test_model_registration \
-            --deselect tests/test_model_registry.py::test_all_model_registration
-
-      - name: Hardware-spoof tests (state-sensitive, run in isolation)
-        env:
-          PYTHONPATH: ${{ github.workspace }}/studio
-          UNSLOTH_COMPILE_DISABLE: '1'
-        # These two files mutate hardware.py module globals at runtime
-        # via the spoof fixtures, which leaks state into any other test
-        # that imports hardware. Run them in their own pytest invocation
-        # so the leak does not cross file boundaries.
-        run: |
-          python -m pytest -q --tb=short \
-            tests/studio/test_hardware_dispatch_matrix.py \
-            tests/studio/test_is_mlx_dispatch_gate.py
-
-      - name: Shell installer tests
-        # Subset that does not depend on a writable / pristine install.sh
-        # tree; test_install_host_defaults.sh checks install.ps1 layout
-        # which has drifted (separate followup).
-        run: |
-          set -e
-          for s in \
-              tests/sh/test_get_torch_index_url.sh \
-              tests/sh/test_mac_intel_compat.sh \
-              tests/sh/test_tauri_install_exit_order.sh \
-              tests/sh/test_torch_constraint.sh; do
-              echo "::group::$s"
-              bash "$s"
-              echo "::endgroup::"
-          done
-
-  ruff:
-    name: Backend ruff lint (non-blocking)
-    runs-on: ubuntu-latest
-    timeout-minutes: 5
-    steps:
-      - uses: actions/checkout@v4
-      - uses: actions/setup-python@v5
-        with:
-          python-version: '3.12'
-          cache: 'pip'
-      - run: pip install ruff
-      - name: ruff check (non-blocking until accumulated drift is cleared)
-        run: ruff check studio/backend || true
diff --git a/.github/workflows/studio-frontend-ci.yml b/.github/workflows/studio-frontend-ci.yml
deleted file mode 100644
index 039bd5dd08..0000000000
--- a/.github/workflows/studio-frontend-ci.yml
+++ /dev/null
@@ -1,108 +0,0 @@
-# SPDX-License-Identifier: AGPL-3.0-only
-# Copyright 2026-present the Unsloth AI Inc. team. All rights reserved.
-
-# Frontend PR gate: lockfile freshness, typecheck, build, and a bundle grep
-# that catches the 2026.5.1 chat-history regression at the JS level.
-#
-# biome runs as non-blocking for now: the codebase currently has accumulated
-# ~470 errors and ~1650 warnings against the existing biome config. Surfacing
-# the count in CI lets us drive it down without forcing a fleet-wide cleanup
-# in the same PR. Drop `continue-on-error` once that number is zero.
-
-name: Frontend CI
-
-on:
-  pull_request:
-    paths:
-      - 'studio/frontend/**'
-      - '.github/workflows/studio-frontend-ci.yml'
-  push:
-    branches: [main, pip]
-
-concurrency:
-  group: ${{ github.workflow }}-${{ github.ref }}
-  cancel-in-progress: true
-
-jobs:
-  build:
-    name: Frontend build + bundle sanity
-    runs-on: ubuntu-latest
-    timeout-minutes: 10
-    defaults:
-      run:
-        working-directory: studio/frontend
-    steps:
-      - uses: actions/checkout@v4
-
-      # FIXME: drop this step once @assistant-ui/* and assistant-stream
-      # leave 0.x -- on 1.x, caret ranges are conventional. Until then,
-      # every 0.minor on this surface is a SemVer-major (this is exactly
-      # how 2026.5.1 shipped a broken chat runtime: ^0.12.19 quietly
-      # resolved to 0.12.28).
-      - name: '@assistant-ui must be pinned exactly (no caret/tilde)'
-        working-directory: ${{ github.workspace }}
-        run: |
-          set -e
-          if grep -nE '"(@assistant-ui/[a-z-]+|assistant-stream)":[[:space:]]*"[\^~]' studio/frontend/package.json; then
-            echo "::error file=studio/frontend/package.json::These packages must be pinned to exact versions until they leave 0.x. Drop the leading ^ or ~."
-            exit 1
-          fi
-          echo "All assistant-ui packages are pinned exactly."
-
-      - uses: actions/setup-node@v4
-        with:
-          node-version: '22'
-          cache: 'npm'
-          cache-dependency-path: studio/frontend/package-lock.json
-
-      - name: Lockfile must agree with package.json (npm ci is strict)
-        run: npm ci --no-fund --no-audit
-
-      - name: npm ci must not have modified the working tree
-        working-directory: ${{ github.workspace }}
-        run: |
-          if ! git diff --quiet -- studio/frontend; then
-            echo "::error::npm ci modified files; commit the updated lockfile"
-            git status -- studio/frontend
-            exit 1
-          fi
-
-      - name: Typecheck
-        run: npm run typecheck
-
-      - name: Build
-        run: npm run build
-
-      - name: Built bundle must not contain Studio's unstable_Provider call site
-        run: |
-          set -e
-          JS=$(ls dist/assets/index-*.js | head -1)
-          HITS=$(grep -c 'unstable_Provider:' "$JS" || echo 0)
-          echo "main bundle: $JS"
-          echo "unstable_Provider: hits=$HITS (assistant-ui internals contribute up to 3)"
-          if [ "$HITS" -gt 3 ]; then
-            echo "::error file=studio/frontend/src/features/chat/runtime-provider.tsx::Studio bundle still passes unstable_Provider through useRemoteThreadListRuntime; this is the 2026.5.1 chat-history regression. Pass adapters directly into useLocalRuntime instead."
-            exit 1
-          fi
-
-      - name: Bundle size budget (75 MB)
-        run: |
-          SIZE=$(du -sb dist | cut -f1)
-          BUDGET=$((75 * 1024 * 1024))
-          echo "dist size: $SIZE bytes ($((SIZE/1024/1024)) MB), budget: $BUDGET bytes (75 MB)"
-          if [ "$SIZE" -gt "$BUDGET" ]; then
-            echo "::error::studio/frontend/dist/ exceeded the 75 MB budget. Drop dead deps (e.g. the unused next dep) or split chunks."
-            exit 1
-          fi
-
-      - name: Biome (non-blocking until accumulated drift is cleared)
-        continue-on-error: true
-        run: npm run biome:check
-
-      - name: Upload built dist on failure
-        if: failure()
-        uses: actions/upload-artifact@v4
-        with:
-          name: studio-frontend-dist
-          path: studio/frontend/dist
-          retention-days: 3
diff --git a/.github/workflows/studio-inference-smoke.yml b/.github/workflows/studio-inference-smoke.yml
deleted file mode 100644
index 8efe072d28..0000000000
--- a/.github/workflows/studio-inference-smoke.yml
+++ /dev/null
@@ -1,185 +0,0 @@
-# SPDX-License-Identifier: AGPL-3.0-only
-# Copyright 2026-present the Unsloth AI Inc. team. All rights reserved.
-
-# End-to-end smoke: install Studio via install.sh --local --no-torch, download
-# a tiny GGUF, boot Studio, log in, change password, load the model, send a
-# chat completion, assert a non-empty response. Only workflow that tests "the
-# app actually works".
-#
-# Model: Qwen3.5-2B UD-IQ3_XXS (~890 MiB) -- small enough that the cache miss
-# is cheap and inference fits in the 25 min CPU-runner budget. GGUF is cached
-# across runs via actions/cache.
-
-name: Studio GGUF CI
-
-on:
-  pull_request:
-    paths:
-      - 'studio/**'
-      - 'unsloth/**'
-      - 'unsloth_cli/**'
-      - 'install.sh'
-      - 'pyproject.toml'
-      - '.github/workflows/studio-inference-smoke.yml'
-  push:
-    branches: [main, pip]
-  # Manual trigger for pre-warming the GGUF cache on main, or re-running
-  # against an arbitrary branch without pushing a no-op commit.
-  workflow_dispatch:
-
-concurrency:
-  group: ${{ github.workflow }}-${{ github.ref }}
-  cancel-in-progress: true
-
-env:
-  GGUF_REPO: unsloth/Qwen3.5-2B-GGUF
-  GGUF_FILE: Qwen3.5-2B-UD-IQ3_XXS.gguf
-  STUDIO_PORT: '18888'
-
-jobs:
-  inference:
-    name: Studio boots, loads a GGUF, answers a chat completion
-    runs-on: ubuntu-latest
-    timeout-minutes: 25
-    steps:
-      - uses: actions/checkout@v4
-
-      - name: Linux dependencies for llama.cpp prebuilt
-        run: |
-          sudo apt-get update
-          sudo apt-get install -y --no-install-recommends \
-            libcurl4-openssl-dev libssl-dev jq
-
-      - uses: actions/setup-node@v4
-        with:
-          node-version: '22'
-          cache: 'npm'
-          cache-dependency-path: studio/frontend/package-lock.json
-
-      - uses: actions/setup-python@v5
-        with:
-          python-version: '3.12'
-          cache: 'pip'
-
-      - name: Cache GGUF model file
-        id: cache-gguf
-        uses: actions/cache@v4
-        with:
-          path: gguf-cache
-          key: ${{ runner.os }}-gguf-${{ env.GGUF_REPO }}-${{ env.GGUF_FILE }}-v1
-
-      - name: Download GGUF if cache miss
-        if: steps.cache-gguf.outputs.cache-hit != 'true'
-        run: |
-          # huggingface-cli was deprecated in huggingface_hub 1.13; the new CLI is `hf`.
-          python -m pip install --upgrade huggingface_hub hf_transfer
-          mkdir -p gguf-cache
-          HF_HUB_ENABLE_HF_TRANSFER=1 \
-            hf download "$GGUF_REPO" "$GGUF_FILE" --local-dir gguf-cache
-
-      - name: Install Studio (--local, --no-torch keeps the install lean)
-        run: |
-          mkdir -p logs
-          set -o pipefail
-          bash install.sh --local --no-torch 2>&1 | tee logs/install.log
-
-      - name: Assert llama.cpp prebuilt was installed (no source-build fallback)
-        # ubuntu-latest is CPU-only x86_64, so studio/setup.sh should route
-        # to ggml-org/llama.cpp and grab bin-ubuntu-x64.tar.gz. A source
-        # build here means the routing regressed.
-        run: |
-          if grep -q "falling back to source build" logs/install.log; then
-            echo "::error::llama.cpp prebuilt path failed on ubuntu-latest. studio/setup.sh routing regressed; CPU-only Linux x86_64 should hit ggml-org/llama.cpp's bin-ubuntu-x64.tar.gz."
-            grep -E "llama-prebuilt|llama.cpp" logs/install.log | tail -60
-            exit 1
-          fi
-          if ! grep -qE "prebuilt installed and validated|prebuilt up to date and validated" logs/install.log; then
-            echo "::error::install.log does not contain the success marker for the llama.cpp prebuilt path. Did setup.sh skip the prebuilt install?"
-            grep -E "llama-prebuilt|llama.cpp" logs/install.log | tail -60
-            exit 1
-          fi
-          echo "llama.cpp prebuilt path used successfully"
-
-      - name: Reset auth + start Studio in the background
-        run: |
-          unsloth studio reset-password
-          mkdir -p logs
-          UNSLOTH_API_ONLY=1 unsloth studio -H 127.0.0.1 -p "$STUDIO_PORT" \
-            > logs/studio.log 2>&1 &
-          echo "STUDIO_PID=$!" >> "$GITHUB_ENV"
-
-      - name: Wait for /api/health
-        run: |
-          for i in $(seq 1 60); do
-            if curl -fs "http://127.0.0.1:${STUDIO_PORT}/api/health" > /tmp/health.json; then
-              echo "ready after ${i}s"
-              cat /tmp/health.json
-              jq -e '.status == "healthy"' /tmp/health.json
-              exit 0
-            fi
-            sleep 1
-          done
-          echo "Studio did not become healthy in 60s"
-          tail -200 logs/studio.log
-          exit 1
-
-      - name: Login + change bootstrap password
-        run: |
-          PW=$(cat ~/.unsloth/studio/auth/.bootstrap_password)
-          NEW="CIPasswordSmoke12345!"
-          TOKEN=$(curl -fs -X POST "http://127.0.0.1:${STUDIO_PORT}/api/auth/login" \
-            -H 'content-type: application/json' \
-            -d "{\"username\":\"unsloth\",\"password\":\"$PW\"}" | jq -r .access_token)
-          curl -fs -X POST "http://127.0.0.1:${STUDIO_PORT}/api/auth/change-password" \
-            -H "Authorization: Bearer $TOKEN" -H 'content-type: application/json' \
-            -d "{\"current_password\":\"$PW\",\"new_password\":\"$NEW\"}" > /dev/null
-          # Re-login to clear must_change_password flag.
-          NEW_TOKEN=$(curl -fs -X POST "http://127.0.0.1:${STUDIO_PORT}/api/auth/login" \
-            -H 'content-type: application/json' \
-            -d "{\"username\":\"unsloth\",\"password\":\"$NEW\"}" | jq -r .access_token)
-          echo "TOKEN=$NEW_TOKEN" >> "$GITHUB_ENV"
-
-      - name: Load the GGUF into Studio
-        run: |
-          GGUF_PATH="$GITHUB_WORKSPACE/gguf-cache/${GGUF_FILE}"
-          ls -lh "$GGUF_PATH"
-          curl -fs -X POST "http://127.0.0.1:${STUDIO_PORT}/api/inference/load" \
-            -H "Authorization: Bearer $TOKEN" -H 'content-type: application/json' \
-            --max-time 600 \
-            -d "{\"model_path\":\"$GGUF_PATH\",\"is_lora\":false,\"max_seq_length\":2048}" \
-            | jq '{status, display_name, is_gguf, context_length}'
-
-      - name: Send a chat completion + assert non-empty response
-        run: |
-          RESP=$(curl -fs -X POST "http://127.0.0.1:${STUDIO_PORT}/api/inference/chat/completions" \
-            -H "Authorization: Bearer $TOKEN" -H 'content-type: application/json' \
-            --max-time 900 \
-            -d '{
-              "messages":[{"role":"user","content":"Say hello in one short sentence."}],
-              "max_tokens":40,
-              "stream":false
-            }')
-          echo "raw response: $RESP"
-          CONTENT=$(echo "$RESP" | jq -r '.choices[0].message.content // empty')
-          echo "model response: $CONTENT"
-          if [ -z "$CONTENT" ]; then
-            echo "::error::Empty assistant response from Studio"
-            exit 1
-          fi
-
-      - name: Stop Studio
-        if: always()
-        run: |
-          kill "${STUDIO_PID}" || true
-          sleep 2
-          ss -tln | grep ":${STUDIO_PORT}" || true
-
-      - name: Upload Studio + install logs on failure
-        if: failure()
-        uses: actions/upload-artifact@v4
-        with:
-          name: studio-inference-log
-          path: |
-            logs/studio.log
-            logs/install.log
-          retention-days: 7
diff --git a/.github/workflows/studio-tauri-smoke.yml b/.github/workflows/studio-tauri-smoke.yml
deleted file mode 100644
index fcc9c8d963..0000000000
--- a/.github/workflows/studio-tauri-smoke.yml
+++ /dev/null
@@ -1,105 +0,0 @@
-# SPDX-License-Identifier: AGPL-3.0-only
-# Copyright 2026-present the Unsloth AI Inc. team. All rights reserved.
-
-# PR-time smoke for the Tauri desktop wrapper. Builds the frontend and the
-# Tauri Linux debug binary, with no codesigning. Catches:
-#   - tauri.conf.json drift
-#   - src-tauri Cargo.toml or rust source breakage
-#   - Tauri CLI version drift (we pin 2.10.1, matching release-desktop.yml)
-#   - frontend output not picked up by Tauri's distDir
-#
-# Linux-only on a free `ubuntu-latest` runner. Mac and Windows desktop builds
-# stay in release-desktop.yml (manual `workflow_dispatch`) because they need
-# code-signing secrets and ~30 min of runner time each.
-
-name: Studio Tauri CI
-
-on:
-  pull_request:
-    paths:
-      - 'studio/frontend/**'
-      - 'studio/src-tauri/**'
-      - '.github/workflows/studio-tauri-smoke.yml'
-  push:
-    branches: [main, pip]
-
-concurrency:
-  group: ${{ github.workflow }}-${{ github.ref }}
-  cancel-in-progress: true
-
-jobs:
-  linux-debug-build:
-    name: Tauri Linux debug build (no codesign)
-    runs-on: ubuntu-22.04
-    timeout-minutes: 25
-    steps:
-      - uses: actions/checkout@v4
-
-      - name: Linux native deps for Tauri / WebKit2GTK
-        run: |
-          sudo apt-get update
-          sudo apt-get install -y \
-            libwebkit2gtk-4.1-dev libayatana-appindicator3-dev \
-            librsvg2-dev libxdo-dev libssl-dev patchelf
-
-      - uses: actions/setup-node@v4
-        with:
-          node-version: '24'
-          cache: 'npm'
-          cache-dependency-path: studio/frontend/package-lock.json
-
-      - uses: dtolnay/rust-toolchain@stable
-
-      - uses: swatinem/rust-cache@v2
-        with:
-          workspaces: studio/src-tauri -> target
-
-      - name: Install pinned Tauri CLI (matches release-desktop.yml)
-        run: npm install --save-dev --prefix studio @tauri-apps/cli@2.10.1
-
-      - name: Verify pinned Tauri CLI version
-        run: |
-          out="$(npx --prefix studio tauri --version)"
-          echo "$out"
-          [ "$out" = "tauri-cli 2.10.1" ] || { echo "::error::expected tauri-cli 2.10.1, got $out"; exit 1; }
-
-      - name: Frontend build (npm ci, vite)
-        working-directory: studio/frontend
-        run: |
-          npm ci --no-fund --no-audit
-          npm run build
-          test -f dist/index.html
-
-      - name: Tauri debug build (Linux, no bundle, no codesign)
-        # `--debug` + `--no-bundle` keeps this lean: compiles the Rust crate,
-        # confirms the frontend dist is wired into Tauri, but skips the AppImage
-        # / .deb production. Code signing is irrelevant because we never produce
-        # a distributable artifact.
-        env:
-          TAURI_SIGNING_PRIVATE_KEY: ''
-          TAURI_SIGNING_PRIVATE_KEY_PASSWORD: ''
-        run: npx --prefix studio tauri build --debug --no-bundle
-
-      - name: Inspect produced binary
-        run: |
-          BIN=$(find studio/src-tauri/target/debug -maxdepth 1 -type f -executable 2>/dev/null \
-                | grep -Ev '\.(d|so|dylib|dll)$' \
-                | grep -Ev '/(deps|build|examples)$' \
-                | head -1)
-          echo "binary: $BIN"
-          if [ -z "$BIN" ]; then
-            echo "::error::Tauri debug binary not produced"
-            ls -la studio/src-tauri/target/debug/ || true
-            exit 1
-          fi
-          file "$BIN"
-          du -h "$BIN"
-
-      - uses: actions/upload-artifact@v4
-        if: failure()
-        with:
-          name: tauri-debug-build
-          path: |
-            studio/src-tauri/target/debug
-            studio/frontend/dist
-          retention-days: 3
diff --git a/.github/workflows/wheel-smoke.yml b/.github/workflows/wheel-smoke.yml
deleted file mode 100644
index 080a6bb261..0000000000
--- a/.github/workflows/wheel-smoke.yml
+++ /dev/null
@@ -1,124 +0,0 @@
-# SPDX-License-Identifier: AGPL-3.0-only
-# Copyright 2026-present the Unsloth AI Inc. team. All rights reserved.
-
-# Builds the PyPI wheel from the PR branch, then verifies the built wheel
-# actually contains what we expect to ship and does NOT contain the broken
-# Studio bundle that 2026.5.1 published. This is the single workflow that
-# would have blocked the 2026.5.1 release before twine upload.
-#
-# Verified locally end-to-end against this branch:
-#   - python -m build produces unsloth-<version>-py3-none-any.whl in 13s
-#   - wheel content sanity passes:
-#       lockfile shipped, frontend dist shipped,
-#       no node_modules in wheel, no bun.lock in wheel,
-#       main bundle has unstable_Provider hits=1 (assistant-ui internals only).
-#   - Studio backend imports cleanly from the installed wheel with the
-#     lightweight dep set below.
-
-name: Wheel CI
-
-on:
-  pull_request:
-    paths:
-      - 'pyproject.toml'
-      - 'studio/**'
-      - 'unsloth/**'
-      - 'unsloth_cli/**'
-      - '.github/workflows/wheel-smoke.yml'
-  push:
-    branches: [main, pip]
-
-concurrency:
-  group: ${{ github.workflow }}-${{ github.ref }}
-  cancel-in-progress: true
-
-jobs:
-  wheel:
-    name: Wheel build + content sanity + import smoke
-    runs-on: ubuntu-latest
-    timeout-minutes: 15
-    steps:
-      - uses: actions/checkout@v4
-
-      - uses: actions/setup-node@v4
-        with:
-          node-version: '22'
-          cache: 'npm'
-          cache-dependency-path: studio/frontend/package-lock.json
-
-      - uses: actions/setup-python@v5
-        with:
-          python-version: '3.12'
-
-      - name: Build frontend
-        run: |
-          cd studio/frontend
-          npm ci --no-fund --no-audit
-          npm run build
-
-      - name: Build wheel + sdist
-        run: |
-          python -m pip install --upgrade pip build
-          rm -rf dist build ./*.egg-info
-          python -m build
-
-      - name: Wheel content sanity
-        run: |
-          python - <<'PY'
-          import zipfile, glob, sys
-          w = glob.glob("dist/unsloth-*.whl")
-          if not w:
-              print("FAIL: no wheel produced"); sys.exit(2)
-          w = w[0]
-          print(f"wheel: {w}")
-          with zipfile.ZipFile(w) as z:
-              n = z.namelist()
-              checks = {
-                "lockfile shipped":      any(s.endswith("studio/frontend/package-lock.json") for s in n),
-                "frontend dist shipped": any(s.endswith("studio/frontend/dist/index.html")    for s in n),
-                "no node_modules":       not any("studio/frontend/node_modules/" in s for s in n),
-                "no bun.lock":           not any(s.endswith("studio/frontend/bun.lock")       for s in n),
-              }
-              js = [s for s in n
-                    if "studio/frontend/dist/assets/" in s
-                    and s.endswith(".js")
-                    and "/index-" in s]
-              if not js:
-                  print("FAIL: no main bundle index-*.js in wheel"); sys.exit(2)
-              data = z.read(js[0]).decode("utf-8", "replace")
-              hits = data.count("unstable_Provider:")
-              print(f"main bundle: {js[0]}")
-              print(f"unstable_Provider hits: {hits} (>=4 indicates 2026.5.1 regression)")
-              checks["bundle has no Studio unstable_Provider call site"] = (hits < 4)
-
-              print()
-              for k, v in checks.items():
-                  print(f"  [{'PASS' if v else 'FAIL'}] {k}")
-              sys.exit(0 if all(checks.values()) else 1)
-          PY
-
-      - name: Studio backend import smoke
-        # Imports `studio.backend.main:app` from the freshly-installed wheel in
-        # a clean venv. This catches the class of bug that 2026.5.1 shipped with:
-        # frontend dist missing, package-lock.json missing, or the wheel's Python
-        # source tree broken in a way that surfaces only at app construction time.
-        run: |
-          python -m venv /tmp/v
-          /tmp/v/bin/pip install --upgrade pip
-          /tmp/v/bin/pip install -r studio/backend/requirements/studio.txt
-          /tmp/v/bin/pip install \
-            python-multipart aiofiles sqlalchemy cryptography \
-            pyyaml jinja2 mammoth unpdf requests \
-            'numpy<3'
-          /tmp/v/bin/pip install --no-deps dist/unsloth-*.whl
-          # Run from /tmp so Python imports the installed package, not the source tree.
-          cd /tmp
-          /tmp/v/bin/python -c "from studio.backend.main import app; print('Studio backend OK:', app.title)"
-
-      - name: Upload wheel on failure
-        if: failure()
-        uses: actions/upload-artifact@v4
-        with:
-          name: unsloth-wheel
-          path: dist/
-          retention-days: 7