From fbd28015c203984c78a2ba907da3ef1e5570a331 Mon Sep 17 00:00:00 2001 From: Daniel Han Date: Thu, 7 May 2026 10:39:44 +0000 Subject: [PATCH 1/4] Studio: pin GPU at 95% headroom and warn on silent CPU fallback Two related runtime-side fixes for unslothai/unsloth#5106 ("model loaded fully on RAM instead of VRAM"): 1. GPU pin threshold bump 0.90 -> 0.95 ------------------------------------- ``_select_gpus`` and the auto-ctx pin loop in ``start_llama_server`` used a ``pool * 0.90`` threshold to decide whether the model fits on GPU. Models that needed 91-94% of free VRAM were classified as "does not fit", so Studio set ``gpu_indices = None`` and shipped ``--fit on`` to llama-server without ``-ngl``. The unsloth llama.cpp fork's ``--fit on`` then ran with its default ``--fit-target 1024`` (1 GiB margin per device, an upstream default inherited from ggml-org#18679). On a tight fit where compute buffers + CUDA context push the projected free below the 1 GiB target, the fork's fit logic shaves layer weights off the GPU -- slow inference for users whose models would have loaded comfortably with ``-ngl -1``. The classic reproducer from #5106 (noahterbest's log): GGUF size: 20.8 GB, est. KV cache: 0.1 GB, context: 4096, GPUs free: [(0, 22805)], selected: None, fit: True 20.8 GiB on a 22.27 GiB free RTX 4090 is 94% utilization. The model fits (1.4 GiB headroom), but the 0.90 threshold kicks it to fit mode. Bumping to 0.95 keeps these in the fits-on-GPU branch and emits ``-ngl -1`` directly. The fork's ``--fit on`` still serves as the safety net for the genuinely-too-large case. The auto-ctx fallback also re-checks fit at 4096 before handing off to ``--fit on``: a 20.8 GiB model with a 131072 native context fails the auto loop at native ctx, falls back to ``min(4096, ctx)``, but its weights + 4096 KV pin to the GPU comfortably. Without the re-check we still emitted ``--fit on``. 
``_fit_context_to_vram``'s 0.90 budget for context binary search is intentionally left tighter than the pin fraction. That routine chooses the slider value, where over-promising would OOM at runtime. ``_select_gpus`` decides whether to pin at all, where being conservative pushes layers to CPU. 2. Belt-and-suspenders: warn on silent CPU fallback --------------------------------------------------- After ``_wait_for_health`` succeeds, scan llama-server's stdout for ``model buffer size`` lines. If Studio detected GPUs and intended GPU use but only CPU buffers were allocated, log a structured warning citing #5106. Markers cover CUDA / ROCm / Metal / Vulkan / OpenCL / SYCL backends. New ``_gpu_offload_active: Optional[bool]`` field surfaces the result for any future API consumer. This catches runtime-load failures the install-time fix cannot cover (cudart bundle pairing PR #5322 is the install-side companion): user overriding ``--fit-target``, uncommon driver + toolkit configurations, future regressions in the install path. Tests: 10 new cases in studio/backend/tests/test_llama_cpp_context_fit.py: * TestTightFitPinsToGPU x3: noahterbest's exact reproducer (auto and explicit ctx pins to GPU at 94%); guard against threshold over-broadening (genuine overflow still falls back to ``--fit on``). * TestClassifyGpuOffload x7: CUDA / ROCm / Metal buffer markers return True; CPU-only buffer lines return False; absent buffer lines or no GPUs detected return None (no warning). 25 context-fit tests pass (15 baseline + 10 new). 511 tests total across the affected test files. No regressions. 
Refs #5106 --- studio/backend/core/inference/llama_cpp.py | 147 ++++++++++++++-- .../tests/test_llama_cpp_context_fit.py | 159 +++++++++++++++++- 2 files changed, 289 insertions(+), 17 deletions(-) diff --git a/studio/backend/core/inference/llama_cpp.py b/studio/backend/core/inference/llama_cpp.py index 8da836de38..2080f68f5a 100644 --- a/studio/backend/core/inference/llama_cpp.py +++ b/studio/backend/core/inference/llama_cpp.py @@ -433,6 +433,10 @@ def __init__(self): self._hf_variant: Optional[str] = None self._is_vision: bool = False self._healthy = False + # True/False once the post-load classifier sees evidence; None + # when there's no signal (no GPU detected, or llama-server's + # buffer-allocation log was absent / unparseable). + self._gpu_offload_active: Optional[bool] = None self._context_length: Optional[int] = None self._effective_context_length: Optional[int] = None self._max_context_length: Optional[int] = None @@ -951,6 +955,20 @@ def _get_gpu_free_memory() -> list[tuple[int, int]]: logger.debug(f"torch GPU probe failed: {e}") return [] + # Fraction of free GPU VRAM Studio is willing to pin a model into + # before it falls back to ``--fit on`` (and lets the unsloth llama.cpp + # fork's fit logic decide how many layers to offload). 0.95 leaves + # ~5% of free VRAM for the CUDA context, compute buffers, flash-attn + # workspace, and other per-launch overhead. 0.90 is too conservative + # at this layer: when a model needs ~92-94% of free VRAM (issue + # #5106 / Discord "RAM not VRAM"), Studio used to flip ``use_fit=True`` + # without ``-ngl``, the fork's fit logic ran with its default + # ``--fit-target 1024`` (1 GiB margin) and pushed substantial layer + # weight off the GPU even though the model would have loaded + # comfortably. The fork's own fit logic still serves as the safety + # net for the genuinely-too-large case. 
+ _GPU_PIN_VRAM_FRACTION = 0.95 + @staticmethod def _select_gpus( model_size_bytes: int, @@ -959,11 +977,11 @@ def _select_gpus( """Pick GPU(s) for a model based on estimated VRAM and free memory. ``model_size_bytes`` should include both model weights and estimated - KV cache. The 90% threshold provides headroom for compute buffers, - CUDA context, and other runtime overhead. + KV cache. The ``_GPU_PIN_VRAM_FRACTION`` threshold provides headroom + for compute buffers, CUDA context, and other runtime overhead. Returns (gpu_indices, use_fit): - - ([1], False) model fits on 1 GPU at 90% of free + - ([1], False) model fits on 1 GPU at the headroom threshold - ([1, 2], False) model needs 2 GPUs - (None, True) model too large, let --fit handle it """ @@ -971,12 +989,13 @@ def _select_gpus( return None, True model_size_mib = model_size_bytes / (1024 * 1024) + usable_fraction = LlamaCppBackend._GPU_PIN_VRAM_FRACTION # Sort GPUs by free memory descending ranked = sorted(gpus, key = lambda g: g[1], reverse = True) - # Try fitting on 1 GPU (90% of free memory threshold) - if ranked[0][1] * 0.90 >= model_size_mib: + # Try fitting on 1 GPU at the usable-VRAM threshold. + if ranked[0][1] * usable_fraction >= model_size_mib: return [ranked[0][0]], False # Try fitting on N GPUs (accumulate free memory from most-free) @@ -984,7 +1003,7 @@ def _select_gpus( selected = [] for idx, free_mib in ranked: selected.append(idx) - cumulative += free_mib * 0.90 + cumulative += free_mib * usable_fraction if cumulative >= model_size_mib: return sorted(selected), False @@ -1217,10 +1236,15 @@ def _fit_context_to_vram( ) -> int: """Return the largest context length that fits in GPU VRAM. - Uses 90% of available VRAM as the budget (matching _select_gpus - threshold -- 10% reserved for compute buffers, CUDA context, - scratch space, flash-attn workspace, etc.). - If the model weights alone don't fit, returns min_ctx unchanged. 
+ Uses 90% of available VRAM as the budget for the context-fitting + binary search (10% reserved for compute buffers, CUDA context, + scratch space, flash-attn workspace, etc.). The ctx-fit budget is + intentionally tighter than ``_GPU_PIN_VRAM_FRACTION``: this routine + chooses the slider/auto context value, where over-promising would + OOM at runtime; ``_select_gpus`` decides whether to pin a GPU at + all, where being conservative pushes layers to CPU instead. + If the model weights alone don't fit, returns ``requested_ctx`` + unchanged and lets ``--fit on`` flex ``-ngl`` at runtime. ``kv_on_gpu`` mirrors ``--kv-offload`` (default on). When False the KV cache lives in CPU RAM and doesn't compete with weights @@ -1966,6 +1990,7 @@ def load_model( # still has valid state to publish. effective_ctx = n_ctx if n_ctx > 0 else (self._context_length or 0) max_available_ctx = self._context_length or effective_ctx + gpus: list[tuple[int, int]] = [] try: model_size = self._get_gguf_size_bytes(model_path) gpus = self._get_gpu_free_memory() @@ -2050,7 +2075,15 @@ def load_model( # No silent shrink: effective_ctx stays == n_ctx. else: # Auto context: prefer fewer GPUs, cap context to fit. + # Match _select_gpus's headroom threshold so a model + # that fits at 91-95% of free VRAM still pins to GPU + # rather than falling through to ``--fit on`` (issue + # #5106). The ctx cap from ``_fit_context_to_vram`` + # uses a more conservative 90% budget so the slider + # value we land on still leaves room for compute + # buffers / CUDA context overhead. 
ranked = sorted(gpus, key = lambda g: g[1], reverse = True) + pin_fraction = self._GPU_PIN_VRAM_FRACTION for n_gpus in range(1, len(ranked) + 1): subset = ranked[:n_gpus] pool_mib = sum(free for _, free in subset) @@ -2065,18 +2098,40 @@ def load_model( capped, cache_type_kv, n_parallel = n_parallel ) total_mib = (model_size + kv) / (1024 * 1024) - if total_mib <= pool_mib * 0.90: + if total_mib <= pool_mib * pin_fraction: effective_ctx = capped gpu_indices = sorted(idx for idx, _ in subset) use_fit = False break else: - # No subset can host the weights (weights alone - # exceed 90% of every pool). Per spec, default - # the UI-visible context to 4096 and let - # --fit on flex -ngl so llama-server offloads - # layers to CPU RAM. + # No subset can host the weights at the native + # context. Default the UI-visible context to + # 4096 -- but before handing off to ``--fit on``, + # re-check whether the model fits at the smaller + # context (issue #5106). Without this re-check, + # a 20.8 GiB model with a 131072 native context + # on a 22.8 GiB GPU is correctly classified as + # "doesn't fit at native ctx", but then we'd + # ship ``--fit on`` without ``-ngl`` even though + # the same model + a 4096 KV cache pins + # comfortably on the GPU. effective_ctx = min(4096, effective_ctx) + if effective_ctx > 0: + for n_gpus in range(1, len(ranked) + 1): + subset = ranked[:n_gpus] + pool_mib = sum(free for _, free in subset) + kv = self._estimate_kv_cache_bytes( + effective_ctx, + cache_type_kv, + n_parallel = n_parallel, + ) + total_mib = (model_size + kv) / (1024 * 1024) + if total_mib <= pool_mib * pin_fraction: + gpu_indices = sorted( + idx for idx, _ in subset + ) + use_fit = False + break elif gpus: # Can't estimate KV -- fall back to file-size-only check. @@ -2500,12 +2555,72 @@ def load_model( self._healthy = True + # Diagnose silent CPU fallback: if Studio detected GPUs but + # llama-server allocated only CPU model buffers (no CUDA0, + # ROCm0, Metal, etc. 
buffer line in its startup log), the + # prebuilt binary couldn't load its GPU backend at runtime. + # On Windows this is the unslothai/unsloth#5106 symptom -- + # cudart64_X.dll / cublas64_X.dll missing because the user + # has no system CUDA toolkit and Studio used to ship without + # the cudart bundle. Fixed at install time (paired runtime + # archive) + warned here as a belt-and-suspenders for any + # other runtime-load failure. + self._gpu_offload_active = self._classify_gpu_offload( + gpu_indices is not None or use_fit, gpus or [] + ) + if self._gpu_offload_active is False: + logger.warning( + "llama-server appears to have loaded the model entirely " + "on CPU even though Studio detected at least one GPU. " + "This usually means the prebuilt binary's GPU backend " + "failed to load -- on Windows, cudart64_X.dll / " + "cublas64_X.dll could not be resolved. Reinstall the " + "Studio llama.cpp prebuilt or install a matching CUDA " + "toolkit (issue unslothai/unsloth#5106).", + ) + logger.info( f"llama-server ready on port {self._port} " f"for model '{model_identifier}'" ) return True + def _classify_gpu_offload( + self, + expected_gpu: bool, + detected_gpus: list[tuple[int, int]], + ) -> Optional[bool]: + """Return True if llama-server allocated GPU model buffers, False + if it allocated only CPU buffers (silent CPU fallback), or None + when there is no signal to classify (no GPU detected, or the + startup log didn't include a model-buffer-size line). + + ``expected_gpu`` mirrors Studio's intent: True when we either + pinned a GPU (-ngl) or asked the fork to fit on GPU. We only + warn when the user expected GPU AND we have probe evidence that + it didn't happen -- otherwise stay silent. + """ + if not detected_gpus or not expected_gpu: + return None + # llama-server prints one ``... model buffer size = ... MiB`` + # line per backend buffer the model lives in. 
Backend names + # include ``CUDA0`` / ``CUDA_Host`` for NVIDIA, ``ROCm0`` for + # AMD, ``Metal`` for Apple, ``Vulkan0`` for Vulkan. ``CPU`` + # / ``CPU_Mapped`` / ``CPU_AARCH64`` indicate CPU allocations. + gpu_markers = ("CUDA", "ROCm", "Metal", "Vulkan", "OpenCL", "SYCL") + saw_buffer_line = False + saw_gpu_buffer = False + for line in self._stdout_lines: + if "model buffer size" not in line: + continue + saw_buffer_line = True + if any(marker in line for marker in gpu_markers): + saw_gpu_buffer = True + break + if not saw_buffer_line: + return None + return saw_gpu_buffer + def unload_model(self) -> bool: """Terminate the llama-server subprocess and cancel any in-flight download.""" self._cancel_event.set() diff --git a/studio/backend/tests/test_llama_cpp_context_fit.py b/studio/backend/tests/test_llama_cpp_context_fit.py index caa6397901..93ccf77156 100644 --- a/studio/backend/tests/test_llama_cpp_context_fit.py +++ b/studio/backend/tests/test_llama_cpp_context_fit.py @@ -192,6 +192,7 @@ def fake_estimate(n_ctx_, _type = None, **_kwargs): else: ranked = sorted(gpus, key = lambda g: g[1], reverse = True) matched = False + pin_fraction = LlamaCppBackend._GPU_PIN_VRAM_FRACTION for n_gpus in range(1, len(ranked) + 1): subset = ranked[:n_gpus] pool_mib = sum(free for _, free in subset) @@ -203,7 +204,7 @@ def fake_estimate(n_ctx_, _type = None, **_kwargs): ) kv = inst._estimate_kv_cache_bytes(capped, cache_type_kv) total_mib = (model_size + kv) / (1024 * 1024) - if total_mib <= pool_mib * 0.90: + if total_mib <= pool_mib * pin_fraction: effective_ctx = capped gpu_indices = sorted(idx for idx, _ in subset) use_fit = False @@ -211,6 +212,20 @@ def fake_estimate(n_ctx_, _type = None, **_kwargs): break if not matched: effective_ctx = min(FALLBACK_CTX, effective_ctx) + # Mirror llama_cpp.py: after dropping ctx to FALLBACK_CTX, + # re-check whether the model fits with the smaller KV cache. 
+ if effective_ctx > 0: + for n_gpus in range(1, len(ranked) + 1): + subset = ranked[:n_gpus] + pool_mib = sum(free for _, free in subset) + kv = inst._estimate_kv_cache_bytes( + effective_ctx, cache_type_kv + ) + total_mib = (model_size + kv) / (1024 * 1024) + if total_mib <= pool_mib * pin_fraction: + gpu_indices = sorted(idx for idx, _ in subset) + use_fit = False + break elif gpus: gpu_indices, use_fit = inst._select_gpus(model_size, gpus) if use_fit and not explicit_ctx: @@ -378,6 +393,67 @@ def test_no_kv_metadata_fittable_auto(self): assert plan["gpu_indices"] == [0] +# --------------------------------------------------------------------------- +# Issue #5106 / Discord "RAM not VRAM" regression: a model whose weights +# occupy ~92-94% of free VRAM must still pin to GPU. Before the threshold +# bump from 0.90 to 0.95 this case fell through to ``--fit on`` without +# ``-ngl``, and the unsloth llama.cpp fork's fit logic (default +# ``--fit-target 1024``) ended up offloading layers to CPU even though the +# model would have loaded comfortably with all layers on GPU. +# --------------------------------------------------------------------------- + + +class TestTightFitPinsToGPU: + """Models that fit at 91-95% of free VRAM must use the GPU.""" + + def test_rtx_4090_qwen_24gb_class(self): + # noahterbest's reproducer in unslothai/unsloth#5106: + # "GGUF size: 20.8 GB, est. KV cache: 0.1 GB, context: 4096, + # GPUs free: [(0, 22805)], selected: None, fit: True" + # With ctx=4096, the model + KV occupies ~94% of free VRAM. The + # remaining ~1.4 GiB headroom is enough for the CUDA context and + # compute buffers on a 4090, so Studio should pin the GPU and + # offload all layers via ``-ngl -1`` instead of relying on the + # fork's fit logic. 
+ plan = _drive( + n_ctx = 0, + model_gib = 20.8, + gpus = [(0, 22_805)], + native_ctx = 131072, + kv_per_token_bytes = 25_000, + ) + assert plan["use_fit"] is False + assert plan["gpu_indices"] == [0] + + def test_explicit_ctx_at_94_pct_pins_to_gpu(self): + # Same shape as above, but the user explicitly chose a context + # length. The explicit-ctx branch goes through ``_select_gpus`` + # which must agree with the auto-ctx branch on the headroom rule. + plan = _drive( + n_ctx = 4096, + model_gib = 20.8, + gpus = [(0, 22_805)], + native_ctx = 131072, + kv_per_token_bytes = 25_000, + ) + assert plan["use_fit"] is False + assert plan["gpu_indices"] == [0] + + def test_genuine_overflow_still_uses_fit(self): + # Above the 95% pin threshold the fork's ``--fit on`` is still + # the right answer; we don't want the threshold bump to mask a + # truly oversized model as "fits". + plan = _drive( + n_ctx = 4096, + model_gib = 23, + gpus = [(0, 22_000)], + native_ctx = 131072, + kv_per_token_bytes = 25_000, + ) + assert plan["use_fit"] is True + assert plan["gpu_indices"] is None + + # --------------------------------------------------------------------------- # Platform-agnostic input shape # --------------------------------------------------------------------------- @@ -391,3 +467,84 @@ def test_identical_decision_across_platforms(platform_tag): plan_a = _drive(n_ctx = 0, model_gib = 8, gpus = [(0, 24_000)]) plan_b = _drive(n_ctx = 0, model_gib = 8, gpus = [(0, 24_000)]) assert plan_a == plan_b, platform_tag + + +# --------------------------------------------------------------------------- +# _classify_gpu_offload: detect silent CPU fallback (issue #5106 / Discord +# "RAM not VRAM"). When Studio detected GPUs and intended to use them, but +# llama-server allocated only CPU model buffers, the prebuilt's GPU +# backend failed to load -- usually missing cudart64_X.dll on Windows. 
+# --------------------------------------------------------------------------- + + +class TestClassifyGpuOffload: + def _backend(self, stdout_lines): + inst = LlamaCppBackend.__new__(LlamaCppBackend) + inst._stdout_lines = list(stdout_lines) + return inst + + def test_cuda_buffer_present_returns_true(self): + inst = self._backend( + [ + "load_tensors: offloaded 33/33 layers to GPU", + "load_tensors: CUDA0 model buffer size = 21000.0 MiB", + "load_tensors: CPU_Mapped model buffer size = 0.6 MiB", + ] + ) + assert inst._classify_gpu_offload(True, [(0, 22805)]) is True + + def test_cpu_only_buffer_returns_false(self): + # llama-server printed buffer lines but only CPU buffers -- + # this is the silent CPU fallback symptom we want to catch. + inst = self._backend( + [ + "load_tensors: CPU_Mapped model buffer size = 21000.0 MiB", + "load_tensors: CPU model buffer size = 0.6 MiB", + ] + ) + assert inst._classify_gpu_offload(True, [(0, 22805)]) is False + + def test_no_buffer_lines_returns_none(self): + # If we can't see buffer-allocation lines at all, don't guess. + inst = self._backend( + [ + "INFO [main] starting server", + "load_tensors: file format = GGUF V3", + ] + ) + assert inst._classify_gpu_offload(True, [(0, 22805)]) is None + + def test_no_gpus_detected_returns_none(self): + # CPU-only systems are valid; suppress the warning entirely. + inst = self._backend( + [ + "load_tensors: CPU_Mapped model buffer size = 21000.0 MiB", + ] + ) + assert inst._classify_gpu_offload(False, []) is None + + def test_user_did_not_intend_gpu_returns_none(self): + # Studio called start_llama_server without expecting GPU use; + # don't warn. 
+ inst = self._backend( + [ + "load_tensors: CPU_Mapped model buffer size = 21000.0 MiB", + ] + ) + assert inst._classify_gpu_offload(False, [(0, 22805)]) is None + + def test_rocm_buffer_marker_returns_true(self): + inst = self._backend( + [ + "load_tensors: ROCm0 model buffer size = 21000.0 MiB", + ] + ) + assert inst._classify_gpu_offload(True, [(0, 22805)]) is True + + def test_metal_buffer_marker_returns_true(self): + inst = self._backend( + [ + "load_tensors: Metal model buffer size = 8000.0 MiB", + ] + ) + assert inst._classify_gpu_offload(True, [(0, 22805)]) is True From 2e9bb84811c8e74d4386cbda5974e8602326ad42 Mon Sep 17 00:00:00 2001 From: "pre-commit-ci[bot]" <66853113+pre-commit-ci[bot]@users.noreply.github.com> Date: Thu, 7 May 2026 10:40:43 +0000 Subject: [PATCH 2/4] [pre-commit.ci] auto fixes from pre-commit.com hooks for more information, see https://pre-commit.ci --- studio/backend/core/inference/llama_cpp.py | 4 +--- studio/backend/tests/test_llama_cpp_context_fit.py | 4 +--- 2 files changed, 2 insertions(+), 6 deletions(-) diff --git a/studio/backend/core/inference/llama_cpp.py b/studio/backend/core/inference/llama_cpp.py index 2080f68f5a..99b8afcdb6 100644 --- a/studio/backend/core/inference/llama_cpp.py +++ b/studio/backend/core/inference/llama_cpp.py @@ -2127,9 +2127,7 @@ def load_model( ) total_mib = (model_size + kv) / (1024 * 1024) if total_mib <= pool_mib * pin_fraction: - gpu_indices = sorted( - idx for idx, _ in subset - ) + gpu_indices = sorted(idx for idx, _ in subset) use_fit = False break diff --git a/studio/backend/tests/test_llama_cpp_context_fit.py b/studio/backend/tests/test_llama_cpp_context_fit.py index 93ccf77156..0395a92da3 100644 --- a/studio/backend/tests/test_llama_cpp_context_fit.py +++ b/studio/backend/tests/test_llama_cpp_context_fit.py @@ -218,9 +218,7 @@ def fake_estimate(n_ctx_, _type = None, **_kwargs): for n_gpus in range(1, len(ranked) + 1): subset = ranked[:n_gpus] pool_mib = sum(free for _, free in subset) - kv 
= inst._estimate_kv_cache_bytes( - effective_ctx, cache_type_kv - ) + kv = inst._estimate_kv_cache_bytes(effective_ctx, cache_type_kv) total_mib = (model_size + kv) / (1024 * 1024) if total_mib <= pool_mib * pin_fraction: gpu_indices = sorted(idx for idx, _ in subset) From c9d1ccc3d1b7b0aea0cc0e4bd82329a77fd20e42 Mon Sep 17 00:00:00 2001 From: Daniel Han Date: Thu, 7 May 2026 11:28:38 +0000 Subject: [PATCH 3/4] Trim comments to be more succinct --- studio/backend/core/inference/llama_cpp.py | 93 +++++-------------- .../tests/test_llama_cpp_context_fit.py | 33 ++----- 2 files changed, 32 insertions(+), 94 deletions(-) diff --git a/studio/backend/core/inference/llama_cpp.py b/studio/backend/core/inference/llama_cpp.py index 99b8afcdb6..f039fbf642 100644 --- a/studio/backend/core/inference/llama_cpp.py +++ b/studio/backend/core/inference/llama_cpp.py @@ -433,9 +433,7 @@ def __init__(self): self._hf_variant: Optional[str] = None self._is_vision: bool = False self._healthy = False - # True/False once the post-load classifier sees evidence; None - # when there's no signal (no GPU detected, or llama-server's - # buffer-allocation log was absent / unparseable). + # Set by _classify_gpu_offload after _wait_for_health. self._gpu_offload_active: Optional[bool] = None self._context_length: Optional[int] = None self._effective_context_length: Optional[int] = None @@ -955,18 +953,11 @@ def _get_gpu_free_memory() -> list[tuple[int, int]]: logger.debug(f"torch GPU probe failed: {e}") return [] - # Fraction of free GPU VRAM Studio is willing to pin a model into - # before it falls back to ``--fit on`` (and lets the unsloth llama.cpp - # fork's fit logic decide how many layers to offload). 0.95 leaves - # ~5% of free VRAM for the CUDA context, compute buffers, flash-attn - # workspace, and other per-launch overhead. 
0.90 is too conservative - # at this layer: when a model needs ~92-94% of free VRAM (issue - # #5106 / Discord "RAM not VRAM"), Studio used to flip ``use_fit=True`` - # without ``-ngl``, the fork's fit logic ran with its default - # ``--fit-target 1024`` (1 GiB margin) and pushed substantial layer - # weight off the GPU even though the model would have loaded - # comfortably. The fork's own fit logic still serves as the safety - # net for the genuinely-too-large case. + # Free-VRAM fraction at which Studio pins the GPU directly instead + # of deferring to ``--fit on``. 5% headroom covers CUDA context + + # compute buffers; 0.90 was too conservative and dropped 91-94% + # fits to CPU offload (#5106). The fork's --fit on still catches + # the truly-too-large case. _GPU_PIN_VRAM_FRACTION = 0.95 @staticmethod @@ -1236,15 +1227,11 @@ def _fit_context_to_vram( ) -> int: """Return the largest context length that fits in GPU VRAM. - Uses 90% of available VRAM as the budget for the context-fitting - binary search (10% reserved for compute buffers, CUDA context, - scratch space, flash-attn workspace, etc.). The ctx-fit budget is - intentionally tighter than ``_GPU_PIN_VRAM_FRACTION``: this routine - chooses the slider/auto context value, where over-promising would - OOM at runtime; ``_select_gpus`` decides whether to pin a GPU at - all, where being conservative pushes layers to CPU instead. - If the model weights alone don't fit, returns ``requested_ctx`` - unchanged and lets ``--fit on`` flex ``-ngl`` at runtime. + Uses 90% of available VRAM as the ctx-fit budget. Tighter than + ``_GPU_PIN_VRAM_FRACTION`` on purpose: over-promising context + OOMs at runtime, while pinning conservatively just defers to + --fit on. If the weights alone don't fit, returns + ``requested_ctx`` unchanged. ``kv_on_gpu`` mirrors ``--kv-offload`` (default on). 
When False the KV cache lives in CPU RAM and doesn't compete with weights @@ -2074,14 +2061,9 @@ def load_model( gpu_indices, use_fit = self._select_gpus(requested_total, gpus) # No silent shrink: effective_ctx stays == n_ctx. else: - # Auto context: prefer fewer GPUs, cap context to fit. - # Match _select_gpus's headroom threshold so a model - # that fits at 91-95% of free VRAM still pins to GPU - # rather than falling through to ``--fit on`` (issue - # #5106). The ctx cap from ``_fit_context_to_vram`` - # uses a more conservative 90% budget so the slider - # value we land on still leaves room for compute - # buffers / CUDA context overhead. + # Auto context: prefer fewer GPUs, cap context + # to fit. Same headroom threshold as + # _select_gpus (#5106). ranked = sorted(gpus, key = lambda g: g[1], reverse = True) pin_fraction = self._GPU_PIN_VRAM_FRACTION for n_gpus in range(1, len(ranked) + 1): @@ -2104,17 +2086,10 @@ def load_model( use_fit = False break else: - # No subset can host the weights at the native - # context. Default the UI-visible context to - # 4096 -- but before handing off to ``--fit on``, - # re-check whether the model fits at the smaller - # context (issue #5106). Without this re-check, - # a 20.8 GiB model with a 131072 native context - # on a 22.8 GiB GPU is correctly classified as - # "doesn't fit at native ctx", but then we'd - # ship ``--fit on`` without ``-ngl`` even though - # the same model + a 4096 KV cache pins - # comfortably on the GPU. + # Native ctx doesn't fit. Drop to 4096 and + # re-check before deferring to --fit on: + # a model that overflows at 131k may pin + # comfortably with a 4096 KV cache (#5106). effective_ctx = min(4096, effective_ctx) if effective_ctx > 0: for n_gpus in range(1, len(ranked) + 1): @@ -2553,16 +2528,7 @@ def load_model( self._healthy = True - # Diagnose silent CPU fallback: if Studio detected GPUs but - # llama-server allocated only CPU model buffers (no CUDA0, - # ROCm0, Metal, etc. 
buffer line in its startup log), the - # prebuilt binary couldn't load its GPU backend at runtime. - # On Windows this is the unslothai/unsloth#5106 symptom -- - # cudart64_X.dll / cublas64_X.dll missing because the user - # has no system CUDA toolkit and Studio used to ship without - # the cudart bundle. Fixed at install time (paired runtime - # archive) + warned here as a belt-and-suspenders for any - # other runtime-load failure. + # Catch silent CPU fallback when GPU was intended (#5106). self._gpu_offload_active = self._classify_gpu_offload( gpu_indices is not None or use_fit, gpus or [] ) @@ -2588,23 +2554,14 @@ def _classify_gpu_offload( expected_gpu: bool, detected_gpus: list[tuple[int, int]], ) -> Optional[bool]: - """Return True if llama-server allocated GPU model buffers, False - if it allocated only CPU buffers (silent CPU fallback), or None - when there is no signal to classify (no GPU detected, or the - startup log didn't include a model-buffer-size line). - - ``expected_gpu`` mirrors Studio's intent: True when we either - pinned a GPU (-ngl) or asked the fork to fit on GPU. We only - warn when the user expected GPU AND we have probe evidence that - it didn't happen -- otherwise stay silent. - """ + """True if a GPU model buffer was allocated, False if only CPU + buffers landed despite GPU intent, None when there's no signal + (no GPU detected, no buffer-size lines, etc.).""" if not detected_gpus or not expected_gpu: return None - # llama-server prints one ``... model buffer size = ... MiB`` - # line per backend buffer the model lives in. Backend names - # include ``CUDA0`` / ``CUDA_Host`` for NVIDIA, ``ROCm0`` for - # AMD, ``Metal`` for Apple, ``Vulkan0`` for Vulkan. ``CPU`` - # / ``CPU_Mapped`` / ``CPU_AARCH64`` indicate CPU allocations. + # llama-server logs one ``... model buffer size = N MiB`` line + # per backend buffer; CUDA0 / ROCm0 / Metal / Vulkan0 / + # OpenCL0 / SYCL0 are GPU, CPU / CPU_Mapped are not. 
gpu_markers = ("CUDA", "ROCm", "Metal", "Vulkan", "OpenCL", "SYCL") saw_buffer_line = False saw_gpu_buffer = False diff --git a/studio/backend/tests/test_llama_cpp_context_fit.py b/studio/backend/tests/test_llama_cpp_context_fit.py index 0395a92da3..1ea76edd15 100644 --- a/studio/backend/tests/test_llama_cpp_context_fit.py +++ b/studio/backend/tests/test_llama_cpp_context_fit.py @@ -212,8 +212,7 @@ def fake_estimate(n_ctx_, _type = None, **_kwargs): break if not matched: effective_ctx = min(FALLBACK_CTX, effective_ctx) - # Mirror llama_cpp.py: after dropping ctx to FALLBACK_CTX, - # re-check whether the model fits with the smaller KV cache. + # Mirror llama_cpp.py: re-check fit at FALLBACK_CTX. if effective_ctx > 0: for n_gpus in range(1, len(ranked) + 1): subset = ranked[:n_gpus] @@ -392,12 +391,7 @@ def test_no_kv_metadata_fittable_auto(self): # --------------------------------------------------------------------------- -# Issue #5106 / Discord "RAM not VRAM" regression: a model whose weights -# occupy ~92-94% of free VRAM must still pin to GPU. Before the threshold -# bump from 0.90 to 0.95 this case fell through to ``--fit on`` without -# ``-ngl``, and the unsloth llama.cpp fork's fit logic (default -# ``--fit-target 1024``) ended up offloading layers to CPU even though the -# model would have loaded comfortably with all layers on GPU. +# #5106 regression: 91-95% utilization must still pin GPU. # --------------------------------------------------------------------------- @@ -405,14 +399,8 @@ class TestTightFitPinsToGPU: """Models that fit at 91-95% of free VRAM must use the GPU.""" def test_rtx_4090_qwen_24gb_class(self): - # noahterbest's reproducer in unslothai/unsloth#5106: - # "GGUF size: 20.8 GB, est. KV cache: 0.1 GB, context: 4096, - # GPUs free: [(0, 22805)], selected: None, fit: True" - # With ctx=4096, the model + KV occupies ~94% of free VRAM. 
The - # remaining ~1.4 GiB headroom is enough for the CUDA context and - # compute buffers on a 4090, so Studio should pin the GPU and - # offload all layers via ``-ngl -1`` instead of relying on the - # fork's fit logic. + # noahterbest's #5106 log: 20.8 GB model on 22805 MiB free + # GPU, ctx=4096 -> ~94% utilization, ~1.4 GiB headroom. plan = _drive( n_ctx = 0, model_gib = 20.8, @@ -424,9 +412,7 @@ def test_rtx_4090_qwen_24gb_class(self): assert plan["gpu_indices"] == [0] def test_explicit_ctx_at_94_pct_pins_to_gpu(self): - # Same shape as above, but the user explicitly chose a context - # length. The explicit-ctx branch goes through ``_select_gpus`` - # which must agree with the auto-ctx branch on the headroom rule. + # Explicit-ctx branch must agree with auto-ctx on headroom. plan = _drive( n_ctx = 4096, model_gib = 20.8, @@ -438,9 +424,7 @@ def test_explicit_ctx_at_94_pct_pins_to_gpu(self): assert plan["gpu_indices"] == [0] def test_genuine_overflow_still_uses_fit(self): - # Above the 95% pin threshold the fork's ``--fit on`` is still - # the right answer; we don't want the threshold bump to mask a - # truly oversized model as "fits". + # Beyond 95% must still defer to --fit on. plan = _drive( n_ctx = 4096, model_gib = 23, @@ -468,10 +452,7 @@ def test_identical_decision_across_platforms(platform_tag): # --------------------------------------------------------------------------- -# _classify_gpu_offload: detect silent CPU fallback (issue #5106 / Discord -# "RAM not VRAM"). When Studio detected GPUs and intended to use them, but -# llama-server allocated only CPU model buffers, the prebuilt's GPU -# backend failed to load -- usually missing cudart64_X.dll on Windows. +# _classify_gpu_offload: detect silent CPU fallback (#5106). 
# --------------------------------------------------------------------------- From e70c1cd34976698d26bea9fb2b3765a10972ecd7 Mon Sep 17 00:00:00 2001 From: Daniel Han Date: Sun, 10 May 2026 12:43:04 +0000 Subject: [PATCH 4/4] Scrub .github/workflows for staging push (matches staging base) --- .github/workflows/studio-backend-ci.yml | 200 ------------------- .github/workflows/studio-frontend-ci.yml | 108 ---------- .github/workflows/studio-inference-smoke.yml | 185 ----------------- .github/workflows/studio-tauri-smoke.yml | 105 ---------- .github/workflows/wheel-smoke.yml | 124 ------------ 5 files changed, 722 deletions(-) delete mode 100644 .github/workflows/studio-backend-ci.yml delete mode 100644 .github/workflows/studio-frontend-ci.yml delete mode 100644 .github/workflows/studio-inference-smoke.yml delete mode 100644 .github/workflows/studio-tauri-smoke.yml delete mode 100644 .github/workflows/wheel-smoke.yml diff --git a/.github/workflows/studio-backend-ci.yml b/.github/workflows/studio-backend-ci.yml deleted file mode 100644 index 5a858888e7..0000000000 --- a/.github/workflows/studio-backend-ci.yml +++ /dev/null @@ -1,200 +0,0 @@ -# SPDX-License-Identifier: AGPL-3.0-only -# Copyright 2026-present the Unsloth AI Inc. team. All rights reserved. - -# Runs the existing studio/backend/tests/ suite (~860 tests, all CPU-friendly) -# on every PR that touches the backend or unsloth library. Until this lands, -# none of those tests run automatically. Verified locally on Python 3.13 with -# the surgical exclusions below: 861 pass, 4 skipped. -# -# Exclusions: -# - tests/test_studio_api.py: end-to-end against a live model + GGUF download, -# too heavy for free runners. Run separately when GPU CI is available. -# - -k 'not llama_cpp_load_progress_live': spawns a real llama.cpp process, -# not appropriate for CPU-only runners. -# -# ruff is non-blocking initially; remove `|| true` once the backend lints clean. 
- -name: Backend CI - -on: - pull_request: - paths: - - 'studio/**' - - 'unsloth/**' - - 'unsloth_cli/**' - - 'tests/**' - - 'pyproject.toml' - - '.github/workflows/studio-backend-ci.yml' - push: - branches: [main, pip] - -concurrency: - group: ${{ github.workflow }}-${{ github.ref }} - cancel-in-progress: true - -jobs: - pytest: - name: (Python ${{ matrix.python }}) - runs-on: ubuntu-latest - timeout-minutes: 15 - strategy: - fail-fast: false - matrix: - python: ['3.10', '3.11', '3.12', '3.13'] - steps: - - uses: actions/checkout@v4 - - - uses: actions/setup-python@v5 - with: - python-version: '${{ matrix.python }}' - cache: 'pip' - - - name: Install backend test dependencies (CPU only) - run: | - python -m pip install --upgrade pip - # Studio's declared backend deps: - pip install -r studio/backend/requirements/studio.txt - # Extras that studio.txt does not list but the import chain needs - # (python-multipart for FastAPI form/file uploads, sqlalchemy/cryptography - # for the auth DB, yaml/jinja2 for utils.models.model_config, etc.): - pip install \ - python-multipart aiofiles sqlalchemy cryptography \ - pyyaml jinja2 mammoth unpdf requests \ - 'numpy<3' pytest pytest-asyncio httpx - # Torch CPU + transformers are required by a chunk of the backend test - # suite (gpu_selection, kv_cache_estimation, utils). CPU-only torch - # keeps the install ~250 MB / ~1 min on a clean runner. - pip install --index-url https://download.pytorch.org/whl/cpu 'torch>=2.4,<2.11' - pip install 'transformers>=4.51,<5.5' - - - name: Backend tests - working-directory: studio/backend - # Locally validated against this dep set: 831 passed, 5 skipped, 35 deselected. 
- # Deselections (all environment-specific, would never pass on a GPU-less - # `ubuntu-latest` runner regardless of code correctness): - # - llama_cpp_load_progress_live: spawns a real llama.cpp process - # - TestGpuAutoSelection / TestPreSpawnGpuResolution / TestPerGpuFitGuardAllCounts: - # require live transformers config introspection on real GPUs - # - TestTransformersIntrospection: same - # - test_returns_cuda_when_cuda_available / test_calls_cuda_cache_when_cuda: - # assume CUDA-capable GPU - run: | - python -m pytest tests/ -q --tb=short \ - --ignore=tests/test_studio_api.py \ - -k 'not llama_cpp_load_progress_live and not TestGpuAutoSelection and not TestPreSpawnGpuResolution and not TestPerGpuFitGuardAllCounts and not TestTransformersIntrospection and not test_returns_cuda_when_cuda_available and not test_calls_cuda_cache_when_cuda' - - repo-cpu-tests: - # Auto-discover everything under tests/ that is not GPU-bound by - # design. New tests added in covered directories are picked up - # without a workflow edit. Locally validated: 779 passed, 11 - # skipped, 23 deselected. tests/conftest.py (mirroring unsloth-zoo - # PR #624) pre-loads unsloth_zoo.device_type and unsloth.device_type - # under a mocked torch.cuda.is_available so the unsloth import - # chain succeeds on CPU. - name: Repo tests (CPU) - runs-on: ubuntu-latest - timeout-minutes: 10 - steps: - - uses: actions/checkout@v4 - - - uses: actions/setup-python@v5 - with: - python-version: '3.12' - cache: 'pip' - - - name: Install deps (shared shape with backend pytest job) - run: | - python -m pip install --upgrade pip - pip install -r studio/backend/requirements/studio.txt - pip install \ - python-multipart aiofiles sqlalchemy cryptography \ - pyyaml jinja2 mammoth unpdf requests typer \ - 'numpy<3' pytest pytest-asyncio httpx - # torchvision is needed because unsloth_zoo.vision_utils imports - # it at module scope and is reached via unsloth.models._utils. 
- pip install --index-url https://download.pytorch.org/whl/cpu \ - 'torch>=2.4,<2.11' 'torchvision<0.26' - pip install 'transformers>=4.51,<5.5' - # bitsandbytes is a hard import in unsloth/models/_utils.py. - # Recent versions ship a CPU build so it installs on a free - # Linux runner; the kernels still raise on use, but import - # succeeds and the package collects. - pip install 'bitsandbytes>=0.45' - # unsloth.device_type imports unsloth_zoo.utils.Version at module - # scope, so the conftest harness needs unsloth_zoo on the path - # even though it is an optional dep of unsloth. - pip install 'unsloth_zoo>=2026.5.1' - pip install -e . --no-deps - - - name: Repo tests (CPU, auto-discovered) - env: - # tests/python/* import install_python_stack from studio/. - PYTHONPATH: ${{ github.workspace }}/studio - # Skip lazy compilation work the unsloth import chain wants to - # do at import time on a real GPU. - UNSLOTH_COMPILE_DISABLE: '1' - # --ignore: GPU-bound directories (qlora and saving need real - # weights / GPU; tests/sh is a shell suite the next step - # handles; tests/utils is a helpers folder, not tests). - # State-sensitive hardware-spoofing files are pulled out and run - # in isolation in the next step because they mutate - # hardware.py module globals (IS_ROCM / DEVICE) and pollute - # downstream tests. - # -m: honour markers already declared in tests/python/conftest.py - # (`server` = needs studio venv, `e2e` = needs network). - # --deselect: two registry tests that hit huggingface_hub for - # live model existence checks; they belong on a network job. 
- run: | - python -m pytest tests/ -q --tb=short \ - --ignore=tests/qlora \ - --ignore=tests/saving \ - --ignore=tests/utils \ - --ignore=tests/sh \ - --ignore=tests/studio/test_hardware_dispatch_matrix.py \ - --ignore=tests/studio/test_is_mlx_dispatch_gate.py \ - -m 'not server and not e2e' \ - --deselect tests/test_model_registry.py::test_model_registration \ - --deselect tests/test_model_registry.py::test_all_model_registration - - - name: Hardware-spoof tests (state-sensitive, run in isolation) - env: - PYTHONPATH: ${{ github.workspace }}/studio - UNSLOTH_COMPILE_DISABLE: '1' - # These two files mutate hardware.py module globals at runtime - # via the spoof fixtures, which leaks state into any other test - # that imports hardware. Run them in their own pytest invocation - # so the leak does not cross file boundaries. - run: | - python -m pytest -q --tb=short \ - tests/studio/test_hardware_dispatch_matrix.py \ - tests/studio/test_is_mlx_dispatch_gate.py - - - name: Shell installer tests - # Subset that does not depend on a writable / pristine install.sh - # tree; test_install_host_defaults.sh checks install.ps1 layout - # which has drifted (separate followup). 
- run: | - set -e - for s in \ - tests/sh/test_get_torch_index_url.sh \ - tests/sh/test_mac_intel_compat.sh \ - tests/sh/test_tauri_install_exit_order.sh \ - tests/sh/test_torch_constraint.sh; do - echo "::group::$s" - bash "$s" - echo "::endgroup::" - done - - ruff: - name: Backend ruff lint (non-blocking) - runs-on: ubuntu-latest - timeout-minutes: 5 - steps: - - uses: actions/checkout@v4 - - uses: actions/setup-python@v5 - with: - python-version: '3.12' - cache: 'pip' - - run: pip install ruff - - name: ruff check (non-blocking until accumulated drift is cleared) - run: ruff check studio/backend || true diff --git a/.github/workflows/studio-frontend-ci.yml b/.github/workflows/studio-frontend-ci.yml deleted file mode 100644 index 039bd5dd08..0000000000 --- a/.github/workflows/studio-frontend-ci.yml +++ /dev/null @@ -1,108 +0,0 @@ -# SPDX-License-Identifier: AGPL-3.0-only -# Copyright 2026-present the Unsloth AI Inc. team. All rights reserved. - -# Frontend PR gate: lockfile freshness, typecheck, build, and a bundle grep -# that catches the 2026.5.1 chat-history regression at the JS level. -# -# biome runs as non-blocking for now: the codebase currently has accumulated -# ~470 errors and ~1650 warnings against the existing biome config. Surfacing -# the count in CI lets us drive it down without forcing a fleet-wide cleanup -# in the same PR. Drop `continue-on-error` once that number is zero. - -name: Frontend CI - -on: - pull_request: - paths: - - 'studio/frontend/**' - - '.github/workflows/studio-frontend-ci.yml' - push: - branches: [main, pip] - -concurrency: - group: ${{ github.workflow }}-${{ github.ref }} - cancel-in-progress: true - -jobs: - build: - name: Frontend build + bundle sanity - runs-on: ubuntu-latest - timeout-minutes: 10 - defaults: - run: - working-directory: studio/frontend - steps: - - uses: actions/checkout@v4 - - # FIXME: drop this step once @assistant-ui/* and assistant-stream - # leave 0.x -- on 1.x, caret ranges are conventional. 
Until then, - # every 0.minor on this surface is a SemVer-major (this is exactly - # how 2026.5.1 shipped a broken chat runtime: ^0.12.19 quietly - # resolved to 0.12.28). - - name: '@assistant-ui must be pinned exactly (no caret/tilde)' - working-directory: ${{ github.workspace }} - run: | - set -e - if grep -nE '"(@assistant-ui/[a-z-]+|assistant-stream)":[[:space:]]*"[\^~]' studio/frontend/package.json; then - echo "::error file=studio/frontend/package.json::These packages must be pinned to exact versions until they leave 0.x. Drop the leading ^ or ~." - exit 1 - fi - echo "All assistant-ui packages are pinned exactly." - - - uses: actions/setup-node@v4 - with: - node-version: '22' - cache: 'npm' - cache-dependency-path: studio/frontend/package-lock.json - - - name: Lockfile must agree with package.json (npm ci is strict) - run: npm ci --no-fund --no-audit - - - name: npm ci must not have modified the working tree - working-directory: ${{ github.workspace }} - run: | - if ! git diff --quiet -- studio/frontend; then - echo "::error::npm ci modified files; commit the updated lockfile" - git status -- studio/frontend - exit 1 - fi - - - name: Typecheck - run: npm run typecheck - - - name: Build - run: npm run build - - - name: Built bundle must not contain Studio's unstable_Provider call site - run: | - set -e - JS=$(ls dist/assets/index-*.js | head -1) - HITS=$(grep -c 'unstable_Provider:' "$JS" || echo 0) - echo "main bundle: $JS" - echo "unstable_Provider: hits=$HITS (assistant-ui internals contribute up to 3)" - if [ "$HITS" -gt 3 ]; then - echo "::error file=studio/frontend/src/features/chat/runtime-provider.tsx::Studio bundle still passes unstable_Provider through useRemoteThreadListRuntime; this is the 2026.5.1 chat-history regression. Pass adapters directly into useLocalRuntime instead." 
- exit 1 - fi - - - name: Bundle size budget (75 MB) - run: | - SIZE=$(du -sb dist | cut -f1) - BUDGET=$((75 * 1024 * 1024)) - echo "dist size: $SIZE bytes ($((SIZE/1024/1024)) MB), budget: $BUDGET bytes (75 MB)" - if [ "$SIZE" -gt "$BUDGET" ]; then - echo "::error::studio/frontend/dist/ exceeded the 75 MB budget. Drop dead deps (e.g. the unused next dep) or split chunks." - exit 1 - fi - - - name: Biome (non-blocking until accumulated drift is cleared) - continue-on-error: true - run: npm run biome:check - - - name: Upload built dist on failure - if: failure() - uses: actions/upload-artifact@v4 - with: - name: studio-frontend-dist - path: studio/frontend/dist - retention-days: 3 diff --git a/.github/workflows/studio-inference-smoke.yml b/.github/workflows/studio-inference-smoke.yml deleted file mode 100644 index 8efe072d28..0000000000 --- a/.github/workflows/studio-inference-smoke.yml +++ /dev/null @@ -1,185 +0,0 @@ -# SPDX-License-Identifier: AGPL-3.0-only -# Copyright 2026-present the Unsloth AI Inc. team. All rights reserved. - -# End-to-end smoke: install Studio via install.sh --local --no-torch, download -# a tiny GGUF, boot Studio, log in, change password, load the model, send a -# chat completion, assert a non-empty response. Only workflow that tests "the -# app actually works". -# -# Model: Qwen3.5-2B UD-IQ3_XXS (~890 MiB) -- small enough that the cache miss -# is cheap and inference fits in the 25 min CPU-runner budget. GGUF is cached -# across runs via actions/cache. - -name: Studio GGUF CI - -on: - pull_request: - paths: - - 'studio/**' - - 'unsloth/**' - - 'unsloth_cli/**' - - 'install.sh' - - 'pyproject.toml' - - '.github/workflows/studio-inference-smoke.yml' - push: - branches: [main, pip] - # Manual trigger for pre-warming the GGUF cache on main, or re-running - # against an arbitrary branch without pushing a no-op commit. 
- workflow_dispatch: - -concurrency: - group: ${{ github.workflow }}-${{ github.ref }} - cancel-in-progress: true - -env: - GGUF_REPO: unsloth/Qwen3.5-2B-GGUF - GGUF_FILE: Qwen3.5-2B-UD-IQ3_XXS.gguf - STUDIO_PORT: '18888' - -jobs: - inference: - name: Studio boots, loads a GGUF, answers a chat completion - runs-on: ubuntu-latest - timeout-minutes: 25 - steps: - - uses: actions/checkout@v4 - - - name: Linux dependencies for llama.cpp prebuilt - run: | - sudo apt-get update - sudo apt-get install -y --no-install-recommends \ - libcurl4-openssl-dev libssl-dev jq - - - uses: actions/setup-node@v4 - with: - node-version: '22' - cache: 'npm' - cache-dependency-path: studio/frontend/package-lock.json - - - uses: actions/setup-python@v5 - with: - python-version: '3.12' - cache: 'pip' - - - name: Cache GGUF model file - id: cache-gguf - uses: actions/cache@v4 - with: - path: gguf-cache - key: ${{ runner.os }}-gguf-${{ env.GGUF_REPO }}-${{ env.GGUF_FILE }}-v1 - - - name: Download GGUF if cache miss - if: steps.cache-gguf.outputs.cache-hit != 'true' - run: | - # huggingface-cli was deprecated in huggingface_hub 1.13; the new CLI is `hf`. - python -m pip install --upgrade huggingface_hub hf_transfer - mkdir -p gguf-cache - HF_HUB_ENABLE_HF_TRANSFER=1 \ - hf download "$GGUF_REPO" "$GGUF_FILE" --local-dir gguf-cache - - - name: Install Studio (--local, --no-torch keeps the install lean) - run: | - mkdir -p logs - set -o pipefail - bash install.sh --local --no-torch 2>&1 | tee logs/install.log - - - name: Assert llama.cpp prebuilt was installed (no source-build fallback) - # ubuntu-latest is CPU-only x86_64, so studio/setup.sh should route - # to ggml-org/llama.cpp and grab bin-ubuntu-x64.tar.gz. A source - # build here means the routing regressed. - run: | - if grep -q "falling back to source build" logs/install.log; then - echo "::error::llama.cpp prebuilt path failed on ubuntu-latest. 
studio/setup.sh routing regressed; CPU-only Linux x86_64 should hit ggml-org/llama.cpp's bin-ubuntu-x64.tar.gz." - grep -E "llama-prebuilt|llama.cpp" logs/install.log | tail -60 - exit 1 - fi - if ! grep -qE "prebuilt installed and validated|prebuilt up to date and validated" logs/install.log; then - echo "::error::install.log does not contain the success marker for the llama.cpp prebuilt path. Did setup.sh skip the prebuilt install?" - grep -E "llama-prebuilt|llama.cpp" logs/install.log | tail -60 - exit 1 - fi - echo "llama.cpp prebuilt path used successfully" - - - name: Reset auth + start Studio in the background - run: | - unsloth studio reset-password - mkdir -p logs - UNSLOTH_API_ONLY=1 unsloth studio -H 127.0.0.1 -p "$STUDIO_PORT" \ - > logs/studio.log 2>&1 & - echo "STUDIO_PID=$!" >> "$GITHUB_ENV" - - - name: Wait for /api/health - run: | - for i in $(seq 1 60); do - if curl -fs "http://127.0.0.1:${STUDIO_PORT}/api/health" > /tmp/health.json; then - echo "ready after ${i}s" - cat /tmp/health.json - jq -e '.status == "healthy"' /tmp/health.json - exit 0 - fi - sleep 1 - done - echo "Studio did not become healthy in 60s" - tail -200 logs/studio.log - exit 1 - - - name: Login + change bootstrap password - run: | - PW=$(cat ~/.unsloth/studio/auth/.bootstrap_password) - NEW="CIPasswordSmoke12345!" - TOKEN=$(curl -fs -X POST "http://127.0.0.1:${STUDIO_PORT}/api/auth/login" \ - -H 'content-type: application/json' \ - -d "{\"username\":\"unsloth\",\"password\":\"$PW\"}" | jq -r .access_token) - curl -fs -X POST "http://127.0.0.1:${STUDIO_PORT}/api/auth/change-password" \ - -H "Authorization: Bearer $TOKEN" -H 'content-type: application/json' \ - -d "{\"current_password\":\"$PW\",\"new_password\":\"$NEW\"}" > /dev/null - # Re-login to clear must_change_password flag. 
- NEW_TOKEN=$(curl -fs -X POST "http://127.0.0.1:${STUDIO_PORT}/api/auth/login" \ - -H 'content-type: application/json' \ - -d "{\"username\":\"unsloth\",\"password\":\"$NEW\"}" | jq -r .access_token) - echo "TOKEN=$NEW_TOKEN" >> "$GITHUB_ENV" - - - name: Load the GGUF into Studio - run: | - GGUF_PATH="$GITHUB_WORKSPACE/gguf-cache/${GGUF_FILE}" - ls -lh "$GGUF_PATH" - curl -fs -X POST "http://127.0.0.1:${STUDIO_PORT}/api/inference/load" \ - -H "Authorization: Bearer $TOKEN" -H 'content-type: application/json' \ - --max-time 600 \ - -d "{\"model_path\":\"$GGUF_PATH\",\"is_lora\":false,\"max_seq_length\":2048}" \ - | jq '{status, display_name, is_gguf, context_length}' - - - name: Send a chat completion + assert non-empty response - run: | - RESP=$(curl -fs -X POST "http://127.0.0.1:${STUDIO_PORT}/api/inference/chat/completions" \ - -H "Authorization: Bearer $TOKEN" -H 'content-type: application/json' \ - --max-time 900 \ - -d '{ - "messages":[{"role":"user","content":"Say hello in one short sentence."}], - "max_tokens":40, - "stream":false - }') - echo "raw response: $RESP" - CONTENT=$(echo "$RESP" | jq -r '.choices[0].message.content // empty') - echo "model response: $CONTENT" - if [ -z "$CONTENT" ]; then - echo "::error::Empty assistant response from Studio" - exit 1 - fi - - - name: Stop Studio - if: always() - run: | - kill "${STUDIO_PID}" || true - sleep 2 - ss -tln | grep ":${STUDIO_PORT}" || true - - - name: Upload Studio + install logs on failure - if: failure() - uses: actions/upload-artifact@v4 - with: - name: studio-inference-log - path: | - logs/studio.log - logs/install.log - retention-days: 7 diff --git a/.github/workflows/studio-tauri-smoke.yml b/.github/workflows/studio-tauri-smoke.yml deleted file mode 100644 index fcc9c8d963..0000000000 --- a/.github/workflows/studio-tauri-smoke.yml +++ /dev/null @@ -1,105 +0,0 @@ -# SPDX-License-Identifier: AGPL-3.0-only -# Copyright 2026-present the Unsloth AI Inc. team. All rights reserved. 
- -# PR-time smoke for the Tauri desktop wrapper. Builds the frontend and the -# Tauri Linux debug binary, with no codesigning. Catches: -# - tauri.conf.json drift -# - src-tauri Cargo.toml or rust source breakage -# - Tauri CLI version drift (we pin 2.10.1, matching release-desktop.yml) -# - frontend output not picked up by Tauri's distDir -# -# Linux-only on a free `ubuntu-latest` runner. Mac and Windows desktop builds -# stay in release-desktop.yml (manual `workflow_dispatch`) because they need -# code-signing secrets and ~30 min of runner time each. - -name: Studio Tauri CI - -on: - pull_request: - paths: - - 'studio/frontend/**' - - 'studio/src-tauri/**' - - '.github/workflows/studio-tauri-smoke.yml' - push: - branches: [main, pip] - -concurrency: - group: ${{ github.workflow }}-${{ github.ref }} - cancel-in-progress: true - -jobs: - linux-debug-build: - name: Tauri Linux debug build (no codesign) - runs-on: ubuntu-22.04 - timeout-minutes: 25 - steps: - - uses: actions/checkout@v4 - - - name: Linux native deps for Tauri / WebKit2GTK - run: | - sudo apt-get update - sudo apt-get install -y \ - libwebkit2gtk-4.1-dev libayatana-appindicator3-dev \ - librsvg2-dev libxdo-dev libssl-dev patchelf - - - uses: actions/setup-node@v4 - with: - node-version: '24' - cache: 'npm' - cache-dependency-path: studio/frontend/package-lock.json - - - uses: dtolnay/rust-toolchain@stable - - - uses: swatinem/rust-cache@v2 - with: - workspaces: studio/src-tauri -> target - - - name: Install pinned Tauri CLI (matches release-desktop.yml) - run: npm install --save-dev --prefix studio @tauri-apps/cli@2.10.1 - - - name: Verify pinned Tauri CLI version - run: | - out="$(npx --prefix studio tauri --version)" - echo "$out" - [ "$out" = "tauri-cli 2.10.1" ] || { echo "::error::expected tauri-cli 2.10.1, got $out"; exit 1; } - - - name: Frontend build (npm ci, vite) - working-directory: studio/frontend - run: | - npm ci --no-fund --no-audit - npm run build - test -f dist/index.html - - - 
name: Tauri debug build (Linux, no bundle, no codesign) - # `--debug` + `--no-bundle` keeps this lean: compiles the Rust crate, - # confirms the frontend dist is wired into Tauri, but skips the AppImage - # / .deb production. Code signing is irrelevant because we never produce - # a distributable artifact. - env: - TAURI_SIGNING_PRIVATE_KEY: '' - TAURI_SIGNING_PRIVATE_KEY_PASSWORD: '' - run: npx --prefix studio tauri build --debug --no-bundle - - - name: Inspect produced binary - run: | - BIN=$(find studio/src-tauri/target/debug -maxdepth 1 -type f -executable 2>/dev/null \ - | grep -Ev '\.(d|so|dylib|dll)$' \ - | grep -Ev '/(deps|build|examples)$' \ - | head -1) - echo "binary: $BIN" - if [ -z "$BIN" ]; then - echo "::error::Tauri debug binary not produced" - ls -la studio/src-tauri/target/debug/ || true - exit 1 - fi - file "$BIN" - du -h "$BIN" - - - uses: actions/upload-artifact@v4 - if: failure() - with: - name: tauri-debug-build - path: | - studio/src-tauri/target/debug - studio/frontend/dist - retention-days: 3 diff --git a/.github/workflows/wheel-smoke.yml b/.github/workflows/wheel-smoke.yml deleted file mode 100644 index 080a6bb261..0000000000 --- a/.github/workflows/wheel-smoke.yml +++ /dev/null @@ -1,124 +0,0 @@ -# SPDX-License-Identifier: AGPL-3.0-only -# Copyright 2026-present the Unsloth AI Inc. team. All rights reserved. - -# Builds the PyPI wheel from the PR branch, then verifies the built wheel -# actually contains what we expect to ship and does NOT contain the broken -# Studio bundle that 2026.5.1 published. This is the single workflow that -# would have blocked the 2026.5.1 release before twine upload. -# -# Verified locally end-to-end against this branch: -# - python -m build produces unsloth--py3-none-any.whl in 13s -# - wheel content sanity passes: -# lockfile shipped, frontend dist shipped, -# no node_modules in wheel, no bun.lock in wheel, -# main bundle has unstable_Provider hits=1 (assistant-ui internals only). 
-# - Studio backend imports cleanly from the installed wheel with the -# lightweight dep set below. - -name: Wheel CI - -on: - pull_request: - paths: - - 'pyproject.toml' - - 'studio/**' - - 'unsloth/**' - - 'unsloth_cli/**' - - '.github/workflows/wheel-smoke.yml' - push: - branches: [main, pip] - -concurrency: - group: ${{ github.workflow }}-${{ github.ref }} - cancel-in-progress: true - -jobs: - wheel: - name: Wheel build + content sanity + import smoke - runs-on: ubuntu-latest - timeout-minutes: 15 - steps: - - uses: actions/checkout@v4 - - - uses: actions/setup-node@v4 - with: - node-version: '22' - cache: 'npm' - cache-dependency-path: studio/frontend/package-lock.json - - - uses: actions/setup-python@v5 - with: - python-version: '3.12' - - - name: Build frontend - run: | - cd studio/frontend - npm ci --no-fund --no-audit - npm run build - - - name: Build wheel + sdist - run: | - python -m pip install --upgrade pip build - rm -rf dist build ./*.egg-info - python -m build - - - name: Wheel content sanity - run: | - python - <<'PY' - import zipfile, glob, sys - w = glob.glob("dist/unsloth-*.whl") - if not w: - print("FAIL: no wheel produced"); sys.exit(2) - w = w[0] - print(f"wheel: {w}") - with zipfile.ZipFile(w) as z: - n = z.namelist() - checks = { - "lockfile shipped": any(s.endswith("studio/frontend/package-lock.json") for s in n), - "frontend dist shipped": any(s.endswith("studio/frontend/dist/index.html") for s in n), - "no node_modules": not any("studio/frontend/node_modules/" in s for s in n), - "no bun.lock": not any(s.endswith("studio/frontend/bun.lock") for s in n), - } - js = [s for s in n - if "studio/frontend/dist/assets/" in s - and s.endswith(".js") - and "/index-" in s] - if not js: - print("FAIL: no main bundle index-*.js in wheel"); sys.exit(2) - data = z.read(js[0]).decode("utf-8", "replace") - hits = data.count("unstable_Provider:") - print(f"main bundle: {js[0]}") - print(f"unstable_Provider hits: {hits} (>=4 indicates 2026.5.1 
regression)") - checks["bundle has no Studio unstable_Provider call site"] = (hits < 4) - - print() - for k, v in checks.items(): - print(f" [{'PASS' if v else 'FAIL'}] {k}") - sys.exit(0 if all(checks.values()) else 1) - PY - - - name: Studio backend import smoke - # Imports `studio.backend.main:app` from the freshly-installed wheel in - # a clean venv. This catches the class of bug that 2026.5.1 shipped with: - # frontend dist missing, package-lock.json missing, or the wheel's Python - # source tree broken in a way that surfaces only at app construction time. - run: | - python -m venv /tmp/v - /tmp/v/bin/pip install --upgrade pip - /tmp/v/bin/pip install -r studio/backend/requirements/studio.txt - /tmp/v/bin/pip install \ - python-multipart aiofiles sqlalchemy cryptography \ - pyyaml jinja2 mammoth unpdf requests \ - 'numpy<3' - /tmp/v/bin/pip install --no-deps dist/unsloth-*.whl - # Run from /tmp so Python imports the installed package, not the source tree. - cd /tmp - /tmp/v/bin/python -c "from studio.backend.main import app; print('Studio backend OK:', app.title)" - - - name: Upload wheel on failure - if: failure() - uses: actions/upload-artifact@v4 - with: - name: unsloth-wheel - path: dist/ - retention-days: 7