diff --git a/.github/workflows/studio-backend-ci.yml b/.github/workflows/studio-backend-ci.yml
deleted file mode 100644
index 5a858888e7..0000000000
--- a/.github/workflows/studio-backend-ci.yml
+++ /dev/null
@@ -1,200 +0,0 @@
-# SPDX-License-Identifier: AGPL-3.0-only
-# Copyright 2026-present the Unsloth AI Inc. team. All rights reserved.
-
-# Runs the existing studio/backend/tests/ suite (~860 tests, all CPU-friendly)
-# on every PR that touches the backend or unsloth library. Until this lands,
-# none of those tests run automatically. Verified locally on Python 3.13 with
-# the surgical exclusions below: 861 pass, 4 skipped.
-#
-# Exclusions:
-#   - tests/test_studio_api.py: end-to-end against a live model + GGUF download,
-#     too heavy for free runners. Run separately when GPU CI is available.
-#   - -k 'not llama_cpp_load_progress_live': spawns a real llama.cpp process,
-#     not appropriate for CPU-only runners.
-#
-# ruff is non-blocking initially; remove `|| true` once the backend lints clean.
-
-name: Backend CI
-
-on:
-  pull_request:
-    paths:
-      - 'studio/**'
-      - 'unsloth/**'
-      - 'unsloth_cli/**'
-      - 'tests/**'
-      - 'pyproject.toml'
-      - '.github/workflows/studio-backend-ci.yml'
-  push:
-    branches: [main, pip]
-
-concurrency:
-  group: ${{ github.workflow }}-${{ github.ref }}
-  cancel-in-progress: true
-
-jobs:
-  pytest:
-    name: (Python ${{ matrix.python }})
-    runs-on: ubuntu-latest
-    timeout-minutes: 15
-    strategy:
-      fail-fast: false
-      matrix:
-        python: ['3.10', '3.11', '3.12', '3.13']
-    steps:
-      - uses: actions/checkout@v4
-
-      - uses: actions/setup-python@v5
-        with:
-          python-version: '${{ matrix.python }}'
-          cache: 'pip'
-
-      - name: Install backend test dependencies (CPU only)
-        run: |
-          python -m pip install --upgrade pip
-          # Studio's declared backend deps:
-          pip install -r studio/backend/requirements/studio.txt
-          # Extras that studio.txt does not list but the import chain needs
-          # (python-multipart for FastAPI form/file uploads, sqlalchemy/cryptography
-          #  for the auth DB, yaml/jinja2 for utils.models.model_config, etc.):
-          pip install \
-            python-multipart aiofiles sqlalchemy cryptography \
-            pyyaml jinja2 mammoth unpdf requests \
-            'numpy<3' pytest pytest-asyncio httpx
-          # Torch CPU + transformers are required by a chunk of the backend test
-          # suite (gpu_selection, kv_cache_estimation, utils). CPU-only torch
-          # keeps the install ~250 MB / ~1 min on a clean runner.
-          pip install --index-url https://download.pytorch.org/whl/cpu 'torch>=2.4,<2.11'
-          pip install 'transformers>=4.51,<5.5'
-
-      - name: Backend tests
-        working-directory: studio/backend
-        # Locally validated against this dep set: 831 passed, 5 skipped, 35 deselected.
-        # Deselections (all environment-specific, would never pass on a GPU-less
-        # `ubuntu-latest` runner regardless of code correctness):
-        #   - llama_cpp_load_progress_live: spawns a real llama.cpp process
-        #   - TestGpuAutoSelection / TestPreSpawnGpuResolution / TestPerGpuFitGuardAllCounts:
-        #       require live transformers config introspection on real GPUs
-        #   - TestTransformersIntrospection: same
-        #   - test_returns_cuda_when_cuda_available / test_calls_cuda_cache_when_cuda:
-        #       assume CUDA-capable GPU
-        run: |
-          python -m pytest tests/ -q --tb=short \
-            --ignore=tests/test_studio_api.py \
-            -k 'not llama_cpp_load_progress_live and not TestGpuAutoSelection and not TestPreSpawnGpuResolution and not TestPerGpuFitGuardAllCounts and not TestTransformersIntrospection and not test_returns_cuda_when_cuda_available and not test_calls_cuda_cache_when_cuda'
-
-  repo-cpu-tests:
-    # Auto-discover everything under tests/ that is not GPU-bound by
-    # design. New tests added in covered directories are picked up
-    # without a workflow edit. Locally validated: 779 passed, 11
-    # skipped, 23 deselected. tests/conftest.py (mirroring unsloth-zoo
-    # PR #624) pre-loads unsloth_zoo.device_type and unsloth.device_type
-    # under a mocked torch.cuda.is_available so the unsloth import
-    # chain succeeds on CPU.
-    name: Repo tests (CPU)
-    runs-on: ubuntu-latest
-    timeout-minutes: 10
-    steps:
-      - uses: actions/checkout@v4
-
-      - uses: actions/setup-python@v5
-        with:
-          python-version: '3.12'
-          cache: 'pip'
-
-      - name: Install deps (shared shape with backend pytest job)
-        run: |
-          python -m pip install --upgrade pip
-          pip install -r studio/backend/requirements/studio.txt
-          pip install \
-            python-multipart aiofiles sqlalchemy cryptography \
-            pyyaml jinja2 mammoth unpdf requests typer \
-            'numpy<3' pytest pytest-asyncio httpx
-          # torchvision is needed because unsloth_zoo.vision_utils imports
-          # it at module scope and is reached via unsloth.models._utils.
-          pip install --index-url https://download.pytorch.org/whl/cpu \
-            'torch>=2.4,<2.11' 'torchvision<0.26'
-          pip install 'transformers>=4.51,<5.5'
-          # bitsandbytes is a hard import in unsloth/models/_utils.py.
-          # Recent versions ship a CPU build so it installs on a free
-          # Linux runner; the kernels still raise on use, but import
-          # succeeds and the package collects.
-          pip install 'bitsandbytes>=0.45'
-          # unsloth.device_type imports unsloth_zoo.utils.Version at module
-          # scope, so the conftest harness needs unsloth_zoo on the path
-          # even though it is an optional dep of unsloth.
-          pip install 'unsloth_zoo>=2026.5.1'
-          pip install -e . --no-deps
-
-      - name: Repo tests (CPU, auto-discovered)
-        env:
-          # tests/python/* import install_python_stack from studio/.
-          PYTHONPATH: ${{ github.workspace }}/studio
-          # Skip lazy compilation work the unsloth import chain wants to
-          # do at import time on a real GPU.
-          UNSLOTH_COMPILE_DISABLE: '1'
-        # --ignore: GPU-bound directories (qlora and saving need real
-        #   weights / GPU; tests/sh is a shell suite the next step
-        #   handles; tests/utils is a helpers folder, not tests).
-        # State-sensitive hardware-spoofing files are pulled out and run
-        # in isolation in the next step because they mutate
-        # hardware.py module globals (IS_ROCM / DEVICE) and pollute
-        # downstream tests.
-        # -m: honour markers already declared in tests/python/conftest.py
-        #   (`server` = needs studio venv, `e2e` = needs network).
-        # --deselect: two registry tests that hit huggingface_hub for
-        #   live model existence checks; they belong on a network job.
-        run: |
-          python -m pytest tests/ -q --tb=short \
-            --ignore=tests/qlora \
-            --ignore=tests/saving \
-            --ignore=tests/utils \
-            --ignore=tests/sh \
-            --ignore=tests/studio/test_hardware_dispatch_matrix.py \
-            --ignore=tests/studio/test_is_mlx_dispatch_gate.py \
-            -m 'not server and not e2e' \
-            --deselect tests/test_model_registry.py::test_model_registration \
-            --deselect tests/test_model_registry.py::test_all_model_registration
-
-      - name: Hardware-spoof tests (state-sensitive, run in isolation)
-        env:
-          PYTHONPATH: ${{ github.workspace }}/studio
-          UNSLOTH_COMPILE_DISABLE: '1'
-        # These two files mutate hardware.py module globals at runtime
-        # via the spoof fixtures, which leaks state into any other test
-        # that imports hardware. Run them in their own pytest invocation
-        # so the leak does not cross file boundaries.
-        run: |
-          python -m pytest -q --tb=short \
-            tests/studio/test_hardware_dispatch_matrix.py \
-            tests/studio/test_is_mlx_dispatch_gate.py
-
-      - name: Shell installer tests
-        # Subset that does not depend on a writable / pristine install.sh
-        # tree; test_install_host_defaults.sh checks install.ps1 layout
-        # which has drifted (separate followup).
-        run: |
-          set -e
-          for s in \
-              tests/sh/test_get_torch_index_url.sh \
-              tests/sh/test_mac_intel_compat.sh \
-              tests/sh/test_tauri_install_exit_order.sh \
-              tests/sh/test_torch_constraint.sh; do
-              echo "::group::$s"
-              bash "$s"
-              echo "::endgroup::"
-          done
-
-  ruff:
-    name: Backend ruff lint (non-blocking)
-    runs-on: ubuntu-latest
-    timeout-minutes: 5
-    steps:
-      - uses: actions/checkout@v4
-      - uses: actions/setup-python@v5
-        with:
-          python-version: '3.12'
-          cache: 'pip'
-      - run: pip install ruff
-      - name: ruff check (non-blocking until accumulated drift is cleared)
-        run: ruff check studio/backend || true
diff --git a/.github/workflows/studio-frontend-ci.yml b/.github/workflows/studio-frontend-ci.yml
deleted file mode 100644
index 039bd5dd08..0000000000
--- a/.github/workflows/studio-frontend-ci.yml
+++ /dev/null
@@ -1,108 +0,0 @@
-# SPDX-License-Identifier: AGPL-3.0-only
-# Copyright 2026-present the Unsloth AI Inc. team. All rights reserved.
-
-# Frontend PR gate: lockfile freshness, typecheck, build, and a bundle grep
-# that catches the 2026.5.1 chat-history regression at the JS level.
-#
-# biome runs as non-blocking for now: the codebase currently has accumulated
-# ~470 errors and ~1650 warnings against the existing biome config. Surfacing
-# the count in CI lets us drive it down without forcing a fleet-wide cleanup
-# in the same PR. Drop `continue-on-error` once that number is zero.
-
-name: Frontend CI
-
-on:
-  pull_request:
-    paths:
-      - 'studio/frontend/**'
-      - '.github/workflows/studio-frontend-ci.yml'
-  push:
-    branches: [main, pip]
-
-concurrency:
-  group: ${{ github.workflow }}-${{ github.ref }}
-  cancel-in-progress: true
-
-jobs:
-  build:
-    name: Frontend build + bundle sanity
-    runs-on: ubuntu-latest
-    timeout-minutes: 10
-    defaults:
-      run:
-        working-directory: studio/frontend
-    steps:
-      - uses: actions/checkout@v4
-
-      # FIXME: drop this step once @assistant-ui/* and assistant-stream
-      # leave 0.x -- on 1.x, caret ranges are conventional. Until then,
-      # every 0.minor on this surface is a SemVer-major (this is exactly
-      # how 2026.5.1 shipped a broken chat runtime: ^0.12.19 quietly
-      # resolved to 0.12.28).
-      - name: '@assistant-ui must be pinned exactly (no caret/tilde)'
-        working-directory: ${{ github.workspace }}
-        run: |
-          set -e
-          if grep -nE '"(@assistant-ui/[a-z-]+|assistant-stream)":[[:space:]]*"[\^~]' studio/frontend/package.json; then
-            echo "::error file=studio/frontend/package.json::These packages must be pinned to exact versions until they leave 0.x. Drop the leading ^ or ~."
-            exit 1
-          fi
-          echo "All assistant-ui packages are pinned exactly."
-
-      - uses: actions/setup-node@v4
-        with:
-          node-version: '22'
-          cache: 'npm'
-          cache-dependency-path: studio/frontend/package-lock.json
-
-      - name: Lockfile must agree with package.json (npm ci is strict)
-        run: npm ci --no-fund --no-audit
-
-      - name: npm ci must not have modified the working tree
-        working-directory: ${{ github.workspace }}
-        run: |
-          if ! git diff --quiet -- studio/frontend; then
-            echo "::error::npm ci modified files; commit the updated lockfile"
-            git status -- studio/frontend
-            exit 1
-          fi
-
-      - name: Typecheck
-        run: npm run typecheck
-
-      - name: Build
-        run: npm run build
-
-      - name: Built bundle must not contain Studio's unstable_Provider call site
-        run: |
-          set -e
-          JS=$(ls dist/assets/index-*.js | head -1)
-          HITS=$(grep -c 'unstable_Provider:' "$JS" || echo 0)
-          echo "main bundle: $JS"
-          echo "unstable_Provider: hits=$HITS (assistant-ui internals contribute up to 3)"
-          if [ "$HITS" -gt 3 ]; then
-            echo "::error file=studio/frontend/src/features/chat/runtime-provider.tsx::Studio bundle still passes unstable_Provider through useRemoteThreadListRuntime; this is the 2026.5.1 chat-history regression. Pass adapters directly into useLocalRuntime instead."
-            exit 1
-          fi
-
-      - name: Bundle size budget (75 MB)
-        run: |
-          SIZE=$(du -sb dist | cut -f1)
-          BUDGET=$((75 * 1024 * 1024))
-          echo "dist size: $SIZE bytes ($((SIZE/1024/1024)) MB), budget: $BUDGET bytes (75 MB)"
-          if [ "$SIZE" -gt "$BUDGET" ]; then
-            echo "::error::studio/frontend/dist/ exceeded the 75 MB budget. Drop dead deps (e.g. the unused next dep) or split chunks."
-            exit 1
-          fi
-
-      - name: Biome (non-blocking until accumulated drift is cleared)
-        continue-on-error: true
-        run: npm run biome:check
-
-      - name: Upload built dist on failure
-        if: failure()
-        uses: actions/upload-artifact@v4
-        with:
-          name: studio-frontend-dist
-          path: studio/frontend/dist
-          retention-days: 3
diff --git a/.github/workflows/studio-inference-smoke.yml b/.github/workflows/studio-inference-smoke.yml
deleted file mode 100644
index 8efe072d28..0000000000
--- a/.github/workflows/studio-inference-smoke.yml
+++ /dev/null
@@ -1,185 +0,0 @@
-# SPDX-License-Identifier: AGPL-3.0-only
-# Copyright 2026-present the Unsloth AI Inc. team. All rights reserved.
-
-# End-to-end smoke: install Studio via install.sh --local --no-torch, download
-# a tiny GGUF, boot Studio, log in, change password, load the model, send a
-# chat completion, assert a non-empty response. Only workflow that tests "the
-# app actually works".
-#
-# Model: Qwen3.5-2B UD-IQ3_XXS (~890 MiB) -- small enough that the cache miss
-# is cheap and inference fits in the 25 min CPU-runner budget. GGUF is cached
-# across runs via actions/cache.
-
-name: Studio GGUF CI
-
-on:
-  pull_request:
-    paths:
-      - 'studio/**'
-      - 'unsloth/**'
-      - 'unsloth_cli/**'
-      - 'install.sh'
-      - 'pyproject.toml'
-      - '.github/workflows/studio-inference-smoke.yml'
-  push:
-    branches: [main, pip]
-  # Manual trigger for pre-warming the GGUF cache on main, or re-running
-  # against an arbitrary branch without pushing a no-op commit.
-  workflow_dispatch:
-
-concurrency:
-  group: ${{ github.workflow }}-${{ github.ref }}
-  cancel-in-progress: true
-
-env:
-  GGUF_REPO: unsloth/Qwen3.5-2B-GGUF
-  GGUF_FILE: Qwen3.5-2B-UD-IQ3_XXS.gguf
-  STUDIO_PORT: '18888'
-
-jobs:
-  inference:
-    name: Studio boots, loads a GGUF, answers a chat completion
-    runs-on: ubuntu-latest
-    timeout-minutes: 25
-    steps:
-      - uses: actions/checkout@v4
-
-      - name: Linux dependencies for llama.cpp prebuilt
-        run: |
-          sudo apt-get update
-          sudo apt-get install -y --no-install-recommends \
-            libcurl4-openssl-dev libssl-dev jq
-
-      - uses: actions/setup-node@v4
-        with:
-          node-version: '22'
-          cache: 'npm'
-          cache-dependency-path: studio/frontend/package-lock.json
-
-      - uses: actions/setup-python@v5
-        with:
-          python-version: '3.12'
-          cache: 'pip'
-
-      - name: Cache GGUF model file
-        id: cache-gguf
-        uses: actions/cache@v4
-        with:
-          path: gguf-cache
-          key: ${{ runner.os }}-gguf-${{ env.GGUF_REPO }}-${{ env.GGUF_FILE }}-v1
-
-      - name: Download GGUF if cache miss
-        if: steps.cache-gguf.outputs.cache-hit != 'true'
-        run: |
-          # huggingface-cli was deprecated in huggingface_hub 1.13; the new CLI is `hf`.
-          python -m pip install --upgrade huggingface_hub hf_transfer
-          mkdir -p gguf-cache
-          HF_HUB_ENABLE_HF_TRANSFER=1 \
-            hf download "$GGUF_REPO" "$GGUF_FILE" --local-dir gguf-cache
-
-      - name: Install Studio (--local, --no-torch keeps the install lean)
-        run: |
-          mkdir -p logs
-          set -o pipefail
-          bash install.sh --local --no-torch 2>&1 | tee logs/install.log
-
-      - name: Assert llama.cpp prebuilt was installed (no source-build fallback)
-        # ubuntu-latest is CPU-only x86_64, so studio/setup.sh should route
-        # to ggml-org/llama.cpp and grab bin-ubuntu-x64.tar.gz. A source
-        # build here means the routing regressed.
-        run: |
-          if grep -q "falling back to source build" logs/install.log; then
-            echo "::error::llama.cpp prebuilt path failed on ubuntu-latest. studio/setup.sh routing regressed; CPU-only Linux x86_64 should hit ggml-org/llama.cpp's bin-ubuntu-x64.tar.gz."
-            grep -E "llama-prebuilt|llama.cpp" logs/install.log | tail -60
-            exit 1
-          fi
-          if ! grep -qE "prebuilt installed and validated|prebuilt up to date and validated" logs/install.log; then
-            echo "::error::install.log does not contain the success marker for the llama.cpp prebuilt path. Did setup.sh skip the prebuilt install?"
-            grep -E "llama-prebuilt|llama.cpp" logs/install.log | tail -60
-            exit 1
-          fi
-          echo "llama.cpp prebuilt path used successfully"
-
-      - name: Reset auth + start Studio in the background
-        run: |
-          unsloth studio reset-password
-          mkdir -p logs
-          UNSLOTH_API_ONLY=1 unsloth studio -H 127.0.0.1 -p "$STUDIO_PORT" \
-            > logs/studio.log 2>&1 &
-          echo "STUDIO_PID=$!" >> "$GITHUB_ENV"
-
-      - name: Wait for /api/health
-        run: |
-          for i in $(seq 1 60); do
-            if curl -fs "http://127.0.0.1:${STUDIO_PORT}/api/health" > /tmp/health.json; then
-              echo "ready after ${i}s"
-              cat /tmp/health.json
-              jq -e '.status == "healthy"' /tmp/health.json
-              exit 0
-            fi
-            sleep 1
-          done
-          echo "Studio did not become healthy in 60s"
-          tail -200 logs/studio.log
-          exit 1
-
-      - name: Login + change bootstrap password
-        run: |
-          PW=$(cat ~/.unsloth/studio/auth/.bootstrap_password)
-          NEW="CIPasswordSmoke12345!"
-          TOKEN=$(curl -fs -X POST "http://127.0.0.1:${STUDIO_PORT}/api/auth/login" \
-            -H 'content-type: application/json' \
-            -d "{\"username\":\"unsloth\",\"password\":\"$PW\"}" | jq -r .access_token)
-          curl -fs -X POST "http://127.0.0.1:${STUDIO_PORT}/api/auth/change-password" \
-            -H "Authorization: Bearer $TOKEN" -H 'content-type: application/json' \
-            -d "{\"current_password\":\"$PW\",\"new_password\":\"$NEW\"}" > /dev/null
-          # Re-login to clear must_change_password flag.
-          NEW_TOKEN=$(curl -fs -X POST "http://127.0.0.1:${STUDIO_PORT}/api/auth/login" \
-            -H 'content-type: application/json' \
-            -d "{\"username\":\"unsloth\",\"password\":\"$NEW\"}" | jq -r .access_token)
-          echo "TOKEN=$NEW_TOKEN" >> "$GITHUB_ENV"
-
-      - name: Load the GGUF into Studio
-        run: |
-          GGUF_PATH="$GITHUB_WORKSPACE/gguf-cache/${GGUF_FILE}"
-          ls -lh "$GGUF_PATH"
-          curl -fs -X POST "http://127.0.0.1:${STUDIO_PORT}/api/inference/load" \
-            -H "Authorization: Bearer $TOKEN" -H 'content-type: application/json' \
-            --max-time 600 \
-            -d "{\"model_path\":\"$GGUF_PATH\",\"is_lora\":false,\"max_seq_length\":2048}" \
-            | jq '{status, display_name, is_gguf, context_length}'
-
-      - name: Send a chat completion + assert non-empty response
-        run: |
-          RESP=$(curl -fs -X POST "http://127.0.0.1:${STUDIO_PORT}/api/inference/chat/completions" \
-            -H "Authorization: Bearer $TOKEN" -H 'content-type: application/json' \
-            --max-time 900 \
-            -d '{
-              "messages":[{"role":"user","content":"Say hello in one short sentence."}],
-              "max_tokens":40,
-              "stream":false
-            }')
-          echo "raw response: $RESP"
-          CONTENT=$(echo "$RESP" | jq -r '.choices[0].message.content // empty')
-          echo "model response: $CONTENT"
-          if [ -z "$CONTENT" ]; then
-            echo "::error::Empty assistant response from Studio"
-            exit 1
-          fi
-
-      - name: Stop Studio
-        if: always()
-        run: |
-          kill "${STUDIO_PID}" || true
-          sleep 2
-          ss -tln | grep ":${STUDIO_PORT}" || true
-
-      - name: Upload Studio + install logs on failure
-        if: failure()
-        uses: actions/upload-artifact@v4
-        with:
-          name: studio-inference-log
-          path: |
-            logs/studio.log
-            logs/install.log
-          retention-days: 7
diff --git a/.github/workflows/studio-tauri-smoke.yml b/.github/workflows/studio-tauri-smoke.yml
deleted file mode 100644
index fcc9c8d963..0000000000
--- a/.github/workflows/studio-tauri-smoke.yml
+++ /dev/null
@@ -1,105 +0,0 @@
-# SPDX-License-Identifier: AGPL-3.0-only
-# Copyright 2026-present the Unsloth AI Inc. team. All rights reserved.
-
-# PR-time smoke for the Tauri desktop wrapper. Builds the frontend and the
-# Tauri Linux debug binary, with no codesigning. Catches:
-#   - tauri.conf.json drift
-#   - src-tauri Cargo.toml or rust source breakage
-#   - Tauri CLI version drift (we pin 2.10.1, matching release-desktop.yml)
-#   - frontend output not picked up by Tauri's distDir
-#
-# Linux-only on a free `ubuntu-latest` runner. Mac and Windows desktop builds
-# stay in release-desktop.yml (manual `workflow_dispatch`) because they need
-# code-signing secrets and ~30 min of runner time each.
-
-name: Studio Tauri CI
-
-on:
-  pull_request:
-    paths:
-      - 'studio/frontend/**'
-      - 'studio/src-tauri/**'
-      - '.github/workflows/studio-tauri-smoke.yml'
-  push:
-    branches: [main, pip]
-
-concurrency:
-  group: ${{ github.workflow }}-${{ github.ref }}
-  cancel-in-progress: true
-
-jobs:
-  linux-debug-build:
-    name: Tauri Linux debug build (no codesign)
-    runs-on: ubuntu-22.04
-    timeout-minutes: 25
-    steps:
-      - uses: actions/checkout@v4
-
-      - name: Linux native deps for Tauri / WebKit2GTK
-        run: |
-          sudo apt-get update
-          sudo apt-get install -y \
-            libwebkit2gtk-4.1-dev libayatana-appindicator3-dev \
-            librsvg2-dev libxdo-dev libssl-dev patchelf
-
-      - uses: actions/setup-node@v4
-        with:
-          node-version: '24'
-          cache: 'npm'
-          cache-dependency-path: studio/frontend/package-lock.json
-
-      - uses: dtolnay/rust-toolchain@stable
-
-      - uses: swatinem/rust-cache@v2
-        with:
-          workspaces: studio/src-tauri -> target
-
-      - name: Install pinned Tauri CLI (matches release-desktop.yml)
-        run: npm install --save-dev --prefix studio @tauri-apps/cli@2.10.1
-
-      - name: Verify pinned Tauri CLI version
-        run: |
-          out="$(npx --prefix studio tauri --version)"
-          echo "$out"
-          [ "$out" = "tauri-cli 2.10.1" ] || { echo "::error::expected tauri-cli 2.10.1, got $out"; exit 1; }
-
-      - name: Frontend build (npm ci, vite)
-        working-directory: studio/frontend
-        run: |
-          npm ci --no-fund --no-audit
-          npm run build
-          test -f dist/index.html
-
-      - name: Tauri debug build (Linux, no bundle, no codesign)
-        # `--debug` + `--no-bundle` keeps this lean: compiles the Rust crate,
-        # confirms the frontend dist is wired into Tauri, but skips the AppImage
-        # / .deb production. Code signing is irrelevant because we never produce
-        # a distributable artifact.
-        env:
-          TAURI_SIGNING_PRIVATE_KEY: ''
-          TAURI_SIGNING_PRIVATE_KEY_PASSWORD: ''
-        run: npx --prefix studio tauri build --debug --no-bundle
-
-      - name: Inspect produced binary
-        run: |
-          BIN=$(find studio/src-tauri/target/debug -maxdepth 1 -type f -executable 2>/dev/null \
-                | grep -Ev '\.(d|so|dylib|dll)$' \
-                | grep -Ev '/(deps|build|examples)$' \
-                | head -1)
-          echo "binary: $BIN"
-          if [ -z "$BIN" ]; then
-            echo "::error::Tauri debug binary not produced"
-            ls -la studio/src-tauri/target/debug/ || true
-            exit 1
-          fi
-          file "$BIN"
-          du -h "$BIN"
-
-      - uses: actions/upload-artifact@v4
-        if: failure()
-        with:
-          name: tauri-debug-build
-          path: |
-            studio/src-tauri/target/debug
-            studio/frontend/dist
-          retention-days: 3
diff --git a/.github/workflows/wheel-smoke.yml b/.github/workflows/wheel-smoke.yml
deleted file mode 100644
index 080a6bb261..0000000000
--- a/.github/workflows/wheel-smoke.yml
+++ /dev/null
@@ -1,124 +0,0 @@
-# SPDX-License-Identifier: AGPL-3.0-only
-# Copyright 2026-present the Unsloth AI Inc. team. All rights reserved.
-
-# Builds the PyPI wheel from the PR branch, then verifies the built wheel
-# actually contains what we expect to ship and does NOT contain the broken
-# Studio bundle that 2026.5.1 published. This is the single workflow that
-# would have blocked the 2026.5.1 release before twine upload.
-#
-# Verified locally end-to-end against this branch:
-#   - python -m build produces unsloth-<version>-py3-none-any.whl in 13s
-#   - wheel content sanity passes:
-#       lockfile shipped, frontend dist shipped,
-#       no node_modules in wheel, no bun.lock in wheel,
-#       main bundle has unstable_Provider hits=1 (assistant-ui internals only).
-#   - Studio backend imports cleanly from the installed wheel with the
-#     lightweight dep set below.
-
-name: Wheel CI
-
-on:
-  pull_request:
-    paths:
-      - 'pyproject.toml'
-      - 'studio/**'
-      - 'unsloth/**'
-      - 'unsloth_cli/**'
-      - '.github/workflows/wheel-smoke.yml'
-  push:
-    branches: [main, pip]
-
-concurrency:
-  group: ${{ github.workflow }}-${{ github.ref }}
-  cancel-in-progress: true
-
-jobs:
-  wheel:
-    name: Wheel build + content sanity + import smoke
-    runs-on: ubuntu-latest
-    timeout-minutes: 15
-    steps:
-      - uses: actions/checkout@v4
-
-      - uses: actions/setup-node@v4
-        with:
-          node-version: '22'
-          cache: 'npm'
-          cache-dependency-path: studio/frontend/package-lock.json
-
-      - uses: actions/setup-python@v5
-        with:
-          python-version: '3.12'
-
-      - name: Build frontend
-        run: |
-          cd studio/frontend
-          npm ci --no-fund --no-audit
-          npm run build
-
-      - name: Build wheel + sdist
-        run: |
-          python -m pip install --upgrade pip build
-          rm -rf dist build ./*.egg-info
-          python -m build
-
-      - name: Wheel content sanity
-        run: |
-          python - <<'PY'
-          import zipfile, glob, sys
-          w = glob.glob("dist/unsloth-*.whl")
-          if not w:
-              print("FAIL: no wheel produced"); sys.exit(2)
-          w = w[0]
-          print(f"wheel: {w}")
-          with zipfile.ZipFile(w) as z:
-              n = z.namelist()
-              checks = {
-                "lockfile shipped":      any(s.endswith("studio/frontend/package-lock.json") for s in n),
-                "frontend dist shipped": any(s.endswith("studio/frontend/dist/index.html")    for s in n),
-                "no node_modules":       not any("studio/frontend/node_modules/" in s for s in n),
-                "no bun.lock":           not any(s.endswith("studio/frontend/bun.lock")       for s in n),
-              }
-              js = [s for s in n
-                    if "studio/frontend/dist/assets/" in s
-                    and s.endswith(".js")
-                    and "/index-" in s]
-              if not js:
-                  print("FAIL: no main bundle index-*.js in wheel"); sys.exit(2)
-              data = z.read(js[0]).decode("utf-8", "replace")
-              hits = data.count("unstable_Provider:")
-              print(f"main bundle: {js[0]}")
-              print(f"unstable_Provider hits: {hits} (>=4 indicates 2026.5.1 regression)")
-              checks["bundle has no Studio unstable_Provider call site"] = (hits < 4)
-
-              print()
-              for k, v in checks.items():
-                  print(f"  [{'PASS' if v else 'FAIL'}] {k}")
-              sys.exit(0 if all(checks.values()) else 1)
-          PY
-
-      - name: Studio backend import smoke
-        # Imports `studio.backend.main:app` from the freshly-installed wheel in
-        # a clean venv. This catches the class of bug that 2026.5.1 shipped with:
-        # frontend dist missing, package-lock.json missing, or the wheel's Python
-        # source tree broken in a way that surfaces only at app construction time.
-        run: |
-          python -m venv /tmp/v
-          /tmp/v/bin/pip install --upgrade pip
-          /tmp/v/bin/pip install -r studio/backend/requirements/studio.txt
-          /tmp/v/bin/pip install \
-            python-multipart aiofiles sqlalchemy cryptography \
-            pyyaml jinja2 mammoth unpdf requests \
-            'numpy<3'
-          /tmp/v/bin/pip install --no-deps dist/unsloth-*.whl
-          # Run from /tmp so Python imports the installed package, not the source tree.
-          cd /tmp
-          /tmp/v/bin/python -c "from studio.backend.main import app; print('Studio backend OK:', app.title)"
-
-      - name: Upload wheel on failure
-        if: failure()
-        uses: actions/upload-artifact@v4
-        with:
-          name: unsloth-wheel
-          path: dist/
-          retention-days: 7
diff --git a/unsloth/models/llama.py b/unsloth/models/llama.py
index d39f2588ef..93dddcc76d 100644
--- a/unsloth/models/llama.py
+++ b/unsloth/models/llama.py
@@ -2065,6 +2065,19 @@ def unsloth_fast_generate(
 ):
     # If the model starts out in training mode, restore training mode after generation
     restore_training_mode = self.training
+    # why: snapshot the actual gradient-checkpointing mode (e.g. "unsloth") before
+    # for_inference clears it, so the post-generate restore preserves the caller's
+    # configured mode rather than collapsing it to a plain bool.
+    use_gradient_checkpointing = next(
+        (
+            v
+            for v in (
+                getattr(m, "gradient_checkpointing", False) for m in self.modules()
+            )
+            if v
+        ),
+        False,
+    )
 
     FastLlamaModel.for_inference(self)
 
@@ -2138,7 +2151,10 @@ def unsloth_fast_generate(
     # pass
 
     if restore_training_mode:
-        FastLlamaModel.for_training(self)
+        FastLlamaModel.for_training(
+            self,
+            use_gradient_checkpointing=use_gradient_checkpointing,
+        )
 
     return output
 
diff --git a/unsloth/models/rl.py b/unsloth/models/rl.py
index ac9b35a822..b00db26438 100755
--- a/unsloth/models/rl.py
+++ b/unsloth/models/rl.py
@@ -186,6 +186,21 @@ def unwrap_model_for_generation(
 
     @contextmanager
     def unsloth_unwrap_model_for_generation(model, *args, **kwargs):
+        # why: snapshot before TRL's unwrap context manager, which calls
+        # gradient_checkpointing_disable() before yielding; preserve the actual
+        # gradient-checkpointing mode (e.g. "unsloth") rather than collapsing it to
+        # a bool, so the finally restore matches the caller's configured mode.
+        use_gradient_checkpointing = next(
+            (
+                v
+                for v in (
+                    getattr(m, "gradient_checkpointing", False)
+                    for m in model.modules()
+                )
+                if v
+            ),
+            False,
+        )
         with unwrap_model_for_generation(model, *args, **kwargs) as unwrapped_model:
             # Put the model in inference mode.
             FastLanguageModel.for_inference(model)
@@ -207,7 +222,10 @@ def generate_with_clone(*args, **kwargs):
             finally:
                 # Restore generate and return
                 unwrapped_model.generate = original_generate
-                FastLanguageModel.for_training(model)
+                FastLanguageModel.for_training(
+                    model,
+                    use_gradient_checkpointing=use_gradient_checkpointing,
+                )
 
     from transformers import Trainer
     from transformers.trainer_pt_utils import nested_detach
diff --git a/unsloth/models/rl_replacements.py b/unsloth/models/rl_replacements.py
index 0f10847282..9ffe40b555 100755
--- a/unsloth/models/rl_replacements.py
+++ b/unsloth/models/rl_replacements.py
@@ -650,7 +650,7 @@ def grpo_trainer__generate_and_score_completions(function_name, function):
                 # Left pad prompt before calculation old and ref hidden states
                 left_pad_tokens_per_prompt = calculate_pad_tokens_in_prompt(prompt_completion_ids, logits_to_keep, self.processing_class.pad_token_id)
                 max_left_pad = torch.max(left_pad_tokens_per_prompt).item()
-        self.model.for_training()"""
+        self.model.for_training(use_gradient_checkpointing=getattr(self.args, 'gradient_checkpointing', True))"""
 
     function = function.replace(line_to_replace, replacement_lines)