Skip to content
Draft
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
108 changes: 108 additions & 0 deletions studio/backend/core/export/export.py
Original file line number Diff line number Diff line change
Expand Up @@ -685,6 +685,114 @@ def export_lora_adapter(
logger.error(traceback.format_exc())
return False, f"Adapter export failed: {str(e)}"

def export_autoround_4bit(
    self,
    save_directory: str,
    export_format: str = "auto_awq",
    bits: int = 4,
    group_size: int = 128,
    iters: int = 200,
    nsamples: int = 128,
    dataset: str = "NeelNanda/pile-10k",
    push_to_hub: bool = False,
    repo_id: Optional[str] = None,
    hf_token: Optional[str] = None,
    private: bool = False,
) -> Tuple[bool, str]:
    """Export model to 4-bit quantized format using Auto-Round.

    Produces an AWQ, GPTQ, or native Auto-Round quantized model that can
    be loaded directly by vLLM with --quantization awq / gptq.

    Args:
        save_directory: Local output directory.
        export_format: ``"auto_awq"``, ``"auto_gptq"``, or ``"auto_round"``.
        bits: Quantisation bitwidth (default 4).
        group_size: Quantisation group size (default 128).
        iters: Auto-Round calibration iterations (0 = fast RTN).
        nsamples: Number of calibration samples.
        dataset: HF dataset used for calibration.
        push_to_hub: Upload the result to Hugging Face Hub.
        repo_id: Hub repo ``"username/model-name"``.
        hf_token: HF access token.
        private: Create a private Hub repository.

    Returns:
        Tuple of (success: bool, message: str)
    """
    if not self.current_model or not self.current_tokenizer:
        return False, "No model loaded. Please select a checkpoint first."

    try:
        from unsloth.save import save_to_autoround_4bit
    except ImportError:
        # Fallback: in dev/editable checkouts the package may live at the
        # repository root rather than on sys.path -- add it and retry once.
        try:
            import sys

            repo_root = str(Path(__file__).resolve().parents[4])
            # Guard against growing sys.path on every failed call.
            if repo_root not in sys.path:
                sys.path.insert(0, repo_root)
            from unsloth.save import save_to_autoround_4bit
        except ImportError as exc:
            return False, (
                f"Could not import save_to_autoround_4bit: {exc}. "
                "Ensure the unsloth package is installed."
            )

    try:
        import tempfile

        # Resolve output directory upfront: use the requested path, or a
        # temporary directory when only pushing to hub (no local save).
        _tmp_dir = None
        if save_directory:
            out_dir = str(resolve_export_dir(save_directory))
            ensure_dir(Path(out_dir))
        elif push_to_hub:
            _tmp_dir = tempfile.TemporaryDirectory(prefix = "_unsloth_autoround4bit_")
            out_dir = _tmp_dir.name
        else:
            return False, "Either save_directory or push_to_hub must be specified."

        try:
            logger.info(f"Saving Auto-Round 4-bit model to: {out_dir}")

            save_to_autoround_4bit(
                model_or_path = self.current_model,
                tokenizer = self.current_tokenizer,
                output_dir = out_dir,
                export_format = export_format,
                bits = bits,
                group_size = group_size,
                iters = iters,
                nsamples = nsamples,
                dataset = dataset,
                push_to_hub = push_to_hub,
                repo_id = repo_id,
                token = hf_token,
                private = private,
            )

            if save_directory:
                self._write_export_metadata(out_dir)
                logger.info(f"Auto-Round 4-bit model saved to {out_dir}")
        finally:
            # Remove the hub-only scratch directory even when quantization
            # raised; TemporaryDirectory.cleanup() is the supported API
            # (previously manual __enter__/__exit__ calls were used).
            if _tmp_dir is not None:
                _tmp_dir.cleanup()

        fmt_label = {
            "auto_awq": "AWQ",
            "auto_gptq": "GPTQ",
            "auto_round": "Auto-Round",
        }.get(export_format, export_format.upper())
        return True, f"Auto-Round 4-bit ({fmt_label}) model exported successfully"

    except Exception as e:
        logger.error(f"Error exporting Auto-Round 4-bit model: {e}")
        import traceback

        logger.error(traceback.format_exc())
        return False, f"Auto-Round 4-bit export failed: {str(e)}"


# Global export backend instance; None until first initialized.
_export_backend = None
Expand Down
40 changes: 40 additions & 0 deletions studio/backend/core/export/orchestrator.py
Original file line number Diff line number Diff line change
Expand Up @@ -364,6 +364,46 @@ def _run_export(self, export_type: str, params: dict) -> Tuple[bool, str]:
except RuntimeError as exc:
return False, str(exc)

def export_autoround_4bit(
    self,
    save_directory: str,
    export_format: str = "auto_awq",
    bits: int = 4,
    group_size: int = 128,
    iters: int = 200,
    nsamples: int = 128,
    dataset: str = "NeelNanda/pile-10k",
    push_to_hub: bool = False,
    repo_id: Optional[str] = None,
    hf_token: Optional[str] = None,
    private: bool = False,
) -> Tuple[bool, str]:
    """Forward an Auto-Round 4-bit export request to the worker subprocess.

    Sends an ``autoround4bit`` export command and blocks (up to two hours)
    for the ``export_autoround4bit_done`` response.

    Returns:
        Tuple of (success: bool, message: str) mirroring the backend result.
    """
    if not self._ensure_subprocess_alive():
        return False, "Export subprocess is not running"

    self._send_cmd(
        {
            "type": "export",
            "export_type": "autoround4bit",
            "save_directory": save_directory,
            "export_format": export_format,
            "bits": bits,
            "group_size": group_size,
            "iters": iters,
            "nsamples": nsamples,
            "dataset": dataset,
            "push_to_hub": push_to_hub,
            "repo_id": repo_id,
            "hf_token": hf_token,
            "private": private,
        }
    )

    try:
        resp = self._wait_response("export_autoround4bit_done", timeout = 7200.0)
    except RuntimeError as exc:
        return False, str(exc)
    return resp.get("success", False), resp.get("message", "")

def cleanup_memory(self) -> bool:
"""Cleanup export-related models from memory."""
if not self._ensure_subprocess_alive():
Expand Down
14 changes: 14 additions & 0 deletions studio/backend/core/export/worker.py
Original file line number Diff line number Diff line change
Expand Up @@ -152,6 +152,20 @@ def _handle_export(backend, cmd: dict, resp_queue: Any) -> None:
hf_token = cmd.get("hf_token"),
private = cmd.get("private", False),
)
elif export_type == "autoround4bit":
success, message = backend.export_autoround_4bit(
save_directory = cmd.get("save_directory", ""),
export_format = cmd.get("export_format", "auto_awq"),
bits = cmd.get("bits", 4),
group_size = cmd.get("group_size", 128),
iters = cmd.get("iters", 200),
nsamples = cmd.get("nsamples", 128),
dataset = cmd.get("dataset", "NeelNanda/pile-10k"),
push_to_hub = cmd.get("push_to_hub", False),
repo_id = cmd.get("repo_id"),
hf_token = cmd.get("hf_token"),
private = cmd.get("private", False),
)
else:
success, message = False, f"Unknown export type: {export_type}"

Expand Down
2 changes: 2 additions & 0 deletions studio/backend/models/__init__.py
Original file line number Diff line number Diff line change
Expand Up @@ -41,6 +41,7 @@
ExportBaseModelRequest,
ExportGGUFRequest,
ExportLoRAAdapterRequest,
ExportAutoRound4bitRequest,
)
from .users import Token
from .datasets import (
Expand Down Expand Up @@ -104,6 +105,7 @@
"ExportBaseModelRequest",
"ExportGGUFRequest",
"ExportLoRAAdapterRequest",
"ExportAutoRound4bitRequest",
"Token",
# Dataset schemas
"CheckFormatRequest",
Expand Down
17 changes: 17 additions & 0 deletions studio/backend/models/export.py
Original file line number Diff line number Diff line change
Expand Up @@ -130,3 +130,20 @@ class ExportLoRAAdapterRequest(ExportCommonOptions):
"""Request for exporting only the LoRA adapter (not merged)."""

# Uses fields from ExportCommonOptions only


class ExportAutoRound4bitRequest(ExportCommonOptions):
    """Request for exporting the model to 4-bit format using Auto-Round."""

    # Serialization target; "auto_awq"/"auto_gptq" load in vLLM with
    # --quantization awq/gptq, "auto_round" is the native Intel format.
    export_format: Literal["auto_awq", "auto_gptq", "auto_round"] = Field(
        "auto_awq",
        description = "Target 4-bit format for efficient inference",
    )
    bits: int = Field(4, description = "Target bitwidth")
    group_size: int = Field(128, description = "Quantization group size")
    # Matches the backend contract: iters=0 selects fast RTN quantization.
    iters: int = Field(200, description = "Auto-Round calibration iterations (0 = fast RTN)")
    nsamples: int = Field(128, description = "Calibration sample count")
    dataset: str = Field(
        "NeelNanda/pile-10k",
        description = "HuggingFace dataset for calibration",
    )
1 change: 1 addition & 0 deletions studio/backend/requirements/extras.txt
Original file line number Diff line number Diff line change
Expand Up @@ -55,3 +55,4 @@ tabulate
fastmcp>=3.0.2
openai>=2.7.2
websockets>=15.0.1
auto-round
41 changes: 41 additions & 0 deletions studio/backend/routes/export.py
Original file line number Diff line number Diff line change
Expand Up @@ -37,6 +37,7 @@
ExportBaseModelRequest,
ExportGGUFRequest,
ExportLoRAAdapterRequest,
ExportAutoRound4bitRequest,
)

router = APIRouter()
Expand Down Expand Up @@ -309,3 +310,43 @@ async def export_lora_adapter(
status_code = 500,
detail = f"Failed to export LoRA adapter: {str(e)}",
)


@router.post("/export/autoround4bit", response_model = ExportOperationResponse)
async def export_autoround_4bit(
    request: ExportAutoRound4bitRequest,
    current_subject: str = Depends(get_current_subject),
):
    """
    Export the model to 4-bit format using Auto-Round (AWQ/GPTQ).

    Wraps ExportBackend.export_autoround_4bit.
    """
    try:
        backend = get_export_backend()
        success, message = backend.export_autoround_4bit(
            save_directory = request.save_directory,
            export_format = request.export_format,
            bits = request.bits,
            group_size = request.group_size,
            iters = request.iters,
            nsamples = request.nsamples,
            dataset = request.dataset,
            push_to_hub = request.push_to_hub,
            repo_id = request.repo_id,
            hf_token = request.hf_token,
            private = request.private,
        )

        # Backend reports failure via the (success, message) tuple; surface
        # it to the client as a 400 rather than a generic 500.
        if success:
            return ExportOperationResponse(success = True, message = message)
        raise HTTPException(status_code = 400, detail = message)
    except HTTPException:
        # Re-raise FastAPI errors untouched so status codes are preserved.
        raise
    except Exception as e:
        logger.error(f"Error exporting Auto-Round 4-bit model: {e}", exc_info = True)
        raise HTTPException(
            status_code = 500,
            detail = f"Failed to export Auto-Round 4-bit model: {str(e)}",
        )
21 changes: 21 additions & 0 deletions studio/frontend/src/features/export/api/export-api.ts
Original file line number Diff line number Diff line change
Expand Up @@ -127,6 +127,27 @@ export async function exportLoRA(params: {
return parseJson<ExportOperationResponse>(response);
}

export async function exportAutoRound4bit(params: {
  save_directory: string;
  export_format?: string;
  bits?: number;
  group_size?: number;
  iters?: number;
  nsamples?: number;
  dataset?: string;
  push_to_hub?: boolean;
  repo_id?: string | null;
  hf_token?: string | null;
  private?: boolean;
}): Promise<ExportOperationResponse> {
  // POST the quantization request; field names mirror the backend
  // ExportAutoRound4bitRequest schema.
  const init = {
    method: "POST",
    headers: { "Content-Type": "application/json" },
    body: JSON.stringify(params),
  };
  const response = await authFetch("/api/export/export/autoround4bit", init);
  return parseJson<ExportOperationResponse>(response);
}

export async function cleanupExport(): Promise<ExportOperationResponse> {
const response = await authFetch("/api/export/cleanup", { method: "POST" });
return parseJson<ExportOperationResponse>(response);
Expand Down
19 changes: 18 additions & 1 deletion studio/frontend/src/features/export/constants.ts
Original file line number Diff line number Diff line change
Expand Up @@ -3,7 +3,7 @@

import type { TrainingMethod } from "@/types/training";

export type ExportMethod = "merged" | "lora" | "gguf";
// Export pipelines selectable in the UI; "autoround4bit" is the
// Auto-Round 4-bit (AWQ/GPTQ) quantized export.
export type ExportMethod = "merged" | "lora" | "gguf" | "autoround4bit";

export const EXPORT_METHODS: {
value: ExportMethod;
Expand All @@ -26,6 +26,14 @@ export const EXPORT_METHODS: {
tooltip:
"Exports only the trained adapter. Pair with the base model at inference time to save storage.",
},
{
value: "autoround4bit",
title: "4-bit (vLLM / Auto-Round)",
description: "High-accuracy AWQ/GPTQ 4-bit for vLLM deployment.",
tooltip:
"Uses Intel Auto-Round to quantize your model to W4A16. Best combination of speed, memory, and performance for vLLM.",
badge: "Recommended",
},
{
value: "gguf",
title: "GGUF / Llama.cpp",
Expand All @@ -46,6 +54,12 @@ export const QUANT_OPTIONS = [
{ value: "f32", label: "F32", size: "~28.4 GB" },
];

// Quantization backends offered for the Auto-Round 4-bit export method.
export const AUTOROUND_QUANT_OPTIONS = [
  // AWQ's official expansion is hyphenated: "Activation-aware Weight Quantization".
  { value: "auto_awq", label: "AWQ", desc: "Activation-aware Weight Quantization" },
  { value: "auto_gptq", label: "GPTQ", desc: "Alternative 4-bit standard" },
  { value: "auto_round", label: "Auto-Round", desc: "Native Intel format" },
];

export function getEstimatedSize(
method: ExportMethod | null,
quantLevels: string[],
Expand All @@ -67,6 +81,9 @@ export function getEstimatedSize(
if (method === "lora") {
return "~100 MB";
}
if (method === "autoround4bit") {
return "~4.1 GB";
}
return "—";
}

Expand Down
Loading