Skip to content
Draft
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
108 changes: 108 additions & 0 deletions studio/backend/core/export/export.py
Original file line number Diff line number Diff line change
Expand Up @@ -685,6 +685,114 @@ def export_lora_adapter(
logger.error(traceback.format_exc())
return False, f"Adapter export failed: {str(e)}"

def export_autoround_4bit(
    self,
    save_directory: str,
    export_format: str = "auto_awq",
    bits: int = 4,
    group_size: int = 128,
    iters: int = 200,
    nsamples: int = 128,
    dataset: str = "NeelNanda/pile-10k",
    push_to_hub: bool = False,
    repo_id: Optional[str] = None,
    hf_token: Optional[str] = None,
    private: bool = False,
) -> Tuple[bool, str]:
    """Export model to 4-bit quantized format using Auto-Round.

    Produces an AWQ, GPTQ, or native Auto-Round quantized model that can
    be loaded directly by vLLM with --quantization awq / gptq.

    Args:
        save_directory: Local output directory.
        export_format: ``"auto_awq"``, ``"auto_gptq"``, or ``"auto_round"``.
        bits: Quantisation bitwidth (default 4).
        group_size: Quantisation group size (default 128).
        iters: Auto-Round calibration iterations (0 = fast RTN).
        nsamples: Number of calibration samples.
        dataset: HF dataset used for calibration.
        push_to_hub: Upload the result to Hugging Face Hub.
        repo_id: Hub repo ``"username/model-name"``.
        hf_token: HF access token.
        private: Create a private Hub repository.

    Returns:
        Tuple of (success: bool, message: str)
    """
    if not self.current_model or not self.current_tokenizer:
        return False, "No model loaded. Please select a checkpoint first."

    try:
        from unsloth.save import save_to_autoround_4bit
    except ImportError:
        # Fallback: in dev/editable checkouts the package may live at the
        # repository root rather than on sys.path -- add it and retry once.
        try:
            import sys

            repo_root = str(Path(__file__).resolve().parents[4])
            # Guard against growing sys.path on every failed call.
            if repo_root not in sys.path:
                sys.path.insert(0, repo_root)
            from unsloth.save import save_to_autoround_4bit
        except ImportError as exc:
            return False, (
                f"Could not import save_to_autoround_4bit: {exc}. "
                "Ensure the unsloth package is installed."
            )

    try:
        import tempfile

        # Resolve output directory upfront: use the requested path, or a
        # temporary directory when only pushing to hub (no local save).
        _tmp_dir = None
        if save_directory:
            out_dir = str(resolve_export_dir(save_directory))
            ensure_dir(Path(out_dir))
        elif push_to_hub:
            _tmp_dir = tempfile.TemporaryDirectory(prefix = "_unsloth_autoround4bit_")
            out_dir = _tmp_dir.name
        else:
            return False, "Either save_directory or push_to_hub must be specified."

        try:
            logger.info(f"Saving Auto-Round 4-bit model to: {out_dir}")

            save_to_autoround_4bit(
                model_or_path = self.current_model,
                tokenizer = self.current_tokenizer,
                output_dir = out_dir,
                export_format = export_format,
                bits = bits,
                group_size = group_size,
                iters = iters,
                nsamples = nsamples,
                dataset = dataset,
                push_to_hub = push_to_hub,
                repo_id = repo_id,
                token = hf_token,
                private = private,
            )

            if save_directory:
                self._write_export_metadata(out_dir)
                logger.info(f"Auto-Round 4-bit model saved to {out_dir}")
        finally:
            # Remove the hub-only scratch directory even when quantization
            # raised; TemporaryDirectory.cleanup() is the supported API
            # (previously manual __enter__/__exit__ calls were used).
            if _tmp_dir is not None:
                _tmp_dir.cleanup()

        fmt_label = {
            "auto_awq": "AWQ",
            "auto_gptq": "GPTQ",
            "auto_round": "Auto-Round",
        }.get(export_format, export_format.upper())
        return True, f"Auto-Round 4-bit ({fmt_label}) model exported successfully"

    except Exception as e:
        logger.error(f"Error exporting Auto-Round 4-bit model: {e}")
        import traceback

        logger.error(traceback.format_exc())
        return False, f"Auto-Round 4-bit export failed: {str(e)}"


# Global export backend instance; None until first initialized.
_export_backend = None
Expand Down
40 changes: 40 additions & 0 deletions studio/backend/core/export/orchestrator.py
Original file line number Diff line number Diff line change
Expand Up @@ -364,6 +364,46 @@ def _run_export(self, export_type: str, params: dict) -> Tuple[bool, str]:
except RuntimeError as exc:
return False, str(exc)

def export_autoround_4bit(
    self,
    save_directory: str,
    export_format: str = "auto_awq",
    bits: int = 4,
    group_size: int = 128,
    iters: int = 200,
    nsamples: int = 128,
    dataset: str = "NeelNanda/pile-10k",
    push_to_hub: bool = False,
    repo_id: Optional[str] = None,
    hf_token: Optional[str] = None,
    private: bool = False,
) -> Tuple[bool, str]:
    """Forward an Auto-Round 4-bit export request to the worker subprocess.

    Sends an ``autoround4bit`` export command and blocks (up to two hours)
    for the ``export_autoround4bit_done`` response.

    Returns:
        Tuple of (success: bool, message: str) mirroring the backend result.
    """
    if not self._ensure_subprocess_alive():
        return False, "Export subprocess is not running"

    self._send_cmd(
        {
            "type": "export",
            "export_type": "autoround4bit",
            "save_directory": save_directory,
            "export_format": export_format,
            "bits": bits,
            "group_size": group_size,
            "iters": iters,
            "nsamples": nsamples,
            "dataset": dataset,
            "push_to_hub": push_to_hub,
            "repo_id": repo_id,
            "hf_token": hf_token,
            "private": private,
        }
    )

    try:
        resp = self._wait_response("export_autoround4bit_done", timeout = 7200.0)
    except RuntimeError as exc:
        return False, str(exc)
    return resp.get("success", False), resp.get("message", "")

def cleanup_memory(self) -> bool:
"""Cleanup export-related models from memory."""
if not self._ensure_subprocess_alive():
Expand Down
14 changes: 14 additions & 0 deletions studio/backend/core/export/worker.py
Original file line number Diff line number Diff line change
Expand Up @@ -152,6 +152,20 @@ def _handle_export(backend, cmd: dict, resp_queue: Any) -> None:
hf_token = cmd.get("hf_token"),
private = cmd.get("private", False),
)
elif export_type == "autoround4bit":
success, message = backend.export_autoround_4bit(
save_directory = cmd.get("save_directory", ""),
export_format = cmd.get("export_format", "auto_awq"),
bits = cmd.get("bits", 4),
group_size = cmd.get("group_size", 128),
iters = cmd.get("iters", 200),
nsamples = cmd.get("nsamples", 128),
dataset = cmd.get("dataset", "NeelNanda/pile-10k"),
push_to_hub = cmd.get("push_to_hub", False),
repo_id = cmd.get("repo_id"),
hf_token = cmd.get("hf_token"),
private = cmd.get("private", False),
)
else:
success, message = False, f"Unknown export type: {export_type}"

Expand Down
2 changes: 2 additions & 0 deletions studio/backend/models/__init__.py
Original file line number Diff line number Diff line change
Expand Up @@ -41,6 +41,7 @@
ExportBaseModelRequest,
ExportGGUFRequest,
ExportLoRAAdapterRequest,
ExportAutoRound4bitRequest,
)
from .users import Token
from .datasets import (
Expand Down Expand Up @@ -104,6 +105,7 @@
"ExportBaseModelRequest",
"ExportGGUFRequest",
"ExportLoRAAdapterRequest",
"ExportAutoRound4bitRequest",
"Token",
# Dataset schemas
"CheckFormatRequest",
Expand Down
17 changes: 17 additions & 0 deletions studio/backend/models/export.py
Original file line number Diff line number Diff line change
Expand Up @@ -130,3 +130,20 @@ class ExportLoRAAdapterRequest(ExportCommonOptions):
"""Request for exporting only the LoRA adapter (not merged)."""

# Uses fields from ExportCommonOptions only


class ExportAutoRound4bitRequest(ExportCommonOptions):
    """Request for exporting the model to 4-bit format using Auto-Round."""

    # Serialization target; "auto_awq"/"auto_gptq" load in vLLM with
    # --quantization awq/gptq, "auto_round" is the native Intel format.
    export_format: Literal["auto_awq", "auto_gptq", "auto_round"] = Field(
        "auto_awq",
        description = "Target 4-bit format for efficient inference",
    )
    bits: int = Field(4, description = "Target bitwidth")
    group_size: int = Field(128, description = "Quantization group size")
    # Matches the backend contract: iters=0 selects fast RTN quantization.
    iters: int = Field(200, description = "Auto-Round calibration iterations (0 = fast RTN)")
    nsamples: int = Field(128, description = "Calibration sample count")
    dataset: str = Field(
        "NeelNanda/pile-10k",
        description = "HuggingFace dataset for calibration",
    )
1 change: 1 addition & 0 deletions studio/backend/requirements/extras.txt
Original file line number Diff line number Diff line change
Expand Up @@ -55,3 +55,4 @@ tabulate
fastmcp>=3.0.2
openai>=2.7.2
websockets>=15.0.1
auto-round
41 changes: 41 additions & 0 deletions studio/backend/routes/export.py
Original file line number Diff line number Diff line change
Expand Up @@ -37,6 +37,7 @@
ExportBaseModelRequest,
ExportGGUFRequest,
ExportLoRAAdapterRequest,
ExportAutoRound4bitRequest,
)

router = APIRouter()
Expand Down Expand Up @@ -309,3 +310,43 @@ async def export_lora_adapter(
status_code = 500,
detail = f"Failed to export LoRA adapter: {str(e)}",
)


@router.post("/export/autoround4bit", response_model = ExportOperationResponse)
async def export_autoround_4bit(
    request: ExportAutoRound4bitRequest,
    current_subject: str = Depends(get_current_subject),
):
    """
    Export the model to 4-bit format using Auto-Round (AWQ/GPTQ).

    Wraps ExportBackend.export_autoround_4bit.
    """
    try:
        backend = get_export_backend()
        success, message = backend.export_autoround_4bit(
            save_directory = request.save_directory,
            export_format = request.export_format,
            bits = request.bits,
            group_size = request.group_size,
            iters = request.iters,
            nsamples = request.nsamples,
            dataset = request.dataset,
            push_to_hub = request.push_to_hub,
            repo_id = request.repo_id,
            hf_token = request.hf_token,
            private = request.private,
        )

        # Backend reports failure via the (success, message) tuple; surface
        # it to the client as a 400 rather than a generic 500.
        if success:
            return ExportOperationResponse(success = True, message = message)
        raise HTTPException(status_code = 400, detail = message)
    except HTTPException:
        # Re-raise FastAPI errors untouched so status codes are preserved.
        raise
    except Exception as e:
        logger.error(f"Error exporting Auto-Round 4-bit model: {e}", exc_info = True)
        raise HTTPException(
            status_code = 500,
            detail = f"Failed to export Auto-Round 4-bit model: {str(e)}",
        )
21 changes: 21 additions & 0 deletions studio/frontend/src/features/export/api/export-api.ts
Original file line number Diff line number Diff line change
Expand Up @@ -127,6 +127,27 @@ export async function exportLoRA(params: {
return parseJson<ExportOperationResponse>(response);
}

export async function exportAutoRound4bit(params: {
  save_directory: string;
  export_format?: string;
  bits?: number;
  group_size?: number;
  iters?: number;
  nsamples?: number;
  dataset?: string;
  push_to_hub?: boolean;
  repo_id?: string | null;
  hf_token?: string | null;
  private?: boolean;
}): Promise<ExportOperationResponse> {
  // POST the quantization request; field names mirror the backend
  // ExportAutoRound4bitRequest schema.
  const init = {
    method: "POST",
    headers: { "Content-Type": "application/json" },
    body: JSON.stringify(params),
  };
  const response = await authFetch("/api/export/export/autoround4bit", init);
  return parseJson<ExportOperationResponse>(response);
}

export async function cleanupExport(): Promise<ExportOperationResponse> {
const response = await authFetch("/api/export/cleanup", { method: "POST" });
return parseJson<ExportOperationResponse>(response);
Expand Down
19 changes: 18 additions & 1 deletion studio/frontend/src/features/export/constants.ts
Original file line number Diff line number Diff line change
Expand Up @@ -3,7 +3,7 @@

import type { TrainingMethod } from "@/types/training";

export type ExportMethod = "merged" | "lora" | "gguf";
// Export pipelines selectable in the UI; "autoround4bit" is the
// Auto-Round 4-bit (AWQ/GPTQ) quantized export.
export type ExportMethod = "merged" | "lora" | "gguf" | "autoround4bit";

export const EXPORT_METHODS: {
value: ExportMethod;
Expand All @@ -26,6 +26,14 @@ export const EXPORT_METHODS: {
tooltip:
"Exports only the trained adapter. Pair with the base model at inference time to save storage.",
},
{
value: "autoround4bit",
title: "4-bit (vLLM / Auto-Round)",
description: "High-accuracy AWQ/GPTQ 4-bit for vLLM deployment.",
tooltip:
"Uses Intel Auto-Round to quantize your model to W4A16. Best combination of speed, memory, and performance for vLLM.",
badge: "Recommended",
},
{
value: "gguf",
title: "GGUF / Llama.cpp",
Expand All @@ -46,6 +54,12 @@ export const QUANT_OPTIONS = [
{ value: "f32", label: "F32", size: "~28.4 GB" },
];

// Quantization backends offered for the Auto-Round 4-bit export method.
export const AUTOROUND_QUANT_OPTIONS = [
  // AWQ's official expansion is hyphenated: "Activation-aware Weight Quantization".
  { value: "auto_awq", label: "AWQ", desc: "Activation-aware Weight Quantization" },
  { value: "auto_gptq", label: "GPTQ", desc: "Alternative 4-bit standard" },
  { value: "auto_round", label: "Auto-Round", desc: "Native Intel format" },
];

export function getEstimatedSize(
method: ExportMethod | null,
quantLevels: string[],
Expand All @@ -67,6 +81,9 @@ export function getEstimatedSize(
if (method === "lora") {
return "~100 MB";
}
if (method === "autoround4bit") {
return "~4.1 GB";
}
return "—";
}

Expand Down
Loading