Skip to content
Open
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
77 changes: 76 additions & 1 deletion unsloth_zoo/saving_utils.py
Original file line number Diff line number Diff line change
Expand Up @@ -792,7 +792,82 @@ def _merge_and_overwrite_lora(
tensors[key] = resized[key]
else:
tensors[key] = f.get_tensor(key)
save_file(tensors, filename_original)

# Workaround for Windows file locking (os error 1224).
# Write to a temp file first, then swap it in for the original, retrying with backoff on lock errors.
import tempfile
import shutil

# Import safetensors exception to catch wrapped Windows errors
try:
from safetensors.torch import SafetensorError
except ImportError:
SafetensorError = Exception # Fallback if not available

max_retries = 10
base_delay = 0.2 # seconds
temp_dir = os.path.dirname(filename_original)

for attempt in range(max_retries):
try:
# Force garbage collection and CUDA cache cleanup
gc.collect()
if torch.cuda.is_available():
torch.cuda.empty_cache()

# Create temp file in same directory for atomic replace
with tempfile.NamedTemporaryFile(
delete=False,
dir=temp_dir,
suffix=".safetensors.tmp"
) as tmp_file:
tmp_path = tmp_file.name

try:
# Write to temp file (safe - original untouched)
save_file(tensors, tmp_path)

# Only delete original after successful write
if os.path.exists(filename_original):
os.remove(filename_original)

# Move temp to original location (NOTE: not atomic - the original was removed above; os.replace would swap in one step)
shutil.move(tmp_path, filename_original)
break # Success

except Exception as write_error:
# Clean up temp file on write failure
try:
if os.path.exists(tmp_path):
os.remove(tmp_path)
except:
pass
raise write_error

except (OSError, IOError, SafetensorError) as e:
# Catch both OS errors and safetensors-wrapped Windows errors
error_msg = str(e).lower()
is_lock_error = "1224" in error_msg or "user-mapped" in error_msg or "cannot be performed" in error_msg

if is_lock_error and attempt < max_retries - 1:
# Exponential backoff for lock errors (delay doubles every two attempts)
delay = base_delay * (2 ** (attempt // 2))
if UNSLOTH_ENABLE_LOGGING:
logger.warning(
f"[Retry {attempt + 1}/{max_retries}] Windows file lock detected: {e}. "
f"Waiting {delay:.1f}s before retry..."
)
time.sleep(delay)
elif is_lock_error and attempt == max_retries - 1:
raise RuntimeError(
f"Failed to save file after {max_retries} attempts due to Windows file lock. "
"Original shard preserved - no data loss. "
"Solutions: 1) Restart Unsloth Studio 2) Disable antivirus 3) Close File Explorer windows"
)
else:
# Non-lock errors - fail immediately
raise RuntimeError(f"Model merge failed with error: {e}")

del tensors

if torch.cuda.is_available():
Expand Down