"""
Prune Operations - All Helper Functions
This module contains all the helper functions from backend/open_webui/routers/prune.py
that perform the actual pruning operations, counting, and cleanup.
"""
import inspect
import logging
import time
from pathlib import Path
from typing import Optional, Set, Callable, Any
from sqlalchemy import select, text, func, and_, or_, not_
from sqlalchemy.exc import OperationalError
from sqlalchemy.orm import Session
log = logging.getLogger(__name__)

def retry_on_db_lock(func: Callable, max_retries: int = 3, base_delay: float = 0.5) -> Any:
    """
    Retry a database operation if it fails due to a database lock.

    Uses exponential backoff: 0.5s, 1s, 2s.

    Args:
        func: Function to retry
        max_retries: Maximum number of retry attempts
        base_delay: Base delay in seconds (doubles each retry)

    Returns:
        Result from the function

    Raises:
        The last exception if all retries fail
    """
    last_exception = None
    for attempt in range(max_retries + 1):
        try:
            return func()
        except OperationalError as e:
            last_exception = e
            if "database is locked" in str(e).lower() and attempt < max_retries:
                delay = base_delay * (2 ** attempt)
                log.warning(
                    f"Database locked, retrying in {delay}s "
                    f"(attempt {attempt + 1}/{max_retries})"
                )
                time.sleep(delay)
            else:
                raise
    # Unreachable: every loop iteration either returns or raises. Kept as a guard.
    raise last_exception

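
# Usage sketch (the session name `db` is illustrative; shown as a comment so
# nothing executes at import time). Wrapping the operation in a zero-argument
# lambda is what lets the helper re-invoke it on each retry:
#
#     count = retry_on_db_lock(lambda: db.query(func.count(Chat.id)).scalar())
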

# Import Open WebUI modules using compatibility layer (handles pip/docker/git installs)
try:
    from prune_imports import (
        Users, Chat, Chats, ChatFile, ChatMessage, Message, File, Files, Note, Notes,
        Prompt, Prompts, Model, Models, Knowledge, Knowledges,
        Function, Functions, Tool, Tools, Skill, Skills,
        Folder, Folders, FolderModel, Storage,
        get_db, get_db_context, CACHE_DIR,
    )
except ImportError as e:
    log.error(f"Failed to import Open WebUI modules: {e}")
    log.error("This module requires Open WebUI backend modules to be importable")
    raise

from prune_models import PruneDataForm
from prune_core import collect_file_ids_from_dict


# API Compatibility Helpers


def get_all_folders(db: Optional[Session] = None):
    """
    Get all folders from the database.

    Compatibility helper for Open WebUI versions whose Folders API does not
    expose get_all_folders().

    Args:
        db: Optional database session to reuse (for efficient bulk operations)
    """
    try:
        # Try the new API first: if get_all_folders exists, use it
        if hasattr(Folders, "get_all_folders"):
            # Check whether the method accepts a db parameter
            if "db" in inspect.signature(Folders.get_all_folders).parameters:
                return Folders.get_all_folders(db=db)
            else:
                return Folders.get_all_folders()
        # Otherwise query the database directly
        with get_db_context(db) as session:
            folders = session.query(Folder).all()
            # Convert to FolderModel instances
            return [FolderModel.model_validate(f) for f in folders]
    except Exception as e:
        log.error(f"Error getting all folders: {e}")
        return []

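
# Usage note: the helper is drop-in across Open WebUI versions, e.g.
# `folders = get_all_folders()` standalone, or `get_all_folders(db=session)`
# inside a bulk operation to reuse an already-open session.
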

def count_inactive_users(
    inactive_days: Optional[int], exempt_admin: bool, exempt_pending: bool, all_users=None
) -> int:
    """Count users that would be deleted for inactivity.

    Args:
        inactive_days: Number of days of inactivity before deletion
        exempt_admin: Whether to exempt admin users
        exempt_pending: Whether to exempt pending users
        all_users: Optional pre-fetched list of users to avoid duplicate queries
    """
    if inactive_days is None:
        return 0
    cutoff_time = int(time.time()) - (inactive_days * 86400)
    count = 0
    try:
        if all_users is None:
            all_users = Users.get_users()["users"]
        for user in all_users:
            if exempt_admin and user.role == "admin":
                continue
            if exempt_pending and user.role == "pending":
                continue
            if user.last_active_at < cutoff_time:
                count += 1
    except Exception as e:
        log.debug(f"Error counting inactive users: {e}")
    return count

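
# Worked example: with inactive_days=90, cutoff_time = now - 90 * 86400
# (86400 seconds per day). A user whose last_active_at unix timestamp is
# older than that cutoff is counted, unless an exemption applies.
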

def count_old_chats(
    days: Optional[int], exempt_archived: bool, exempt_in_folders: bool
) -> int:
    """Count chats that would be deleted by age.

    Uses a SQL COUNT query instead of loading full ORM objects,
    avoiding the expensive deserialization of large JSONB chat columns.
    """
    if days is None:
        return 0
    cutoff_time = int(time.time()) - (days * 86400)
    try:
        with get_db_context() as db:
            # Build filter conditions
            conditions = [Chat.updated_at < cutoff_time]
            if exempt_archived:
                conditions.append(or_(Chat.archived == False, Chat.archived == None))
            if exempt_in_folders:
                conditions.append(and_(
                    Chat.folder_id == None,
                    or_(Chat.pinned == False, Chat.pinned == None)
                ))
            count = db.query(func.count(Chat.id)).filter(*conditions).scalar()
            return count or 0
    except Exception as e:
        log.debug(f"Error counting old chats: {e}")
        return 0

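
# Roughly the SQL this issues with both exemptions enabled (SQLite dialect;
# shown for illustration only):
#
#     SELECT count(chat.id) FROM chat
#     WHERE chat.updated_at < :cutoff
#       AND (chat.archived = 0 OR chat.archived IS NULL)
#       AND chat.folder_id IS NULL
#       AND (chat.pinned = 0 OR chat.pinned IS NULL)
#
# Counting server-side keeps the large JSON `chat` column out of the result set.
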

def count_orphaned_records(
    form_data: PruneDataForm,
    active_file_ids: Set[str],
    active_user_ids: Set[str]
) -> dict:
    """Count orphaned database records that would be deleted.

    Uses SQL COUNT queries instead of loading full ORM objects,
    avoiding the expensive deserialization of large JSONB columns
    (chat history, tool specs, function content, etc.).
    """
    counts = {
        "chats": 0,
        "files": 0,
        "tools": 0,
        "functions": 0,
        "prompts": 0,
        "knowledge_bases": 0,
        "models": 0,
        "notes": 0,
        "skills": 0,
        "folders": 0,
        "chat_messages": 0,
    }
    try:
        with get_db_context() as db:
            # Count orphaned files (not in active_file_ids OR owner not in active_user_ids)
            counts["files"] = db.query(func.count(File.id)).filter(
                or_(
                    not_(File.id.in_(active_file_ids)) if active_file_ids else True,
                    not_(File.user_id.in_(active_user_ids)) if active_user_ids else True,
                )
            ).scalar() or 0

            # Count other orphaned records by user ownership
            _table_flag_map = [
                ("chats", Chat, Chat.user_id, form_data.delete_orphaned_chats),
                ("tools", Tool, Tool.user_id, form_data.delete_orphaned_tools),
                ("functions", Function, Function.user_id, form_data.delete_orphaned_functions),
                ("prompts", Prompt, Prompt.user_id, form_data.delete_orphaned_prompts),
                ("knowledge_bases", Knowledge, Knowledge.user_id, form_data.delete_orphaned_knowledge_bases),
                ("models", Model, Model.user_id, form_data.delete_orphaned_models),
                ("notes", Note, Note.user_id, form_data.delete_orphaned_notes),
                ("skills", Skill, Skill.user_id, form_data.delete_orphaned_skills),
                ("folders", Folder, Folder.user_id, form_data.delete_orphaned_folders),
            ]
            for key, table_cls, user_id_col, enabled in _table_flag_map:
                if enabled and active_user_ids:
                    counts[key] = db.query(func.count()).select_from(table_cls).filter(
                        not_(user_id_col.in_(active_user_ids))
                    ).scalar() or 0

            # Count orphaned chat_messages (chat_id references a chat that no longer exists)
            if form_data.delete_orphaned_chat_messages:
                try:
                    counts["chat_messages"] = db.query(
                        func.count(ChatMessage.id)
                    ).filter(
                        not_(ChatMessage.chat_id.in_(
                            select(Chat.id)
                        ))
                    ).scalar() or 0
                except Exception as e:
                    log.debug(f"Error counting orphaned chat_messages (table may not exist yet): {e}")
    except Exception as e:
        log.debug(f"Error counting orphaned records: {e}")
    return counts

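
# Edge-case note (a reading of the code above, not a behavior change): the
# files count uses `... if active_file_ids else True`, so an *empty* active
# set collapses to a literal True inside or_() and every file row counts as
# orphaned. The per-table loop behaves differently: an empty active_user_ids
# skips counting entirely, leaving those entries at 0.
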

def count_orphaned_chat_messages() -> int:
    """Count orphaned chat_message rows whose parent chat no longer exists.

    These are left behind on SQLite because it does not enforce
    ON DELETE CASCADE unless PRAGMA foreign_keys is enabled.
    """
    try:
        with get_db_context() as db:
            return db.query(
                func.count(ChatMessage.id)
            ).filter(
                not_(ChatMessage.chat_id.in_(select(Chat.id)))
            ).scalar() or 0
    except Exception as e:
        log.debug(f"Error counting orphaned chat_messages: {e}")
        return 0


def delete_orphaned_chat_messages() -> int:
    """Delete chat_message rows whose parent chat no longer exists.

    Returns the number of rows deleted.
    """
    try:
        with get_db_context() as db:
            orphaned_ids = db.query(ChatMessage.id).filter(
                not_(ChatMessage.chat_id.in_(select(Chat.id)))
            ).all()
            orphan_id_list = [r.id for r in orphaned_ids]
            if not orphan_id_list:
                return 0
            # Delete in batches to avoid SQLite variable limits
            deleted = 0
            batch_size = 500
            for i in range(0, len(orphan_id_list), batch_size):
                batch = orphan_id_list[i:i + batch_size]
                deleted += db.query(ChatMessage).filter(
                    ChatMessage.id.in_(batch)
                ).delete()
            db.commit()
            if deleted > 0:
                log.info(f"Deleted {deleted} orphaned chat_message rows")
            return deleted
    except Exception as e:
        log.error(f"Error deleting orphaned chat_messages: {e}")
        return 0

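
# Why these orphans exist, as a minimal sketch (illustration only; this module
# does not change connection settings): SQLite ignores ON DELETE CASCADE
# declared on chat_message.chat_id unless each connection enables foreign-key
# enforcement, e.g.
#
#     with get_db() as db:
#         db.execute(text("PRAGMA foreign_keys = ON"))
#
# Deployments that never set this pragma accumulate exactly the rows that the
# two functions above count and remove.
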

def count_orphaned_uploads(active_file_ids: Set[str]) -> int:
    """Count orphaned files in the uploads directory."""
    upload_dir = Path(CACHE_DIR).parent / "uploads"
    if not upload_dir.exists():
        return 0
    count = 0
    try:
        for file_path in upload_dir.iterdir():
            if not file_path.is_file():
                continue
            filename = file_path.name
            file_id = None
            # Extract file ID from filename patterns
            if len(filename) > 36:
                potential_id = filename[:36]
                if potential_id.count("-") == 4:
                    file_id = potential_id
            if not file_id and filename.count("-") == 4 and len(filename) == 36:
                file_id = filename
            if not file_id:
                for active_id in active_file_ids:
                    if active_id in filename:
                        file_id = active_id
                        break
            if file_id and file_id not in active_file_ids:
                count += 1
    except Exception as e:
        log.debug(f"Error counting orphaned uploads: {e}")
    return count

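
# Filename patterns recognized above (hypothetical examples): a bare UUID like
# "550e8400-e29b-41d4-a716-446655440000" (36 chars, four hyphens), or a UUID
# prefix followed by the original name, e.g.
# "550e8400-e29b-41d4-a716-446655440000_report.pdf". The same matching logic
# is repeated in cleanup_orphaned_uploads() below, so the count and the
# deletion pass agree on what is considered orphaned.
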

def count_audio_cache_files(max_age_days: Optional[int]) -> int:
    """Count audio cache files that would be deleted."""
    if max_age_days is None:
        return 0
    cutoff_time = time.time() - (max_age_days * 86400)
    count = 0
    audio_dirs = [
        Path(CACHE_DIR) / "audio" / "speech",
        Path(CACHE_DIR) / "audio" / "transcriptions",
    ]
    for audio_dir in audio_dirs:
        if not audio_dir.exists():
            continue
        try:
            for file_path in audio_dir.iterdir():
                if file_path.is_file() and file_path.stat().st_mtime < cutoff_time:
                    count += 1
        except Exception as e:
            log.debug(f"Error counting audio files in {audio_dir}: {e}")
    return count


def get_active_file_ids(knowledge_bases=None, active_user_ids=None) -> Set[str]:
    """
    Get all file IDs that are actively referenced by knowledge bases, chats,
    folders, messages, and models.

    Args:
        knowledge_bases: Optional pre-fetched list of knowledge bases to avoid duplicate queries
        active_user_ids: Optional set of active user IDs to filter knowledge bases
    """
    active_file_ids = set()
    try:
        # Preload all valid file IDs to avoid N database queries during validation:
        # each check becomes an O(1) set lookup instead of a DB round trip.
        # Use retry logic in case the database is locked.
        all_file_ids = retry_on_db_lock(lambda: {f.id for f in Files.get_files()})
        log.debug(f"Preloaded {len(all_file_ids)} file IDs for validation")

        # Scan knowledge bases for file references.
        # Note: since v0.6.41 the knowledge.data column was removed and replaced with
        # the knowledge_file table. We now use the existing API to query files per KB.
        if knowledge_bases is None:
            knowledge_bases = Knowledges.get_knowledge_bases()
        log.debug(f"Found {len(knowledge_bases)} knowledge bases")

        # Memory-safe processing: iterate through KBs and extract file IDs incrementally.
        # We don't keep file objects in memory, just collect IDs.
        for kb in knowledge_bases:
            # CRITICAL FIX: skip KBs owned by inactive/deleted users to stay
            # consistent with active_kb_ids filtering. This prevents false positives
            # where files are considered "active" but their KB is marked as orphaned,
            # leading to incorrectly deleted vector collections.
            if active_user_ids is not None and kb.user_id not in active_user_ids:
                log.debug(f"Skipping KB {kb.id} - owner {kb.user_id} not in active users")
                continue
            try:
                # Use the existing API method that queries the knowledge_file table.
                # get_files_by_id() performs:
                #   SELECT * FROM file JOIN knowledge_file WHERE knowledge_id = kb.id
                kb_files = Knowledges.get_files_by_id(kb.id)
                # Extract file IDs only (memory efficient - don't keep full objects)
                for file in kb_files:
                    if file.id and file.id in all_file_ids:
                        active_file_ids.add(file.id)
                # Help GC by clearing the list immediately after processing
                del kb_files
            except Exception as e:
                log.debug(f"Error scanning files for knowledge base {kb.id}: {e}")

        # Scan chats for file references.
        # Stream chats using a Core SELECT to avoid ORM overhead.
        # Wrap in retry logic in case of a database lock.
        def scan_chats():
            chat_count = 0
            with get_db() as db:
                stmt = select(Chat.id, Chat.chat)
                # SQLAlchemy 2.0+ compatibility: execution_options moved to the statement
                try:
                    result = db.execute(stmt.execution_options(stream_results=True))
                except AttributeError:
                    # Fallback for older SQLAlchemy versions
                    result = db.execution_options(stream_results=True).execute(stmt)
                while True:
                    rows = result.fetchmany(1000)
                    if not rows:
                        break
                    for chat_id, chat_dict in rows:
                        chat_count += 1
                        # Skip if no chat data or not a dict
                        if not chat_dict or not isinstance(chat_dict, dict):
                            continue
                        try:
                            # Direct dict traversal (no json.dumps needed)
                            collect_file_ids_from_dict(chat_dict, active_file_ids, all_file_ids)
                        except Exception as e:
                            log.debug(f"Error processing chat {chat_id} for file references: {e}")
            return chat_count

        chat_count = retry_on_db_lock(scan_chats)
        log.debug(f"Scanned {chat_count} chats for file references")

        # Scan the chat_file table for file references.
        # Note: since v0.6.41+ chat files are stored in a dedicated chat_file junction table.
        # We scan both the chat.chat JSON (legacy) and the chat_file table (new) for completeness.
        try:
            with get_db() as db:
                stmt = select(ChatFile.file_id)
                # SQLAlchemy 2.0+ compatibility
                try:
                    result = db.execute(stmt.execution_options(stream_results=True))
                except AttributeError:
                    result = db.execution_options(stream_results=True).execute(stmt)
                chat_file_count = 0
                while True:
                    rows = result.fetchmany(1000)
                    if not rows:
                        break
                    for (file_id,) in rows:
                        chat_file_count += 1
                        if file_id and file_id in all_file_ids:
                            active_file_ids.add(file_id)
                log.debug(f"Scanned {chat_file_count} chat_file entries for file references")
        except Exception as e:
            # The chat_file table might not exist in older database versions
            log.debug(f"Error scanning chat_file table (table may not exist yet): {e}")

        # Scan folders for file references.
        # Stream folders using a Core SELECT to avoid ORM overhead.
        try:
            with get_db() as db:
                stmt = select(Folder.id, Folder.items, Folder.data)
                # SQLAlchemy 2.0+ compatibility: execution_options moved to the statement
                try:
                    result = db.execute(stmt.execution_options(stream_results=True))
                except AttributeError:
                    # Fallback for older SQLAlchemy versions
                    result = db.execution_options(stream_results=True).execute(stmt)
                while True:
                    rows = result.fetchmany(100)
                    if not rows:
                        break
                    for folder_id, items_dict, data_dict in rows:
                        # Process folder.items
                        if items_dict:
                            try:
                                # Direct dict traversal (no json.dumps needed)
                                collect_file_ids_from_dict(items_dict, active_file_ids, all_file_ids)
                            except Exception as e:
                                log.debug(f"Error processing folder {folder_id} items: {e}")
                        # Process folder.data
                        if data_dict:
                            try:
                                # Direct dict traversal (no json.dumps needed)
                                collect_file_ids_from_dict(data_dict, active_file_ids, all_file_ids)
                            except Exception as e:
                                log.debug(f"Error processing folder {folder_id} data: {e}")
        except Exception as e:
            log.debug(f"Error scanning folders for file references: {e}")

        # Scan standalone messages for file references.
        # Stream messages using a Core SELECT to avoid text() and yield_per issues.
        try:
            with get_db() as db:
                stmt = select(Message.id, Message.data).where(Message.data.isnot(None))
                # SQLAlchemy 2.0+ compatibility: execution_options moved to the statement
                try:
                    result = db.execute(stmt.execution_options(stream_results=True))
                except AttributeError:
                    # Fallback for older SQLAlchemy versions
                    result = db.execution_options(stream_results=True).execute(stmt)
                while True:
                    rows = result.fetchmany(1000)
                    if not rows:
                        break
                    for message_id, message_data_dict in rows:
                        if message_data_dict:
                            try:
                                # Direct dict traversal (no json.dumps needed)
                                collect_file_ids_from_dict(message_data_dict, active_file_ids, all_file_ids)
                            except Exception as e:
                                log.debug(f"Error processing message {message_id} data: {e}")
        except Exception as e:
            log.debug(f"Error scanning messages for file references: {e}")

        # Scan models for file references in the params and meta fields.
        # Models can have files attached (e.g. in the meta or params JSON fields).
        try:
            with get_db() as db:
                stmt = select(Model.id, Model.params, Model.meta)
                # SQLAlchemy 2.0+ compatibility
                try:
                    result = db.execute(stmt.execution_options(stream_results=True))
                except AttributeError:
                    result = db.execution_options(stream_results=True).execute(stmt)
                model_count = 0
                while True:
                    rows = result.fetchmany(100)
                    if not rows:
                        break
                    for model_id, params_dict, meta_dict in rows:
                        model_count += 1
                        # Scan the params JSON field for file references
                        if params_dict and isinstance(params_dict, dict):
                            try:
                                collect_file_ids_from_dict(params_dict, active_file_ids, all_file_ids)
                            except Exception as e:
                                log.debug(f"Error processing model {model_id} params: {e}")
                        # Scan the meta JSON field for file references
                        if meta_dict and isinstance(meta_dict, dict):
                            try:
                                collect_file_ids_from_dict(meta_dict, active_file_ids, all_file_ids)
                            except Exception as e:
                                log.debug(f"Error processing model {model_id} meta: {e}")
                log.debug(f"Scanned {model_count} models for file references")
        except Exception as e:
            log.debug(f"Error scanning models for file references: {e}")
    except Exception as e:
        log.error(f"Error determining active file IDs: {e}")
        return set()
    log.info(f"Found {len(active_file_ids)} active file IDs")
    return active_file_ids

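
# Usage sketch (names illustrative): compute the active set once per prune run
# and feed it to both the counting and the cleanup passes so they agree.
#
#     active_users = {u.id for u in Users.get_users()["users"]}
#     active_files = get_active_file_ids(active_user_ids=active_users)
#     n_orphans = count_orphaned_uploads(active_files)
#     cleanup_orphaned_uploads(active_files)
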

def safe_delete_file_by_id(file_id: str, vector_cleaner, db: Optional[Session] = None) -> bool:
    """
    Safely delete a file record and its associated vector collections and physical storage.

    This function mirrors the cleanup logic from Open WebUI's delete_file_by_id endpoint:
    1. Cleans KB vector embeddings (filter by file_id and hash)
    2. Deletes the standalone file-{id} vector collection
    3. Deletes the file record from the DB (CASCADE handles chat_file, channel_file, knowledge_file)
    4. Deletes the physical file from storage

    Args:
        file_id: The file ID to delete
        vector_cleaner: Vector database cleaner instance
        db: Optional database session to reuse (for efficient bulk operations)

    Returns:
        True if deletion succeeded, False otherwise
    """
    try:
        with get_db_context(db) as session:
            file_record = Files.get_file_by_id(file_id, db=session)
            if not file_record:
                return True

            # Clean KB vector embeddings (mirrors the delete_file_by_id endpoint logic).
            # This removes embeddings from knowledge base collections that reference this file.
            try:
                knowledges = Knowledges.get_knowledges_by_file_id(file_id, db=session)
                for kb in knowledges:
                    try:
                        # Delete by file_id filter
                        vector_cleaner.delete(collection_name=kb.id, filter={"file_id": file_id})
                        # Also delete by hash if available (covers hash-based lookups)
                        if file_record.hash:
                            vector_cleaner.delete(collection_name=kb.id, filter={"hash": file_record.hash})
                    except Exception as e:
                        log.debug(f"KB embedding cleanup for {kb.id}: {e}")
            except Exception as e:
                log.debug(f"Error getting knowledges for file {file_id}: {e}")

            # Delete the standalone file vector collection
            collection_name = f"file-{file_id}"
            vector_cleaner.delete_collection(collection_name)

            # Delete from the DB - CASCADE handles chat_file, channel_file, knowledge_file
            Files.delete_file_by_id(file_id, db=session)

            # Delete the physical file from storage
            if file_record.path:
                try:
                    Storage.delete_file(file_record.path)
                except Exception as e:
                    log.debug(f"Error deleting physical file {file_record.path}: {e}")
            return True
    except Exception as e:
        log.error(f"Error deleting file {file_id}: {e}")
        return False

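
# Usage sketch. Whether the vector client is importable under this exact path
# in a given install is an assumption; any object exposing delete() and
# delete_collection() with these signatures works as `vector_cleaner`:
#
#     from open_webui.retrieval.vector.connector import VECTOR_DB_CLIENT
#     safe_delete_file_by_id("some-file-id", VECTOR_DB_CLIENT)
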

def delete_user_files(user_id: str, vector_cleaner, db: Optional[Session] = None) -> int:
    """
    Delete all files owned by a user.

    This should be called before deleting an inactive user to ensure proper cleanup
    of file-related data (vector embeddings, physical storage, etc.).

    Args:
        user_id: The user ID whose files should be deleted
        vector_cleaner: Vector database cleaner instance
        db: Optional database session to reuse

    Returns:
        Number of files successfully deleted
    """
    deleted_count = 0
    try:
        files = Files.get_files_by_user_id(user_id, db=db)
        log.debug(f"Found {len(files)} files for user {user_id}")
        for file in files:
            if safe_delete_file_by_id(file.id, vector_cleaner, db=db):
                deleted_count += 1
        if deleted_count > 0:
            log.info(f"Deleted {deleted_count} files for user {user_id}")
    except Exception as e:
        log.error(f"Error deleting files for user {user_id}: {e}")
    return deleted_count


def cleanup_orphaned_uploads(active_file_ids: Set[str]) -> int:
    """
    Clean up orphaned files in the uploads directory.

    Returns the number of files deleted.
    """
    upload_dir = Path(CACHE_DIR).parent / "uploads"
    if not upload_dir.exists():
        return 0
    deleted_count = 0
    try:
        for file_path in upload_dir.iterdir():
            if not file_path.is_file():
                continue
            filename = file_path.name
            file_id = None
            # Extract file ID from filename patterns
            if len(filename) > 36:
                potential_id = filename[:36]
                if potential_id.count("-") == 4:
                    file_id = potential_id
            if not file_id and filename.count("-") == 4 and len(filename) == 36:
                file_id = filename
            if not file_id:
                for active_id in active_file_ids:
                    if active_id in filename:
                        file_id = active_id
                        break
            if file_id and file_id not in active_file_ids:
                try:
                    file_path.unlink()
                    deleted_count += 1
                except Exception as e:
                    log.error(f"Failed to delete upload file {filename}: {e}")
    except Exception as e:
        log.error(f"Error cleaning uploads directory: {e}")
    if deleted_count > 0:
        log.info(f"Deleted {deleted_count} orphaned upload files")
    return deleted_count


def delete_inactive_users(
    inactive_days: Optional[int],
    vector_cleaner=None,
    exempt_admin: bool = True,
    exempt_pending: bool = True
) -> int:
    """
    Delete users who have been inactive for the specified number of days.

    If vector_cleaner is provided, also cleans up user files (embeddings,
    physical storage) before deleting the user.

    Args:
        inactive_days: Number of days of inactivity before deletion
        vector_cleaner: Optional vector database cleaner for file cleanup
        exempt_admin: Whether to exempt admin users from deletion
        exempt_pending: Whether to exempt pending users from deletion

    Returns the number of users deleted.
    """
    if inactive_days is None:
        return 0
    cutoff_time = int(time.time()) - (inactive_days * 86400)
    deleted_count = 0
    total_files_deleted = 0
    try:
        users_to_delete = []
        # Get all users and check activity
        all_users = Users.get_users()["users"]
        for user in all_users:
            # Skip if the user is exempt
            if exempt_admin and user.role == "admin":
                continue
            if exempt_pending and user.role == "pending":
                continue
            # Check whether the user is inactive based on last_active_at
            if user.last_active_at < cutoff_time:
                users_to_delete.append(user)
        # Delete inactive users with a shared database session
        with get_db() as db:
            for user in users_to_delete:
                try:
                    # Delete the user's files first (if vector_cleaner was provided).
                    # This ensures proper cleanup of embeddings, physical storage, etc.
                    if vector_cleaner is not None:
                        files_deleted = delete_user_files(user.id, vector_cleaner, db=db)
                        total_files_deleted += files_deleted
                    # Delete the user - CASCADE handles remaining associations
                    Users.delete_user_by_id(user.id, db=db)
                    deleted_count += 1
                    log.info(
                        f"Deleted inactive user: {user.email} (last active: {user.last_active_at})"
                    )
                except Exception as e:
                    log.error(f"Failed to delete user {user.id}: {e}")
    except Exception as e:
        log.error(f"Error during inactive user deletion: {e}")
    if total_files_deleted > 0:
        log.info(f"Total files deleted from inactive users: {total_files_deleted}")
    return deleted_count


def cleanup_audio_cache(max_age_days: Optional[int] = 30) -> int:
    """
    Clean up audio cache files older than the specified number of days.

    Returns:
        Number of files deleted
    """
    if max_age_days is None:
        log.info("Skipping audio cache cleanup (max_age_days is None)")
        return 0
    cutoff_time = time.time() - (max_age_days * 86400)
    deleted_count = 0
    total_size_deleted = 0
    audio_dirs = [
        Path(CACHE_DIR) / "audio" / "speech",
        Path(CACHE_DIR) / "audio" / "transcriptions",
    ]
    for audio_dir in audio_dirs:
        if not audio_dir.exists():
            continue
        try:
            for file_path in audio_dir.iterdir():
                if not file_path.is_file():
                    continue
                stat_info = file_path.stat()
                file_mtime = stat_info.st_mtime
                if file_mtime < cutoff_time:
                    try:
                        file_size = stat_info.st_size
                        file_path.unlink()
                        deleted_count += 1
                        total_size_deleted += file_size
                        log.debug(f"Deleted audio cache file: {file_path} ({file_size} bytes)")
                    except Exception as e:
                        log.error(f"Failed to delete audio file {file_path}: {e}")
        except Exception as e:
            log.error(f"Error cleaning audio directory {audio_dir}: {e}")
    log.info(f"Audio cache cleanup: deleted {deleted_count} files, freed {total_size_deleted} bytes")
    return deleted_count
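

# Minimal manual smoke test (assumption: this runs inside an Open WebUI
# backend environment where prune_imports resolves). These counting helpers
# are read-only, so this prints totals without deleting anything.
if __name__ == "__main__":
    logging.basicConfig(level=logging.INFO)
    print("old chats (>90d):", count_old_chats(90, exempt_archived=True, exempt_in_folders=True))
    print("orphaned chat messages:", count_orphaned_chat_messages())
    print("stale audio cache files (>30d):", count_audio_cache_files(30))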