chunk migrate compress method

Fallen-Breath · Fallen-Breath · commit fc6baaebf86a · 2026-02-24T01:19:55.000+08:00
diff --git a/TODO.md b/TODO.md
@@ -19,7 +19,7 @@ CDC chunking
 - [ ] Performance optimization
 - [ ] Fuse support
 - [ ] TODO cleanup
-- [ ] Compress migration
+- [x] Compress migration
 - [ ] Document
 
 ## QOL
diff --git a/prime_backup/action/__init__.py b/prime_backup/action/__init__.py
@@ -4,9 +4,10 @@
 import logging
 import threading
 from abc import ABC, abstractmethod
-from typing import TypeVar, Generic
+from typing import TypeVar, Generic, Optional
 
 _T = TypeVar('_T')
+_S = TypeVar('_S')
 
 
 class Action(Generic[_T], ABC):
@@ -18,6 +19,8 @@ def __init__(self):
 		self.logger: logging.Logger = logger.get()
 		self.config: Config = Config.get()
 
+		self.__running_action: Optional[Action] = None
+
 	@abstractmethod
 	def run(self) -> _T:
 		...
@@ -27,5 +30,16 @@ def is_interruptable(self) -> bool:
 
 	def interrupt(self):
 		self.is_interrupted.set()
-
-	# TODO: run_sub_action for interrupt chain
+		if (action := self.__running_action) is not None:
+			action.interrupt()
+
+	def run_action(self, action: 'Action[_S]', auto_interrupt: bool = True) -> _S:
+		if self.__running_action is not None:
+			raise RuntimeError('Cannot run action twice at the same time, current: {}, new: {}'.format(self.__running_action, action))
+		self.__running_action = action
+		try:
+			if auto_interrupt and self.is_interrupted.is_set():
+				action.interrupt()
+			return action.run()
+		finally:
+			self.__running_action = None
diff --git a/prime_backup/action/migrate_compress_method_action.py b/prime_backup/action/migrate_compress_method_action.py
@@ -1,7 +1,8 @@
 import shutil
 import time
+from concurrent.futures import Future
 from pathlib import Path
-from typing import List, Tuple, Set
+from typing import List, Tuple, Set, Dict
 
 from typing_extensions import override
 
@@ -10,8 +11,10 @@
 from prime_backup.db import schema
 from prime_backup.db.access import DbAccess
 from prime_backup.db.session import DbSession
+from prime_backup.db.values import BlobStorageMethod
 from prime_backup.types.size_diff import SizeDiff
-from prime_backup.utils import blob_utils
+from prime_backup.utils import blob_utils, chunk_utils
+from prime_backup.utils.thread_pool import FailFastBlockingThreadPool
 
 _OLD_BLOB_SUFFIX = '_old'
 
@@ -20,16 +23,35 @@ class MigrateCompressMethodAction(Action[SizeDiff]):
 	def __init__(self, new_compress_method: CompressMethod):
 		super().__init__()
 		self.new_compress_method = new_compress_method
+
+		# records changed files
 		self.__migrated_blob_hashes: List[str] = []
+		self.__migrated_chunk_hashes: List[str] = []
+
+		# records affects stuffs for next migration phase
+		self.__affected_chunk_groups_ids: Set[int] = set()
 		self.__affected_fileset_ids: Set[int] = set()
 
+	def __update_files_for_blob_change(self, session: DbSession, changed_blobs_by_hash: Dict[str, schema.Blob]):
+		for file in session.get_file_by_blob_hashes(list(changed_blobs_by_hash.keys())):
+			blob = changed_blobs_by_hash[file.blob_hash]
+			file.blob_compress = blob.compress
+			file.blob_stored_size = blob.stored_size
+			self.__affected_fileset_ids.add(file.fileset_id)
+
 	@classmethod
 	def __get_blob_paths(cls, h: str) -> Tuple[Path, Path]:
 		blob_path = blob_utils.get_blob_path(h)
 		old_trash_path = blob_path.parent / (blob_path.name + _OLD_BLOB_SUFFIX)
 		return blob_path, old_trash_path
 
-	def __migrate_blob(self, blob: schema.Blob) -> bool:
+	@classmethod
+	def __get_chunk_paths(cls, h: str) -> Tuple[Path, Path]:
+		chunk_path = chunk_utils.get_chunk_path(h)
+		old_trash_path = chunk_path.parent / (chunk_path.name + _OLD_BLOB_SUFFIX)
+		return chunk_path, old_trash_path
+
+	def __migrate_single_direct_blob(self, blob: schema.Blob) -> bool:
 		new_compress_method = self.config.backup.get_compress_method_from_size(blob.raw_size, compress_method_override=self.new_compress_method)
 		decompressor = Compressor.create(blob.compress)
 		compressor = Compressor.create(new_compress_method)
@@ -46,26 +68,86 @@ def __migrate_blob(self, blob: schema.Blob) -> bool:
 		blob.stored_size = writer.get_write_len()
 		return True
 
-	def __migrate_blobs_and_sync_files(self, session: DbSession, blobs: List[schema.Blob]):
-		blob_mapping = {}
-		for blob in blobs:
-			try:
-				changed = self.__migrate_blob(blob)
-			except Exception as e:
-				self.logger.error('Migrate blob {} failed: {}'.format(blob, e))
-				raise
-
-			if changed:
-				blob_mapping[blob.hash] = blob
-				self.__migrated_blob_hashes.append(blob.hash)
-
-		for file in session.get_file_by_blob_hashes(list(blob_mapping.keys())):
-			blob = blob_mapping[file.blob_hash]
-			file.blob_compress = blob.compress
-			file.blob_stored_size = blob.stored_size
-			self.__affected_fileset_ids.add(file.fileset_id)
+	def __migrate_direct_blobs_and_sync_files(self, session: DbSession, blobs: List[schema.Blob]):
+		changed_blobs_by_hash: Dict[str, schema.Blob] = {}
+
+		with FailFastBlockingThreadPool('migration') as pool:
+			future_pairs: List[Tuple[schema.Blob, 'Future[bool]']] = []
+			for blob in blobs:
+				future = pool.submit(self.__migrate_single_direct_blob, blob)
+				future_pairs.append((blob, future))
+			for blob, future in future_pairs:
+				try:
+					changed = future.result()
+				except Exception as e:
+					self.logger.error('Migrate blob {} failed: {}'.format(blob, e))
+					raise
+				if changed:
+					changed_blobs_by_hash[blob.hash] = blob
+					self.__migrated_blob_hashes.append(blob.hash)
+
+		self.__update_files_for_blob_change(session, changed_blobs_by_hash)
+
+	def __migrate_single_chunk(self, chunk: schema.Chunk) -> bool:
+		new_compress_method = self.config.backup.get_compress_method_from_size(chunk.raw_size, compress_method_override=self.new_compress_method)
+		decompressor = Compressor.create(chunk.compress)
+		compressor = Compressor.create(new_compress_method)
+		if decompressor.get_method() == compressor.get_method():
+			return False
+
+		chunk_path, old_trash_path = self.__get_chunk_paths(chunk.hash)
+		chunk_path.replace(old_trash_path)
+		with decompressor.open_decompressed(old_trash_path) as f_src:
+			with compressor.open_compressed_bypassed(chunk_path) as (writer, f_dst):
+				shutil.copyfileobj(f_src, f_dst)
+
+		chunk.compress = new_compress_method.name
+		chunk.stored_size = writer.get_write_len()
+		return True
+
+	def __migrate_chunks(self, session: DbSession, chunks: List[schema.Chunk]):
+		changed_chunk_ids: Set[int] = set()
+
+		with FailFastBlockingThreadPool('migration') as pool:
+			future_pairs: List[Tuple[schema.Chunk, 'Future[bool]']] = []
+			for chunk in chunks:
+				future = pool.submit(self.__migrate_single_chunk, chunk)
+				future_pairs.append((chunk, future))
+			for chunk, future in future_pairs:
+				try:
+					changed = future.result()
+				except Exception as e:
+					self.logger.error('Migrate chunk {} failed: {}'.format(chunk, e))
+					raise
+				if changed:
+					changed_chunk_ids.add(chunk.id)
+					self.__migrated_chunk_hashes.append(chunk.hash)
+
+		self.__affected_chunk_groups_ids.update(session.get_chunk_group_ids_by_chunk_ids(list(changed_chunk_ids)))
+
+	def __update_chunk_group_and_blobs_for_chunk_changes(self, session: DbSession):
+		if len(self.__affected_chunk_groups_ids) == 0:
+			return
+
+		chunk_group_id_list = list(self.__affected_chunk_groups_ids)
+		chunk_groups = session.get_chunk_groups_by_ids(chunk_group_id_list)
+		for chunk_group in chunk_groups.values():
+			chunk_group.chunk_stored_size_sum = session.calc_chunk_group_stored_size_sum(chunk_group.id)
+
+		affected_blobs = session.get_blobs_by_chunk_group_ids(chunk_group_id_list)
+		for blob in affected_blobs:
+			if blob.storage_method != BlobStorageMethod.chunked.value:
+				raise AssertionError('Blob {!r} is not a chunked blob'.format(blob))
+			blob.stored_size = session.calc_chunked_blob_stored_size_sum(blob.id)
+		self.__update_files_for_blob_change(session, {blob.hash: blob for blob in affected_blobs})
+
+		self.logger.info('Syncing {} affected chunk groups and {} affected blobs for chunk changes'.format(len(chunk_groups), len(affected_blobs)))
+		session.flush_and_expunge_all()
 
 	def __update_fileset_and_backups(self, session: DbSession):
+		if len(self.__affected_fileset_ids) == 0:
+			return
+
 		fileset_ids = set(self.__affected_fileset_ids)
 		backup_ids = session.get_backup_ids_by_fileset_ids(list(fileset_ids))
 		self.logger.info('Syncing {} affected filesets and {} associated backups'.format(len(fileset_ids), len(backup_ids)))
@@ -86,24 +168,34 @@ def __update_fileset_and_backups(self, session: DbSession):
 			fs_base = filesets[backup.fileset_id_base]
 			fs_delta = filesets[backup.fileset_id_delta]
 			backup.file_stored_size_sum = fs_base.file_stored_size_sum + fs_delta.file_stored_size_sum
+		session.flush_and_expunge_all()
 
-	def __erase_old_blobs(self):
+	def __erase_old_blob_and_chunk_files(self):
 		for h in self.__migrated_blob_hashes:
 			_, old_trash_path = self.__get_blob_paths(h)
 			old_trash_path.unlink()
+		for h in self.__migrated_chunk_hashes:
+			_, old_trash_path = self.__get_chunk_paths(h)
+			old_trash_path.unlink()
 
 	def __rollback(self):
 		for h in self.__migrated_blob_hashes:
 			blob_path, old_trash_path = self.__get_blob_paths(h)
 			if old_trash_path.is_file():
 				old_trash_path.replace(blob_path)
+		for h in self.__migrated_chunk_hashes:
+			chunk_path, old_trash_path = self.__get_chunk_paths(h)
+			if old_trash_path.is_file():
+				old_trash_path.replace(chunk_path)
 
 	@override
 	def run(self) -> SizeDiff:
 		# Notes: requires 2x disk usage of the blob store, stores all blob hashes in memory
-		self.__migrated_blob_hashes.clear()
 		self.logger.info('Migrating compress method to {} (compress threshold = {})'.format(self.new_compress_method.name, self.config.backup.compress_threshold))
 
+		def get_actual_blob_store_stored_size_sum() -> int:
+			return session.get_direct_blob_stored_size_sum() + session.get_chunk_stored_size_sum()
+
 		try:
 			# Blob operation steps:
 			# 1. move xxx -> xxx_old
@@ -112,28 +204,43 @@ def run(self) -> SizeDiff:
 			with DbAccess.open_session() as session:
 				# 0. fetch information before the migration
 				t = time.time()
-				before_size = session.get_blob_stored_size_sum()
-				total_blob_count = session.get_blob_count()
+				before_size = get_actual_blob_store_stored_size_sum()
 
-				# 1. migrate blob objects
+				# 1. migrate direct blob objects
 				cnt = 0
+				total_blob_count = session.get_blob_count()
 				for blobs in session.iterate_blob_batch(batch_size=1000):
 					cnt += len(blobs)
 					self.logger.info('Processing blobs {} / {}'.format(cnt, total_blob_count))
-					self.__migrate_blobs_and_sync_files(session, blobs)
+					self.__migrate_direct_blobs_and_sync_files(session, [
+						blob for blob in blobs
+						if blob.storage_method == BlobStorageMethod.direct.value
+					])
 					session.flush_and_expunge_all()
-
 				if len(self.__migrated_blob_hashes) == 0:
 					self.logger.info('No blob needs a compress method change, nothing to migrate')
 				else:
-					self.logger.info('Migrated {} blobs and related files'.format(len(self.__migrated_blob_hashes)))
+					self.logger.info('Migrated {} direct blobs and related files'.format(len(self.__migrated_blob_hashes)))
 
-					# 3. migrate backup data
-					self.__update_fileset_and_backups(session)
+				# 2. migrate chunks objects, and sync chunk groups and chunked blobs
+				cnt = 0
+				total_chunk_count = session.get_chunk_count()
+				for chunks in session.iterate_chunk_batch(batch_size=1000):
+					cnt += len(chunks)
+					self.logger.info('Processing chunks {} / {}'.format(cnt, total_chunk_count))
+					self.__migrate_chunks(session, chunks)
 					session.flush_and_expunge_all()
+				if len(self.__migrated_chunk_hashes) == 0:
+					self.logger.info('No chunk needs a compress method change, nothing to migrate')
+				else:
+					self.logger.info('Migrated {} chunks and related blob and files'.format(len(self.__migrated_chunk_hashes)))
+
+				# 3. migrate affected fileset and backup data
+				self.__update_chunk_group_and_blobs_for_chunk_changes(session)
+				self.__update_fileset_and_backups(session)
 
 				# 4. output
-				after_size = session.get_blob_stored_size_sum()
+				after_size = get_actual_blob_store_stored_size_sum()
 
 		except Exception:
 			self.logger.warning('Error occurs during compress method migration, applying rollback')
@@ -142,8 +249,8 @@ def run(self) -> SizeDiff:
 
 		else:
 			# 5. migration done, do some cleanup
-			self.logger.info('Cleaning up old blobs')
-			self.__erase_old_blobs()
+			self.logger.info('Cleaning up old blob and chunk files')
+			self.__erase_old_blob_and_chunk_files()
 
 			self.config.backup.compress_method = self.new_compress_method
 			self.logger.info('Compress method migration done, cost {}s'.format(round(time.time() - t, 2)))
diff --git a/prime_backup/db/session.py b/prime_backup/db/session.py
@@ -245,6 +245,12 @@ def get_blob_stored_size_sum(self) -> int:
 		# FIXME: chunked blob
 		return _int_or_0(self.session.execute(func.sum(schema.Blob.stored_size).select()).scalar_one())
 
+	def get_direct_blob_stored_size_sum(self) -> int:
+		return _int_or_0(self.session.execute(
+			func.sum(schema.Blob.stored_size).select().
+			where(schema.Blob.storage_method == BlobStorageMethod.direct.value)
+		).scalar_one())
+
 	def get_blob_raw_size_sum(self) -> int:
 		return _int_or_0(self.session.execute(func.sum(schema.Blob.raw_size).select()).scalar_one())
 
@@ -270,6 +276,13 @@ def filtered_orphan_blob_hashes(self, hashes: List[str]) -> List[str]:
 			if blob_hash not in good_hashes
 		]
 
+	def calc_chunked_blob_stored_size_sum(self, blob_id: int) -> int:
+		return _int_or_0(self.session.execute(
+			select(func.sum(schema.ChunkGroup.chunk_stored_size_sum)).
+			join(schema.BlobChunkGroupBinding, schema.ChunkGroup.id == schema.BlobChunkGroupBinding.chunk_group_id).
+			where(schema.BlobChunkGroupBinding.blob_id == blob_id)
+		).scalar_one())
+
 	# ===================================== Chunk =====================================
 
 	class CreateChunkKwargs(TypedDict):
@@ -332,6 +345,9 @@ def get_chunks_by_hashes(self, hashes: List[str]) -> Dict[str, Optional[schema.C
 				result[chunk.hash] = chunk
 		return result
 
+	def get_chunk_stored_size_sum(self) -> int:
+		return _int_or_0(self.session.execute(func.sum(schema.Chunk.stored_size).select()).scalar_one())
+
 	def list_chunks(self, limit: Optional[int] = None, offset: Optional[int] = None) -> List[schema.Chunk]:
 		s = select(schema.Chunk)
 		if limit is not None:
@@ -763,6 +779,13 @@ def delete_blob_chunk_group_bindings(self, identifiers: List[BlobChunkGroupBindi
 			combined_condition = or_(*conditions)
 			self.session.execute(delete(schema.BlobChunkGroupBinding).where(combined_condition))
 
+	def calc_chunk_group_stored_size_sum(self, chunk_group_id: int) -> int:
+		return _int_or_0(self.session.execute(
+			select(func.sum(schema.Chunk.stored_size)).
+			join(schema.ChunkGroupChunkBinding, schema.Chunk.id == schema.ChunkGroupChunkBinding.chunk_id).
+			where(schema.ChunkGroupChunkBinding.chunk_group_id == chunk_group_id)
+		).scalar_one())
+
 	# ===================================== File =====================================
 
 	class CreateFileKwargs(TypedDict):
diff --git a/prime_backup/utils/thread_pool.py b/prime_backup/utils/thread_pool.py
@@ -2,12 +2,15 @@
 import queue
 import threading
 from concurrent.futures import ThreadPoolExecutor, Future
-from typing import List
+from typing import List, TypeVar, Callable
 
-from typing_extensions import override
+from typing_extensions import override, ParamSpec
 
 from prime_backup.utils import misc_utils
 
+_T = TypeVar('_T')
+_P = ParamSpec('_P')
+
 
 class FailFastBlockingThreadPool(ThreadPoolExecutor):
 	"""
@@ -27,7 +30,7 @@ def __init__(self, name: str):
 		self.__error_futures: 'queue.Queue[Future]' = queue.Queue()
 
 	@override
-	def submit(self, __fn, *args, **kwargs):
+	def submit(self, __fn: Callable[_P, _T], *args: _P.args, **kwargs: _P.kwargs) -> Future[_T]:
 		func = functools.partial(__fn, *args, **kwargs)
 
 		def wrapper_func():