skip chunk calc in pre calc for non-coarse-grained chunk method

Fallen-Breath · Fallen-Breath · commit 2d48fb65091f · 2026-06-05T00:38:28.000+08:00
to avoid performance degradation on chunking if concurrency > 1
diff --git a/prime_backup/action/helpers/blob_creator_chunked.py b/prime_backup/action/helpers/blob_creator_chunked.py
@@ -5,7 +5,7 @@
 import time
 from concurrent.futures import Future
 from pathlib import Path
-from typing import BinaryIO, Dict, List, Optional
+from typing import BinaryIO, Dict, List, Optional, Union
 
 from prime_backup.action.helpers.blob_creator_common import BlobLookupRoutine, BlobCreateContext, BlobCreatorBase, _BLOB_ALLOC_PERF_MODE
 from prime_backup.action.helpers.blob_pre_calc_result import BlobPrecalculateResult
@@ -88,14 +88,14 @@ def get_or_create(self) -> BlobLookupRoutine[schema.Blob]:
 
 		with contextlib.ExitStack() as es:
 			actual_path_to_read = self.__prepare_path_to_read(es, plan.policy)
-			snapshot = self.__load_or_cut_chunks(actual_path_to_read, plan.pre_cal_result, src_path_str)
+
+			snapshot_or_blob = yield from self.__load_or_cut_chunks(actual_path_to_read, plan.pre_cal_result, src_path_str)
+			if isinstance(snapshot_or_blob, schema.Blob):
+				return snapshot_or_blob
+			snapshot: _ChunkedBlobSnapshot = snapshot_or_blob
 			if snapshot.blob_size == 0:
 				self.log_and_raise_blob_file_changed('Blob size becomes zero', self.args.last_chance)
 
-			if (cache := (yield from self.query_cached_blob(snapshot.blob_hash))) is not None:
-				self.logger.debug('Chunked file {} (hash {}) already exists in DB'.format(src_path_str, snapshot.blob_hash))
-				return cache
-
 			# notes: the following code cannot be interrupted (yield).
 			# The blob is specifically generated by the generator
 			# if any yield is done, ensure to check blob_by_hash_cache again
@@ -155,26 +155,42 @@ def __prepare_path_to_read(self, es: contextlib.ExitStack, policy: _ChunkedBlobC
 			return temp_file_path
 		raise AssertionError('bad policy {!r}'.format(policy))
 
-	def __load_or_cut_chunks(self, actual_path_to_read: Path, pre_cal_result: Optional[BlobPrecalculateResult], src_path_str: str) -> _ChunkedBlobSnapshot:
+	def __load_or_cut_chunks(self, actual_path_to_read: Path, pre_cal_result: Optional[BlobPrecalculateResult], src_path_str: str) -> BlobLookupRoutine[Union[_ChunkedBlobSnapshot, schema.Blob]]:
+		pre_calc_blob_hash: Optional[str] = None
 		if pre_cal_result is not None:
-			chunks = pre_cal_result.chunks
-			blob_hash = pre_cal_result.hash
-			blob_size = pre_cal_result.size
-			self.logger.debug('Cut and hashed file {} with size {} into {} chunks using {} (precalc)'.format(
-				src_path_str, ByteCount(blob_size).auto_str(), len(chunks), self.args.chunk_method.name,
-			))
-			return _ChunkedBlobSnapshot(chunks, blob_hash, blob_size)
+			pre_calc_blob_hash = pre_cal_result.hash
+			if pre_cal_result.chunks is not None:
+				chunks = pre_cal_result.chunks
+				blob_size = pre_cal_result.size
+				self.logger.debug('Cut and hashed file {} with size {} into {} chunks using {} (precalc)'.format(
+					src_path_str, ByteCount(blob_size).auto_str(), len(chunks), self.args.chunk_method.name,
+				))
+				if (cache := (yield from self.query_cached_blob(pre_calc_blob_hash))) is not None:
+					self.logger.debug('Chunked file {} (hash {}) already exists in DB'.format(src_path_str, pre_calc_blob_hash))
+					return cache
+				return _ChunkedBlobSnapshot(chunks, pre_calc_blob_hash, blob_size)
+		
+		if pre_calc_blob_hash is not None:
+			if (cache := (yield from self.query_cached_blob(pre_calc_blob_hash))) is not None:
+				self.logger.debug('Chunked file {} (hash {}) already exists in DB'.format(src_path_str, pre_calc_blob_hash))
+				return cache
 
 		previous_chunks = self.ctx.file_lookup.get_previous_chunks(self.args.src_path) if self.args.chunk_method.needs_previous_chunks() else None
 		chunker = self.args.chunk_method.create_file_chunker(actual_path_to_read, need_entire_file_hash=True, previous_chunks=previous_chunks)
 		with self.ctx.time_costs.measure_time_cost(CreateBackupTimeCostKey.kind_io_read) as chunking_cost:
 			chunks = chunker.cut_all()
 		blob_hash = chunker.get_entire_file_hash()
 		blob_size = chunker.get_read_file_size()
+		if pre_calc_blob_hash is not None and pre_calc_blob_hash != blob_hash:
+			self.log_and_raise_blob_file_changed('Blob hash mismatch, pre calc {}, chunked {}'.format(pre_calc_blob_hash, blob_hash), self.args.last_chance)
+
 		self.logger.debug('Cut and hashed file {} with size {} into {} chunks using {} in {:.2f}s ({}/s)'.format(
 			src_path_str, ByteCount(blob_size).auto_str(), len(chunks), self.args.chunk_method.name, chunking_cost(),
 			ByteCount(blob_size / chunking_cost() if chunking_cost() > 0 else 0).auto_str(),
 		))
+		if pre_calc_blob_hash is None and (cache := (yield from self.query_cached_blob(blob_hash))) is not None:
+			self.logger.debug('Chunked file {} (hash {}) already exists in DB'.format(src_path_str, blob_hash))
+			return cache
 		return _ChunkedBlobSnapshot(chunks, blob_hash, blob_size)
 
 	def __create_missing_chunks(self, actual_path_to_read: Path, snapshot: _ChunkedBlobSnapshot, known_db_chunks: Dict[str, Optional[schema.Chunk]]) -> _ChunkWriteResult:
diff --git a/prime_backup/action/helpers/blob_pre_calc_result.py b/prime_backup/action/helpers/blob_pre_calc_result.py
@@ -1,4 +1,5 @@
 import dataclasses
+import enum
 from pathlib import Path
 from typing import List, IO, Optional, Iterable
 
@@ -8,6 +9,21 @@
 from prime_backup.utils.hash_utils import SizeAndHash
 
 
+class CalcChunkPolicy(enum.Enum):
+	AUTO = enum.auto()
+	FALSE = enum.auto()
+	TRUE = enum.auto()
+
+	def should_calculate_chunks(self, chunk_method: ChunkMethod) -> bool:
+		if self == CalcChunkPolicy.AUTO:
+			return chunk_method.should_precalculate_chunks()
+		if self == CalcChunkPolicy.FALSE:
+			return False
+		if self == CalcChunkPolicy.TRUE:
+			return True
+		raise ValueError('unknown calc chunk policy {!r}'.format(self))
+
+
 @dataclasses.dataclass(frozen=True)
 class BlobPrecalculateResult:
 	class SizeMismatched(Exception):
@@ -16,24 +32,27 @@ class SizeMismatched(Exception):
 	size: int
 	hash: str
 	should_be_chunked: bool
-	chunks: List[PrettyChunk]
+	chunks: Optional[List[PrettyChunk]]
 
 	def simple_repr(self) -> str:
 		return misc_utils.represent(self, attrs={
 			'size': self.size,
 			'hash': self.hash,
 			'should_be_chunked': self.should_be_chunked,
-			'chunks_len': len(self.chunks),
+			'chunks_len': len(self.chunks) if self.chunks is not None else None,
 		})
 
 	@classmethod
-	def from_stream(cls, stream: IO[bytes], rel_path: Path, size: int) -> 'BlobPrecalculateResult':
+	def from_stream(cls, stream: IO[bytes], rel_path: Path, size: int, *, calc_chunk_policy: CalcChunkPolicy = CalcChunkPolicy.AUTO) -> 'BlobPrecalculateResult':
 		chunk_method = ChunkMethod.get_for_file(rel_path, size)
-		chunks: List[PrettyChunk] = []
+		chunks: Optional[List[PrettyChunk]] = None
 		if chunk_method is not None:
-			chunker = chunk_method.create_stream_chunker(stream, need_entire_file_hash=True)
-			chunks = chunker.cut_all()
-			sah = SizeAndHash(chunker.get_read_file_size(), chunker.get_entire_file_hash())
+			if calc_chunk_policy.should_calculate_chunks(chunk_method):
+				chunker = chunk_method.create_stream_chunker(stream, need_entire_file_hash=True)
+				chunks = chunker.cut_all()
+				sah = SizeAndHash(chunker.get_read_file_size(), chunker.get_entire_file_hash())
+			else:
+				sah = hash_utils.calc_reader_size_and_hash(stream)
 		else:
 			sah = hash_utils.calc_reader_size_and_hash(stream)
 		if sah.size != size:
@@ -47,13 +66,16 @@ def from_stream(cls, stream: IO[bytes], rel_path: Path, size: int) -> 'BlobPreca
 		)
 
 	@classmethod
-	def from_file(cls, path: Path, rel_path: Path, size: int, *, previous_chunks: Optional[Iterable[PrettyChunk]] = None) -> 'BlobPrecalculateResult':
+	def from_file(cls, path: Path, rel_path: Path, size: int, *, calc_chunk_policy: CalcChunkPolicy = CalcChunkPolicy.AUTO, previous_chunks: Optional[Iterable[PrettyChunk]] = None) -> 'BlobPrecalculateResult':
 		chunk_method = ChunkMethod.get_for_file(rel_path, size)
-		chunks: List[PrettyChunk] = []
+		chunks: Optional[List[PrettyChunk]] = None
 		if chunk_method is not None:
-			chunker = chunk_method.create_file_chunker(path, need_entire_file_hash=True, previous_chunks=previous_chunks)
-			chunks = chunker.cut_all()
-			sah = SizeAndHash(chunker.get_read_file_size(), chunker.get_entire_file_hash())
+			if calc_chunk_policy.should_calculate_chunks(chunk_method):
+				chunker = chunk_method.create_file_chunker(path, need_entire_file_hash=True, previous_chunks=previous_chunks)
+				chunks = chunker.cut_all()
+				sah = SizeAndHash(chunker.get_read_file_size(), chunker.get_entire_file_hash())
+			else:
+				sah = hash_utils.calc_file_size_and_hash(path)
 		else:
 			sah = hash_utils.calc_file_size_and_hash(path)
 		if sah.size != size:
diff --git a/prime_backup/action/import_backup_action.py b/prime_backup/action/import_backup_action.py
@@ -6,7 +6,7 @@
 
 from prime_backup.action import Action
 from prime_backup.action.helpers.backup_finalizer import BackupFinalizer
-from prime_backup.action.helpers.blob_pre_calc_result import BlobPrecalculateResult
+from prime_backup.action.helpers.blob_pre_calc_result import BlobPrecalculateResult, CalcChunkPolicy
 from prime_backup.action.helpers.blob_recorder import BlobRecorder
 from prime_backup.action.helpers.chunk_grouper import ChunkGrouper
 from prime_backup.action.helpers.pack_writer import PackWriter
@@ -251,14 +251,14 @@ def __import_packed_backup_file(self, session: DbSession, file_holder: PackedBac
 		for i, member in enumerate(members):
 			if member.is_file():
 				with member.open() as f:
-					pre_cal_dict[i] = BlobPrecalculateResult.from_stream(f, Path(member.path), member.size)
+					pre_cal_dict[i] = BlobPrecalculateResult.from_stream(f, Path(member.path), member.size, calc_chunk_policy=CalcChunkPolicy.TRUE)
 
 		for h, blob in session.get_blobs_by_hashes_opt([res.hash for res in pre_cal_dict.values()]).items():
 			if blob is not None:
 				self.__blob_cache[h] = blob
 
 		for h, chunk in session.get_chunks_by_hashes_opt(collection_utils.deduplicated_list(
-			c.hash for res in pre_cal_dict.values() for c in res.chunks
+			c.hash for res in pre_cal_dict.values() for c in (res.chunks or [])
 		)).items():
 			if chunk is not None:
 				self.__chunk_cache[h] = chunk
diff --git a/prime_backup/types/chunk_method.py b/prime_backup/types/chunk_method.py
@@ -52,3 +52,10 @@ def create_stream_chunker(self, stream: IO[bytes], need_entire_file_hash: bool)
 
 	def needs_previous_chunks(self) -> bool:
 		return self.value.needs_previous_chunks()
+
+	def should_precalculate_chunks(self) -> bool:
+		"""
+		Return True if this method is coarse-grained enough for parallel precalculation.
+		Finer chunks can reduce throughput when multiple files are chunked concurrently.
+		"""
+		return self in (ChunkMethod.fastcdc_1m, ChunkMethod.fixed_1m)