Skip to content

Commit 65c5166

Browse files
committed
use binary to store hash in db attempt
1 parent 91dab98 commit 65c5166

29 files changed

Lines changed: 224 additions & 113 deletions

prime_backup/action/delete_backup_action.py

Lines changed: 2 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -12,7 +12,7 @@
1212
from prime_backup.types.backup_info import BackupInfo
1313
from prime_backup.types.blob_info import BlobDeltaSummary
1414
from prime_backup.types.units import ByteCount
15-
from prime_backup.utils import misc_utils
15+
from prime_backup.utils import misc_utils, hash_utils
1616

1717

1818
@dataclasses.dataclass(frozen=True)
@@ -46,7 +46,7 @@ def run(self) -> DeleteBackupResult:
4646
base_fileset_alive = False
4747
for file in session.get_fileset_files(fileset.id):
4848
if file.blob_hash is not None:
49-
deleted_file_hashes.add(file.blob_hash)
49+
deleted_file_hashes.add(hash_utils.to_hash_hex(file.blob_hash))
5050
session.delete_file(file)
5151
session.delete_fileset(fileset)
5252

prime_backup/action/delete_backup_file_action.py

Lines changed: 4 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -14,6 +14,7 @@
1414
from prime_backup.types.backup_info import BackupInfo
1515
from prime_backup.types.blob_info import BlobDeltaSummary
1616
from prime_backup.types.units import ByteCount
17+
from prime_backup.utils import hash_utils
1718
from prime_backup.utils.path_like import PathLike
1819

1920

@@ -87,8 +88,9 @@ def run(self) -> BlobDeltaSummary:
8788
else:
8889
raise AssertionError('unexpected delta file role {} for file {!r}'.format(file_existing.role, file_existing))
8990
if file_existing.blob_hash is not None:
90-
deleted_blob_hashes.add(file_existing.blob_hash)
91-
self.logger.debug('Added blob_hash {} to deleted_blob_hashes'.format(file_existing.blob_hash))
91+
fbh_hex = hash_utils.to_hash_hex(file_existing.blob_hash)
92+
deleted_blob_hashes.add(fbh_hex)
93+
self.logger.debug('Added blob_hash {} to deleted_blob_hashes'.format(fbh_hex))
9294
session.delete_file(file_existing)
9395
elif file_existing.fileset_id != fileset_id_base:
9496
raise AssertionError('unexpected fileset id {} for file {!r}'.format(file_existing.fileset_id, file_existing))

prime_backup/action/delete_file_action.py

Lines changed: 2 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -11,6 +11,7 @@
1111
from prime_backup.db.values import FileIdentifier
1212
from prime_backup.exceptions import FilesetFileNotFound
1313
from prime_backup.types.file_info import FileListSummary
14+
from prime_backup.utils import hash_utils
1415

1516

1617
class DeleteFilesStep(Action[FileListSummary]):
@@ -31,7 +32,7 @@ def run(self) -> FileListSummary:
3132
deleted_blob_hashes: Set[str] = set()
3233
for file in self.files:
3334
if file.blob_hash is not None:
34-
deleted_blob_hashes.add(file.blob_hash)
35+
deleted_blob_hashes.add(hash_utils.to_hash_hex(file.blob_hash))
3536
self.session.delete_file(file)
3637
deleted_file_count = len(self.files)
3738

prime_backup/action/export_backup_action_base.py

Lines changed: 5 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -11,7 +11,7 @@
1111
from prime_backup.types.backup_info import BackupInfo
1212
from prime_backup.types.export_failure import ExportFailures
1313
from prime_backup.types.file_info import FileInfo
14-
from prime_backup.utils import misc_utils
14+
from prime_backup.utils import misc_utils, hash_utils
1515

1616

1717
class ExportBackupActionCommonInitKwargs(TypedDict):
@@ -69,7 +69,9 @@ def _on_unsupported_file_mode(cls, file: schema.File):
6969

7070
@classmethod
7171
def _verify_exported_blob(cls, file: schema.File, written_size: int, written_hash: str):
72+
if file.blob_hash is None:
73+
raise AssertionError('File {!r} has no blob_hash'.format(file))
7274
if written_size != file.blob_raw_size:
7375
raise VerificationError('raw size mismatched for {}, expected {}, actual written {}'.format(file.path, file.blob_raw_size, written_size))
74-
if written_hash != file.blob_hash:
75-
raise VerificationError('hash mismatched for {}, expected {}, actual written {}'.format(file.path, file.blob_hash, written_hash))
76+
if written_hash != (fbh_hex := hash_utils.to_hash_hex(file.blob_hash)):
77+
raise VerificationError('hash mismatched for {}, expected {}, actual written {}'.format(file.path, fbh_hex, written_hash))

prime_backup/action/get_blob_action.py

Lines changed: 4 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -9,6 +9,7 @@
99
from prime_backup.db.session import DbSession
1010
from prime_backup.exceptions import BlobHashNotFound, BlobHashNotUnique
1111
from prime_backup.types.blob_info import BlobInfo
12+
from prime_backup.utils import hash_utils
1213

1314

1415
class _GetBlobActionBase(Action[BlobInfo], ABC):
@@ -24,8 +25,9 @@ def run(self) -> BlobInfo:
2425
"""
2526
with DbAccess.open_session() as session:
2627
blob = self._do_get_blob(session)
27-
file_count = session.get_file_count_by_blob_hashes([blob.hash]) if self.count_files else 0
28-
file_samples = session.get_file_by_blob_hashes([blob.hash], limit=self.sample_file_num) if self.sample_file_num is not None else None
28+
blob_hash_hex = hash_utils.to_hash_hex(blob.hash)
29+
file_count = session.get_file_count_by_blob_hashes([blob_hash_hex]) if self.count_files else 0
30+
file_samples = session.get_file_by_blob_hashes([blob_hash_hex], limit=self.sample_file_num) if self.sample_file_num is not None else None
2931
return BlobInfo.of(blob, file_count=file_count, file_samples=file_samples)
3032

3133
@abstractmethod

prime_backup/action/helpers/backup_finalizer.py

Lines changed: 3 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -4,6 +4,7 @@
44
from prime_backup.action.helpers.fileset_allocator import FilesetAllocateArgs, FilesetAllocator
55
from prime_backup.db import schema
66
from prime_backup.db.session import DbSession
7+
from prime_backup.utils import hash_utils
78

89

910
class BackupFinalizer:
@@ -19,12 +20,12 @@ def finalize_files_and_backup(self, backup: schema.Backup, files: List[schema.Fi
1920
self.session.flush() # ensure all blobs have their blob.id allocated
2021

2122
file_blobs = self.session.get_blobs_by_hashes_opt([
22-
file.blob_hash for file in files
23+
hash_utils.to_hash_hex(file.blob_hash) for file in files
2324
if file.blob_hash is not None
2425
])
2526
for file in files:
2627
if file.blob_hash is not None:
27-
file_blob = file_blobs[file.blob_hash]
28+
file_blob = file_blobs[hash_utils.to_hash_hex(file.blob_hash)]
2829
if file_blob is None:
2930
raise AssertionError('blob of file does not exists: {}'.format(file))
3031
file.blob_id = file_blob.id

prime_backup/action/helpers/blob_allocator.py

Lines changed: 7 additions & 5 deletions
Original file line numberDiff line numberDiff line change
@@ -517,17 +517,19 @@ def __try_get_or_create_chunked_blob(self, src_path: Path, src_path_md5: str, st
517517
compress_method: CompressMethod = self.config.backup.get_compress_method_from_size(chunk.length)
518518
compressor = Compressor.create(compress_method)
519519

520+
chunk_hash_hex = chunk.hash
520521
db_chunk = self.session.create_chunk(
521-
hash=chunk.hash,
522+
hash=chunk_hash_hex,
522523
compress=compress_method.name,
523524
raw_size=len(chunk_buf),
524525
stored_size=-1,
525526
)
526527
new_db_chunks.append(db_chunk)
527-
known_db_chunks[db_chunk.hash] = db_chunk
528+
known_db_chunks[chunk_hash_hex] = db_chunk
528529

529-
def write_task(db_chunk_=db_chunk, compressor_=compressor, chunk_buf_=chunk_buf):
530-
chunk_path = chunk_utils.get_chunk_path(db_chunk_.hash)
530+
def write_task(db_chunk_=db_chunk, compressor_=compressor, chunk_buf_=chunk_buf, chunk_hash_hex_=chunk_hash_hex):
531+
# WARNING: must not reference any non-captured locals here, since they might be modified by another thread
532+
chunk_path = chunk_utils.get_chunk_path(chunk_hash_hex_)
531533
self.__blob_recorder.add_remove_file_rollbacker(chunk_path)
532534

533535
with compressor_.open_compressed_bypassed(chunk_path) as (writer, f):
@@ -724,7 +726,7 @@ def callback(query_rsp: BqmRsp, g=gen):
724726
def __update_blob_cache(self, blob: schema.Blob):
725727
if blob is not None:
726728
self.__blob_by_size_cache[blob.raw_size] = True
727-
self.__blob_by_hash_cache[blob.hash] = blob
729+
self.__blob_by_hash_cache[hash_utils.to_hash_hex(blob.hash)] = blob
728730

729731
def add_existing_sizes(self, existing_sizes: Dict[int, bool]):
730732
self.__blob_by_size_cache.update(existing_sizes)

prime_backup/action/helpers/blob_exporter.py

Lines changed: 2 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -258,7 +258,8 @@ def __verify_exported_blob(self, written_size: int, written_hash: str):
258258
self.__verify_exported_data(lambda: 'blob', self.blob.raw_size, self.blob.hash, written_size, written_hash)
259259

260260
def __verify_exported_chunk(self, chunk: Union[schema.Chunk, ChunkInfo], written_size: int, written_hash: str):
261-
self.__verify_exported_data(lambda: f'chunk {chunk.hash}', chunk.raw_size, chunk.hash, written_size, written_hash)
261+
hash_hex = hash_utils.to_hash_hex(chunk.hash)
262+
self.__verify_exported_data(lambda: f'chunk {hash_hex}', chunk.raw_size, hash_hex, written_size, written_hash)
262263

263264
def __verify_exported_data(self, what: Callable[[], str], expected_size: int, expected_hash: str, written_size: int, written_hash: str):
264265
if written_size != expected_size:

prime_backup/action/helpers/chunk_grouper.py

Lines changed: 4 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -7,7 +7,7 @@
77
from prime_backup.constants import chunk_constants
88
from prime_backup.db import schema
99
from prime_backup.db.session import DbSession
10-
from prime_backup.utils import chunk_utils, collection_utils
10+
from prime_backup.utils import chunk_utils, collection_utils, hash_utils
1111
from prime_backup.utils.time_cost_stats import TimeCostStats
1212

1313

@@ -57,9 +57,9 @@ def create_chunk_groups(self, blob: schema.Blob, blob_chunks: Dict[int, schema.C
5757
needs_cut |= i == len(blob_chunks) - 1 # last chunk group
5858
if len(current_group.chunks) >= chunk_constants.CHUNK_GROUP_MIN_SIZE:
5959
needs_cut |= len(current_group.chunks) >= chunk_constants.CHUNK_GROUP_MAX_SIZE # reach max size
60-
needs_cut |= chunk.hash.endswith('00') # 1/256 chance
60+
needs_cut |= chunk.hash.endswith(b'\0') # last byte is 0 -- 1/256 chance
6161
if needs_cut:
62-
current_group.hash = chunk_utils.create_chunk_group_hash(chunk.hash for chunk in current_group.chunks)
62+
current_group.hash = chunk_utils.create_chunk_group_hash(hash_utils.to_hash_hex(chunk.hash) for chunk in current_group.chunks)
6363
raw_chunk_groups.append(current_group)
6464
chunk_group_hashes_to_chunks[current_group.hash] = current_group.chunks
6565
current_group = _RawChunkGroup()
@@ -77,7 +77,7 @@ def create_chunk_groups(self, blob: schema.Blob, blob_chunks: Dict[int, schema.C
7777
chunk_stored_size_sum=sum({chunk.hash: chunk.stored_size for chunk in cg_chunks}.values()),
7878
)
7979
known_chunk_groups[cg_hash] = new_chunk_group
80-
new_chunk_group_hashes.append(new_chunk_group.hash)
80+
new_chunk_group_hashes.append(hash_utils.to_hash_hex(new_chunk_group.hash))
8181
if len(new_chunk_group_hashes) > 0:
8282
with self.__time_costs.measure_time_cost(CreateBackupTimeCostKey.kind_db):
8383
self.session.flush() # creates chunk_group.id

prime_backup/action/import_backup_action.py

Lines changed: 2 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -24,7 +24,7 @@
2424
from prime_backup.types.standalone_backup_format import StandaloneBackupFormat
2525
from prime_backup.types.tar_format import TarFormat
2626
from prime_backup.types.units import ByteCount
27-
from prime_backup.utils import blob_utils, misc_utils, chunk_utils, collection_utils, file_utils
27+
from prime_backup.utils import blob_utils, misc_utils, chunk_utils, collection_utils, file_utils, hash_utils
2828
from prime_backup.utils.chunker import Chunker
2929
from prime_backup.utils.hash_utils import SizeAndHash
3030

@@ -142,7 +142,7 @@ def __create_blob(self, session: DbSession, file_path: str, file_reader: IO[byte
142142
else:
143143
blob = self.__create_blob_direct(session, file_reader, SizeAndHash(pre_cal_result.size, pre_cal_result.hash))
144144

145-
self.__blob_cache[blob.hash] = blob
145+
self.__blob_cache[hash_utils.to_hash_hex(blob.hash)] = blob
146146
return blob
147147

148148
@classmethod

0 commit comments

Comments
 (0)