Skip to content

Commit fb25e86

Browse files
authored
fix: drop latin-1 decode in source title and userIds (#306)
Blaming these lines points to issue #71, for which the latin-1 decode was originally added. It was probably needed because the title used to be transported from the context_chat PHP app to the backend in HTTP headers, which mostly support only latin-1. Tracing the sources for this turns up two links: python/cpython#105505 (comment) and https://stackoverflow.com/questions/4400678/what-character-encoding-should-i-use-for-a-http-header/4410331#4410331. Now that this limitation no longer applies after the reversal of the indexing direction, the decode can be dropped. Signed-off-by: kyteinsky <kyteinsky@gmail.com>
1 parent fe0c3a2 commit fb25e86

1 file changed

Lines changed: 4 additions & 12 deletions

File tree

context_chat_backend/chain/ingest/injest.py

Lines changed: 4 additions & 12 deletions
Original file line numberDiff line numberDiff line change
@@ -257,7 +257,7 @@ def _sources_to_indocuments(
257257

258258
metadata = {
259259
'source': source.reference,
260-
'title': _decode_latin_1(source.title),
260+
'title': source.title,
261261
'type': source.type,
262262
}
263263
doc = Document(page_content=content, metadata=metadata)
@@ -271,7 +271,7 @@ def _sources_to_indocuments(
271271

272272
indocuments[db_id] = InDocument(
273273
documents=split_docs,
274-
userIds=list(map(_decode_latin_1, source.userIds)),
274+
userIds=source.userIds,
275275
source_id=source.reference,
276276
provider=source.provider,
277277
modified=source.modified, # pyright: ignore[reportArgumentType]
@@ -299,7 +299,7 @@ def _increase_access_for_existing_sources(
299299
try:
300300
vectordb.update_access(
301301
UpdateAccessOp.ALLOW,
302-
list(map(_decode_latin_1, source.userIds)),
302+
source.userIds,
303303
source.reference,
304304
)
305305
results[db_id] = None
@@ -390,22 +390,14 @@ def _process_sources(
390390
return source_proc_results
391391

392392

393-
def _decode_latin_1(s: str) -> str:
394-
try:
395-
return s.encode('latin-1').decode('utf-8')
396-
except UnicodeDecodeError:
397-
logger.error('Failed to decode latin-1: %s', s)
398-
return s
399-
400-
401393
def embed_sources(
402394
vectordb_loader: VectorDBLoader,
403395
config: TConfig,
404396
sources: Mapping[int, SourceItem | ReceivedFileItem]
405397
) -> Mapping[int, IndexingError | None]:
406398
logger.debug('Embedding sources:', extra={
407399
'source_ids': [
408-
f'{source.reference} ({_decode_latin_1(source.title)})'
400+
f'{source.reference} ({source.title})'
409401
for source in sources.values()
410402
],
411403
'len(source_ids)': len(sources),

0 commit comments

Comments
 (0)