Skip to content

Commit c40974a

Browse files
authored
fix(pgvector): make doc deletion query faster (#289)
Slow-query logging has also been enabled in the CI, although it is not certain that its output will be visible there.

Below is sample output for the slow deletion query; the culprit was a missing index on the `source_id` foreign key in the `access_list` table.

Calculated time (sum of the per-node actual times in the plan): 3.495 + 0.310 + 0.129 = 3.934 ms
Actual time: 201177.123 ms, or about 201 s

```
Query Text: DELETE FROM docs WHERE docs.source_id IN ($1::VARCHAR, $2::VARCHAR, ..., $275::VARCHAR) RETURNING docs.chunks
Query Parameters: ...
Delete on docs (cost=1126.32..2018.25 rows=275 width=6) (actual time=0.192..3.495 rows=218 loops=1)
  -> Bitmap Heap Scan on docs (cost=1126.32..2018.25 rows=275 width=6) (actual time=0.144..0.310 rows=218 loops=1)
       Recheck Cond: ((source_id)::text = ANY ('{"files__default: 20392","files__default: 23092", ... }'::text[]))
       Heap Blocks: exact=25
       -> Bitmap Index Scan on docs_pkey (cost=0.00..1125.56 rows=275 width=0) (actual time=0.129..0.129 rows=218 loops=1)
            Index Cond: ((source_id)::text = ANY ('{"files__default: 20392", ...
2026-03-19 11:28:59.760 UTC [6703] LOG: duration: 201177.123 ms execute <unnamed>: DELETE FROM docs WHERE docs.source_id IN ($1::VARCHAR, $2::VARCHAR, ..., $275::VARCHAR) RETURNING docs.chunks
2026-03-19 11:28:59.760 UTC [6703] DETAIL: Parameters: $1 = 'files__default: 20392', $2 = ...
```

(The chunking part has been put in a different PR.)
2 parents 0d3308f + a10e8ce commit c40974a

5 files changed

Lines changed: 158 additions & 10 deletions

File tree

.github/workflows/integration-test.yml

Lines changed: 40 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -92,6 +92,41 @@ jobs:
9292
options: --health-cmd pg_isready --health-interval 5s --health-timeout 2s --health-retries 5 --name postgres --hostname postgres
9393

9494
steps:
95+
- name: Enable PostgreSQL slow query logging and auto_explain
96+
run: |
97+
docker exec postgres bash -c "\
98+
cat >> /var/lib/postgresql/data/postgresql.conf <<EOF
99+
100+
# slow query logging
101+
log_min_duration_statement = 20
102+
103+
# auto_explain for slow queries
104+
session_preload_libraries = 'auto_explain'
105+
auto_explain.log_min_duration = '20ms'
106+
auto_explain.log_analyze = on
107+
auto_explain.log_buffers = on
108+
auto_explain.log_nested_statements = on
109+
auto_explain.log_verbose = on
110+
111+
# file-based logging
112+
logging_collector = on
113+
log_directory = '/var/log/pg_log'
114+
log_filename = 'postgresql.log'
115+
log_file_mode = 0644
116+
EOF"
117+
118+
# create the log directory and set ownership for postgres user (logging_collector needs it)
119+
docker exec postgres bash -c "mkdir -p /var/log/pg_log && chown -R postgres:postgres /var/log/pg_log"
120+
121+
# logging_collector requires a restart to take effect
122+
docker restart postgres
123+
# wait for postgres to be ready again (max 60 seconds)
124+
timeout 60 bash -c 'until docker exec postgres pg_isready -U root; do sleep 1; done' || { echo "Error: PostgreSQL did not become ready within 60 seconds"; docker logs postgres --tail 50; exit 1; }
125+
126+
# verify the config has been loaded
127+
docker exec postgres psql -U root -d nextcloud -c "SHOW log_min_duration_statement;"
128+
docker exec postgres psql -U root -d nextcloud -c "SHOW session_preload_libraries;"
129+
95130
- name: Checkout server
96131
uses: actions/checkout@11bd71901bbe5b1630ceea73d27597364c9af683 # v4
97132
with:
@@ -378,6 +413,11 @@ jobs:
378413
/tmp/0_pgdump_nextcloud
379414
/tmp/1_pgdump_nextcloud
380415
416+
- name: Show PostgreSQL slow query logs
417+
if: always()
418+
run: |
419+
docker exec postgres cat /var/log/pg_log/postgresql.log
420+
381421
- name: Final stats log
382422
if: always()
383423
run: |

README.md

Lines changed: 2 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -124,10 +124,10 @@ v2.1.0 introduces repair steps. These run on app startup.
124124

125125
`repair2001_date20240412153300.py` removes the existing config.yaml in the persistent storage for the
126126
hardware detection to run and place a suitable config (based on accelerator detected) in its place.
127-
To skip this step (or steps in the future), populate the `repair.info` file with the repair file name(s).
127+
To skip this step (or steps in the future), add the repair filename(s) to `repair.info` in the persistent storage, one filename per line.
128128
Use the command below inside the container, or add the repair filename manually to the `repair.info` file inside the Docker container at `/nc_app_context_chat_backend_data`.
129129

130-
`echo repair2001_date20240412153300.py > "$APP_PERSISTENT_STORAGE/repair.info"`
130+
`echo repair2001_date20240412153300.py >> "$APP_PERSISTENT_STORAGE/repair.info"`
131131

132132
#### How to generate a repair step file
133133
`APP_VERSION` should at least be incremented at the minor level (MAJOR.MINOR.PATCH)
Lines changed: 28 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,28 @@
1+
#
2+
# SPDX-FileCopyrightText: 2026 Nextcloud GmbH and Nextcloud contributors
3+
# SPDX-License-Identifier: AGPL-3.0-or-later
4+
#
5+
import os
6+
7+
import sqlalchemy as sa
8+
9+
'''
10+
Add an index on access_list.source_id to speed up ON DELETE CASCADE
11+
triggered when deleting from the docs table.
12+
Without this index, the CASCADE performs a sequential scan of access_list
13+
for each deleted doc row, causing very slow batch deletes.
14+
'''
15+
16+
17+
def run(_previous_version: int):
    '''
    Create the idx_access_list_source_id index on access_list (source_id)
    if it does not exist yet.

    The database URL is read from the CCB_DB_URL environment variable.
    When it is unset or empty the step prints a notice and does nothing.

    Args
    ----
    _previous_version: int
        Previously installed app version as passed by the repair runner,
        unused by this step.

    Raises
    ------
    Any exception raised by SQLAlchemy while connecting or executing the
    statement is propagated to the caller.
    '''
    db_url = os.environ.get('CCB_DB_URL')
    if not db_url:
        print('CCB_DB_URL not set, skipping access_list index migration', flush=True)
        return

    engine = sa.create_engine(db_url)
    try:
        # begin() commits when the block exits normally and rolls back on error
        with engine.begin() as conn:
            conn.execute(sa.text(
                'CREATE INDEX IF NOT EXISTS idx_access_list_source_id ON access_list (source_id)'
            ))
    finally:
        # this engine is only needed for the one statement above, so release
        # its pooled connections instead of leaving them open
        engine.dispose()

context_chat_backend/repair/runner.py

Lines changed: 84 additions & 8 deletions
Original file line numberDiff line numberDiff line change
@@ -7,6 +7,11 @@
77
import re
88
from importlib import import_module
99

10+
REPAIR_DIR = 'context_chat_backend/repair'
11+
VERSION_INFO_FILE = 'version.info'
12+
REPAIR_SKIP_FILE = 'repair.info'
13+
PARTIAL_REPAIR_FILE = 'partial_repair.tmp'
14+
1015

1116
def get_previous_version(version_info_path: str) -> tuple[int, bool]:
1217
'''
@@ -15,8 +20,15 @@ def get_previous_version(version_info_path: str) -> tuple[int, bool]:
1520
if not os.path.exists(version_info_path):
1621
return (0, False)
1722

18-
with open(version_info_path) as f:
19-
version_string = f.read().strip()
23+
try:
24+
with open(version_info_path) as f:
25+
version_string = f.read().strip()
26+
except OSError as e:
27+
print(
28+
f'Warning: could not read {version_info_path}, assuming no previous version was installed: {e}',
29+
flush=True,
30+
)
31+
return (0, False)
2032

2133
if not version_string:
2234
return (0, False)
@@ -33,25 +45,55 @@ def get_previous_version(version_info_path: str) -> tuple[int, bool]:
3345
return (int(major + minor.zfill(3)), repairs_pending)
3446

3547

48+
def get_skipped_repairs(persistent_storage_path: str) -> set[str]:
    '''
    Read the repair filenames listed in the skip file (REPAIR_SKIP_FILE)
    inside the persistent storage directory.

    Args
    ----
    persistent_storage_path: str
        Directory in which the skip file is looked up.

    Returns
    -------
    set[str]
        The non-blank lines of the skip file with surrounding whitespace
        removed. An empty set is returned when the file does not exist or
        cannot be read (a warning is printed in the latter case).
    '''
    repair_info_path = os.path.join(persistent_storage_path, REPAIR_SKIP_FILE)

    # open directly instead of checking for existence first: a missing file is
    # the normal case and is handled silently, any other I/O error is reported
    try:
        with open(repair_info_path) as f:
            return {name for name in map(str.strip, f) if name}
    except FileNotFoundError:
        return set()
    except OSError as e:
        print(f'Warning: could not read {repair_info_path}, no repairs will be skipped: {e}', flush=True)
        return set()
59+
60+
3661
def main():
3762
'''
3863
Run repairs that have not been run before.
3964
Repair files can either have no functions or a run() function.
65+
To skip a repair, add its filename to repair.info in the persistent storage.
4066
'''
4167
print('Running repairs...', flush=True)
4268

4369
persistent_storage_path = os.getenv('APP_PERSISTENT_STORAGE', 'persistent_storage')
44-
version_info_path = os.path.join(persistent_storage_path, 'version.info')
70+
version_info_path = os.path.join(persistent_storage_path, VERSION_INFO_FILE)
71+
partial_repair_path = os.path.join(persistent_storage_path, PARTIAL_REPAIR_FILE)
4572

46-
all_filenames = os.listdir('context_chat_backend/repair')
47-
repair_filenames = [f for f in all_filenames if f.startswith('repair') and f.endswith('.py')]
73+
try:
74+
all_filenames = os.listdir(REPAIR_DIR)
75+
except OSError as e:
76+
print(f'Error: could not list repair directory to get all the eligible repairs: {e}', flush=True)
77+
raise
78+
repair_filenames = sorted(f for f in all_filenames if f.startswith('repair') and f.endswith('.py'))
4879

4980
(previous_app_version, repairs_pending) = get_previous_version(version_info_path)
5081

5182
if not repairs_pending:
5283
print('No repairs are required.', flush=True)
5384
return
5485

86+
skipped_repairs = get_skipped_repairs(persistent_storage_path)
87+
88+
try:
89+
with open(partial_repair_path) as f:
90+
partial_repairs = {line.strip() for line in f if line.strip()}
91+
except FileNotFoundError:
92+
partial_repairs = set()
93+
except OSError as e:
94+
print(f'Warning: could not read {partial_repair_path}, all pending repairs will be re-run: {e}', flush=True)
95+
partial_repairs = set()
96+
5597
for repair_filename in repair_filenames:
5698
pattern = re.compile(r'^repair(\d+)_date\d+\.py$')
5799
matches = pattern.match(repair_filename)
@@ -65,16 +107,50 @@ def main():
65107
print(f'No repairs to run for version {introduced_version}.', flush=True)
66108
continue
67109

110+
if repair_filename in skipped_repairs:
111+
print(f'Skipping repair {repair_filename} (listed in repair.info).', flush=True)
112+
continue
113+
114+
if repair_filename in partial_repairs:
115+
print(f'Skipping repair {repair_filename} (already completed in partial run).', flush=True)
116+
continue
117+
68118
print(f'Running repair {repair_filename}...', flush=True, end='')
69119

70120
mod = import_module(f'.repair.{repair_filename[:-3]}', 'context_chat_backend')
71121
if hasattr(mod, 'run'):
72-
mod.run(previous_app_version)
122+
try:
123+
mod.run(previous_app_version)
124+
except Exception:
125+
print(
126+
'failed.\n'
127+
'The app will not continue further until this repair step succeeds, '
128+
'or is skipped through the method described in https://github.com/nextcloud/context_chat_backend/#repair \n' # noqa: E501
129+
'If not skipped, it will be tried again in the next app startup.',
130+
flush=True,
131+
)
132+
raise
133+
134+
try:
135+
with open(partial_repair_path, 'a') as f:
136+
f.write(repair_filename + '\n')
137+
except OSError as e:
138+
print(f'Warning: could not write to {partial_repair_path}: {e}', flush=True)
73139

74140
print('completed.', flush=True)
75141

76-
with open(version_info_path, 'w') as f:
77-
f.write(os.environ['APP_VERSION'] + '+')
142+
try:
143+
if os.path.exists(partial_repair_path):
144+
os.unlink(partial_repair_path)
145+
except OSError as e:
146+
print(f'Warning: could not remove {partial_repair_path}: {e}', flush=True)
147+
148+
try:
149+
with open(version_info_path, 'w') as f:
150+
f.write(os.environ['APP_VERSION'] + '+')
151+
except OSError as e:
152+
print(f'Error: could not write {version_info_path}: {e}', flush=True)
153+
return
78154

79155
print('Repairs completed.', flush=True)
80156

context_chat_backend/vectordb/pgvector.py

Lines changed: 4 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -97,6 +97,10 @@ class AccessListStore(Base):
9797
'source_id',
9898
unique=True,
9999
),
100+
sa.Index(
101+
'idx_access_list_source_id',
102+
'source_id',
103+
)
100104
)
101105

102106
@classmethod

0 commit comments

Comments
 (0)