fix: handle validation errors of files and content providers individually #918
Workflow file for this run
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
| # SPDX-FileCopyrightText: 2025 Nextcloud GmbH and Nextcloud contributors | |
| # SPDX-License-Identifier: AGPL-3.0-or-later | |
name: Integration test

# Triggered by every pull request and by pushes to the long-lived branches.
on:
  pull_request:
  push:
    branches:
      - 'master'
      - 'stable*'

# Workflow-wide default: the token may only read repository contents.
permissions:
  contents: read

# Runs are grouped per PR head branch (or per run id when there is none);
# a newer run in the same group cancels the one still in progress.
concurrency:
  group: integration-test-${{ github.head_ref || github.run_id }}
  cancel-in-progress: true

jobs:
# Path filter job: reports through the `src` output whether any file listed
# below was touched.
changes:
  runs-on: ubuntu-latest

  permissions:
    contents: read
    pull-requests: read

  outputs:
    src: ${{ steps.changes.outputs.src }}

  steps:
    - id: changes
      uses: dorny/paths-filter@de90cc6fb38fc0963ad72b210f1f284cd68cea36 # v3.0.2
      # a failure of the filter itself must not fail this job; the dependent
      # jobs only skip when the output is explicitly 'false'
      continue-on-error: true
      with:
        # NOTE(review): 'project.toml' may have been meant as 'pyproject.toml' - confirm
        filters: |
          src:
            - 'main.py'
            - 'main_em.py'
            - 'config.cpu.yaml'
            - 'config.gpu.yaml'
            - 'context_chat_backend/**'
            - 'appinfo/**'
            - 'example.env'
            - 'hwdetect.sh'
            - 'persistent_storage/**'
            - 'project.toml'
            - 'requirements.txt'
            - 'logger_config.yaml'
            - 'logger_config_em.yaml'
            - 'supervisord.conf'
            - '.github/workflows/integration-test.yml'
# Main test job: installs Nextcloud next to the backend and runs the
# end-to-end indexing and prompt checks against it.
integration:
  name: Integration test on ${{ matrix.server-versions }} php@${{ matrix.php-versions }}
  runs-on: ubuntu-24.04
  needs: changes
  # skipped only when the path filter explicitly reported no relevant change
  if: needs.changes.outputs.src != 'false'

  strategy:
    # do not stop on another job's failure
    fail-fast: false
    matrix:
      php-versions: ['8.2']
      databases: ['pgsql']
      server-versions: ['stable32', 'stable33', 'master']

  env:
    # host ports the two database service containers are published on
    MYSQL_PORT: 4444
    PGSQL_PORT: 4445
    # use the same db for ccb and nextcloud
    CCB_DB_URL: postgresql+psycopg://root:rootpassword@localhost:4445/nextcloud

  services:
    mysql:
      image: mariadb:10.5
      ports:
        - 4444:3306/tcp
      env:
        MYSQL_ROOT_PASSWORD: rootpassword
      options: >-
        --health-cmd="mysqladmin ping"
        --health-interval 5s
        --health-timeout 2s
        --health-retries 5

    postgres:
      image: pgvector/pgvector:pg17
      ports:
        - 4445:5432/tcp
      env:
        POSTGRES_USER: root
        POSTGRES_PASSWORD: rootpassword
        POSTGRES_DB: nextcloud
      # the fixed container name lets the steps address it via `docker exec postgres`
      options: >-
        --health-cmd pg_isready
        --health-interval 5s
        --health-timeout 2s
        --health-retries 5
        --name postgres
        --hostname postgres

  steps:
- name: Enable PostgreSQL slow query logging and auto_explain
  run: |
    # append the logging settings to the server config inside the service container
    docker exec postgres bash -c "\
    cat >> /var/lib/postgresql/data/postgresql.conf <<EOF
    # slow query logging
    log_min_duration_statement = 20
    # auto_explain for slow queries
    session_preload_libraries = 'auto_explain'
    auto_explain.log_min_duration = '20ms'
    auto_explain.log_analyze = on
    auto_explain.log_buffers = on
    auto_explain.log_nested_statements = on
    auto_explain.log_verbose = on
    # file-based logging
    logging_collector = on
    log_directory = '/var/log/pg_log'
    log_filename = 'postgresql.log'
    log_file_mode = 0644
    EOF"

    # the logging collector writes as the postgres user, so the directory must exist and belong to it
    docker exec postgres bash -c "mkdir -p /var/log/pg_log && chown -R postgres:postgres /var/log/pg_log"

    # logging_collector only takes effect after a restart
    docker restart postgres

    # give postgres at most 60 seconds to accept connections again
    if ! timeout 60 bash -c 'until docker exec postgres pg_isready -U root; do sleep 1; done'; then
      echo "Error: PostgreSQL did not become ready within 60 seconds"
      docker logs postgres --tail 50
      exit 1
    fi

    # print the effective values to confirm the new config has been loaded
    for setting in log_min_duration_statement session_preload_libraries; do
      docker exec postgres psql -U root -d nextcloud -c "SHOW $setting;"
    done
# Nextcloud server is checked out into the workspace root at the matrix branch.
- name: Checkout server
  uses: actions/checkout@11bd71901bbe5b1630ceea73d27597364c9af683 # v4
  with:
    repository: nextcloud/server
    ref: ${{ matrix.server-versions }}
    submodules: recursive
    persist-credentials: false

- name: Set up php ${{ matrix.php-versions }}
  uses: shivammathur/setup-php@9e72090525849c5e82e596468b86eb55e9cc5401 # v2
  with:
    php-version: ${{ matrix.php-versions }}
    tools: phpunit
    extensions: >-
      mbstring, iconv, fileinfo, intl, sqlite, pdo_mysql, pdo_sqlite,
      pgsql, pdo_pgsql, gd, zip
# The PHP companion app goes into the server's apps directory.
- name: Checkout context_chat php app
  uses: actions/checkout@11bd71901bbe5b1630ceea73d27597364c9af683 # v4
  with:
    repository: nextcloud/context_chat
    path: apps/context_chat
    persist-credentials: false

# No `repository` given: this checks out the repository the workflow runs in.
- name: Checkout backend
  uses: actions/checkout@11bd71901bbe5b1630ceea73d27597364c9af683 # v4
  with:
    path: context_chat_backend/
    persist-credentials: false

- name: Checkout app_api
  uses: actions/checkout@11bd71901bbe5b1630ceea73d27597364c9af683 # v4
  with:
    repository: nextcloud/app_api
    # the server's 'master' maps to app_api's 'main'; stable branches use the same name
    ref: ${{ matrix.server-versions == 'master' && 'main' || matrix.server-versions }}
    path: apps/app_api
    persist-credentials: false

# Reads the backend version out of its info.xml; used later when registering the app.
- name: Get app version
  id: appinfo
  uses: skjnldsv/xpath-action@7e6a7c379d0e9abc8acaef43df403ab4fc4f770c # master
  with:
    filename: context_chat_backend/appinfo/info.xml
    expression: '/info/version/text()'
# Exactly one of the two install steps runs, depending on the matrix database.
- name: Set up Nextcloud MYSQL
  if: ${{ matrix.databases != 'pgsql' }}
  run: |
    sleep 25
    mkdir data
    ./occ maintenance:install --verbose \
      --database=${{ matrix.databases }} --database-name=nextcloud \
      --database-host=127.0.0.1 --database-port=$MYSQL_PORT \
      --database-user=root --database-pass=rootpassword \
      --admin-user admin --admin-pass password
    # the web server keeps running in the background for the following steps
    composer run serve &

- name: Set up Nextcloud PGSQL
  if: ${{ matrix.databases == 'pgsql' }}
  run: |
    sleep 25
    mkdir data
    ./occ maintenance:install --verbose \
      --database=${{ matrix.databases }} --database-name=nextcloud \
      --database-host=127.0.0.1 --database-port=$PGSQL_PORT \
      --database-user=root --database-pass=rootpassword \
      --admin-user admin --admin-pass password
    # the web server keeps running in the background for the following steps
    composer run serve &
- name: Enable context_chat, app_api and testing
  run: |
    ./occ app:enable -vvv -f context_chat app_api testing

# The documentation repository is placed in admin's files; it is the corpus that gets indexed.
- name: Checkout documentation
  uses: actions/checkout@11bd71901bbe5b1630ceea73d27597364c9af683 # v4
  with:
    repository: nextcloud/documentation
    path: data/admin/files/documentation
    persist-credentials: false
- name: Prepare docs
  run: |
    cd data/admin/files
    # keep the admin manual as it is
    mv documentation/admin_manual .
    # two copies of the developer manual: one with .rst renamed to .md, one to .txt
    cp -R documentation/developer_manual .
    cp -R documentation/developer_manual ./developer_manual2
    find developer_manual -type f -name "*.rst" -exec bash -c 'mv "$0" "${0%.rst}.md"' {} \;
    find developer_manual2 -type f -name "*.rst" -exec bash -c 'mv "$0" "${0%.rst}.txt"' {} \;
    # drop what is left of the checkout
    rm -rf documentation

# Registers the files created above in Nextcloud's file cache.
- name: Run files scan
  run: ./occ files:scan --all
- name: Setup python 3.11
  uses: actions/setup-python@42375524e23c412d93fb67b49958b491fce71c38 # v5
  with:
    python-version: '3.11'
    cache: 'pip'
    # The backend is checked out into context_chat_backend/ and its
    # requirements.txt is installed from there, so the cache key must be
    # derived from that file rather than from a path in the workspace root.
    cache-dependency-path: |
      context_chat_backend/requirements.txt
# Installs the backend's Python dependencies and starts both backend
# processes in the background; the PID of main.py is stored in pid.txt
# (workspace root) for the later memory and liveness checks.
- name: Install and init backend
  run: |
    cd context_chat_backend
    pip install --upgrade pip setuptools wheel
    # use the cpu version of torch to not run out of space
    pip install torch torchvision --index-url https://download.pytorch.org/whl/cpu
    # -E is required here: in sed's default (basic) regex syntax "(", ")" and "?"
    # are literal characters, so without it the pattern never matches and the
    # torch/torchvision lines stay in requirements.txt
    sed -E -i '/torch(vision)?/d' requirements.txt
    pip install -r requirements.txt
    cp example.env .env
    echo "NEXTCLOUD_URL=http://localhost:8080" >> .env
    python3 -u ./main_em.py > em_backend_logs 2>&1 &
    python3 -u ./main.py > backend_logs 2>&1 &
    echo $! > ../pid.txt # Save the process ID (PID) of main.py, the last background job
    sleep 60 # Wait for the backend to get ready
- name: Register backend
  run: |
    # register a manual-install daemon, then the backend as an app on that daemon
    timeout 10 ./occ app_api:daemon:register --net host \
      manual_install "Manual Install" manual-install http localhost http://localhost:8080
    # the version in the JSON info comes from the "Get app version" step
    timeout 120 ./occ app_api:app:register context_chat_backend manual_install \
      --json-info "{\"appid\":\"context_chat_backend\",\"name\":\"Context Chat Backend\",\"daemon_config_name\":\"manual_install\",\"version\":\"${{ fromJson(steps.appinfo.outputs.result).version }}\",\"secret\":\"12345\",\"port\":10034,\"scopes\":[],\"system_app\":0}" \
      --force-scopes --wait-finish
    ls -la context_chat_backend/persistent_storage/*
# Baseline memory share of the backend process, compared again at the end of the job.
- name: Initial memory usage check
  run: |
    backend_pid=$(cat pid.txt)
    ps -p "$backend_pid" -o pid,cmd,%mem,rss --sort=-%mem
    ps -p "$backend_pid" -o %mem --no-headers > initial_mem.txt

- name: Run cron jobs
  run: |
    # background loop: run cron.php every 10 seconds indefinitely
    while true; do
      php cron.php
      sleep 10
    done &
    # let a few cron runs pass, then list all the bg jobs
    sleep 30
    ./occ background-job:list

- name: Initial dump of DB with context_chat_queue populated
  if: always()
  run: docker exec postgres pg_dump nextcloud > /tmp/0_pgdump_nextcloud
# Polls `occ context_chat:stats` every 10 seconds (90 attempts) until the number
# of indexed documents is within 3% of the eligible files. Fails when the
# backend process dies or when the attempts are used up.
- name: Periodically check context_chat stats for 15 minutes to allow the backend to index the files
  run: |
    success=0
    echo "::group::Checking stats periodically for 15 minutes to allow the backend to index the files"
    for i in {1..90}; do
      # Check if backend is still alive. This runs first in every iteration so the
      # retry paths below (which `continue`) cannot skip it.
      # grep -q replaces `grep -c ... || echo "0"`: grep -c already prints "0" when
      # nothing matches and exits non-zero, so the fallback produced two lines and
      # the numeric test on them errored out instead of detecting the dead process.
      if ! ps -p "$(cat pid.txt)" -o cmd= | grep -q "main.py"; then
        echo "Error: Context Chat Backend process is not running. Exiting."
        exit 1
      fi

      echo "Checking stats, attempt $i..."
      stats_err=$(mktemp)
      stats_exit=0
      # stderr is captured separately so that stdout stays parseable JSON
      stats=$(timeout 30 ./occ context_chat:stats --json 2>"$stats_err") || stats_exit=$?
      echo "Stats output:"
      echo "$stats"
      if [ -s "$stats_err" ]; then
        echo "Stderr:"
        cat "$stats_err"
      fi
      echo "---"
      rm -f "$stats_err"

      # Check for critical errors in output
      if [ $stats_exit -ne 0 ] || echo "$stats" | grep -q "Error during request"; then
        echo "Backend connection error detected (exit=$stats_exit), retrying..."
        sleep 10
        continue
      fi

      # Extract total eligible files
      total_eligible_files=$(echo "$stats" | jq -r '.eligible_files_count' || true)
      # Extract indexed documents count (files__default), 0 when the key is missing
      indexed_count=$(echo "$stats" | jq -r '.vectordb_document_counts.files__default // 0' || true)
      echo "Total eligible files: $total_eligible_files"
      echo "Indexed documents (files__default): $indexed_count"

      # Only do arithmetic on real numbers: an empty value is a syntax error and
      # "null" evaluates to 0, which would report a bogus "within tolerance".
      if ! [[ "$total_eligible_files" =~ ^[0-9]+$ && "$indexed_count" =~ ^[0-9]+$ ]]; then
        echo "Stats do not contain usable counts yet, retrying..."
        sleep 10
        continue
      fi

      diff=$((total_eligible_files - indexed_count))
      threshold=$((total_eligible_files * 3 / 100))

      # Check if difference is within tolerance
      if [ $diff -le $threshold ]; then
        echo "Indexing within 3% tolerance (diff=$diff, threshold=$threshold)"
        success=1
        break
      else
        # total_eligible_files cannot be 0 here: that would make diff <= threshold above
        progress=$((diff * 100 / total_eligible_files))
        echo "Outside 3% tolerance: diff=$diff (${progress}%), threshold=$threshold"
      fi

      sleep 10
    done
    echo "::endgroup::"

    if [ $success -ne 1 ]; then
      echo "Max attempts reached"
      exit 1
    fi
# Asks the same question in English and German; both answers must contain the
# expected sentence.
- name: Run the prompts
  run: |
    # two task processing workers run in the background, each logging to its own file
    for n in 1 2; do
      ./occ background-job:worker 'OC\TaskProcessing\SynchronousBackgroundJob' > "worker${n}_logs" 2>&1 &
    done

    expected="If all of these points are met, we give a Green label."

    echo ::group::English prompt
    answer_en=$(./occ context_chat:prompt admin "Which factors are taken into account for the Ethical AI Rating?")
    echo "$answer_en"
    echo "$answer_en" | grep -q "$expected" || exit 1
    echo ::endgroup::

    echo ::group::German prompt
    answer_de=$(./occ context_chat:prompt admin "Welche Faktoren beeinflussen das Ethical AI Rating?")
    echo "$answer_de"
    echo "$answer_de" | grep -q "$expected" || exit 1
    echo ::endgroup::
# Polls the stats every 10 seconds until the sum over all queued document
# counts is 0; fails after the 30th attempt.
- name: Wait for files queue to drain to zero
  run: |
    max_attempts=30
    echo "::group::Waiting up to 5 minutes for documents queue to reach zero"
    for attempt in $(seq 1 "$max_attempts"); do
      stats_json=$(./occ context_chat:stats --json)
      # sum of all per-provider queue sizes, 0 when there are none
      pending=$(jq '[.queued_documents_counts | to_entries[].value] | add // 0' <<< "$stats_json")
      echo "Attempt $attempt: queued_documents=$pending"

      if [[ "$pending" == "0" ]]; then
        echo "Queue is empty"
        break
      fi

      if [[ "$attempt" == "$max_attempts" ]]; then
        echo "Timeout: queue did not drain to zero (queued_documents=$pending)"
        exit 1
      fi

      sleep 10
    done
    echo "::endgroup::"
# Adds NEW_FILES_COUNT empty markdown files plus one non-empty file, then verifies that
#   1. all of them show up in the queue after a files scan,
#   2. the queue drains to zero again with nothing left locked,
#   3. the gap between eligible and indexed files grew by exactly the number
#      of empty files, i.e. only the non-empty file was indexed.
- name: Check if the empty files get identified, not indexed and don't cause any issues with the batch
  run: |
    NEW_FILES_COUNT=5

    echo "::group::Stats before the empty files are added"
    prev_stats=$(./occ context_chat:stats --json)
    echo "$prev_stats"
    prev_scheduled=$(echo "$prev_stats" | jq '.queued_documents_counts.files__default // 0')
    prev_eligible=$(echo "$prev_stats" | jq '.eligible_files_count')
    prev_indexed=$(echo "$prev_stats" | jq '.vectordb_document_counts.files__default // 0')
    echo "queued_documents_counts.files__default before scan: $prev_scheduled"
    echo "::endgroup::"

    # create empty markdown files in admin's home folder
    for i in $(seq 1 $NEW_FILES_COUNT); do
      touch "data/admin/files/test_empty_$i.md"
    done
    # create one new file with content
    echo "hello world" > "data/admin/files/test_filled.md"

    # run files scan so Nextcloud registers the new files
    ./occ files:scan admin

    echo "::group::Confirming new files appear in the queue after scan"
    stats=$(./occ context_chat:stats --json)
    echo "$stats"
    scheduled=$(echo "$stats" | jq '.queued_documents_counts.files__default // 0')
    echo "queued_documents_counts.files__default after scan: $scheduled"
    expected_scheduled=$((prev_scheduled + NEW_FILES_COUNT + 1)) # +1 non-empty file
    if [ "$scheduled" -ne "$expected_scheduled" ]; then
      # the message spells out all three terms of the expected value, including the non-empty file
      echo "Error: expected exactly $expected_scheduled files in the queue (prev=$prev_scheduled + new empty=$NEW_FILES_COUNT + 1 non-empty), got $scheduled"
      exit 1
    fi
    echo "::endgroup::"

    echo "::group::Waiting up to 5 minutes for queue to drain to zero again"
    for i in {1..30}; do
      stats=$(./occ context_chat:stats --json)
      scheduled=$(echo "$stats" | jq '[.queued_documents_counts | to_entries[].value] | add // 0')
      locked=$(echo "$stats" | jq '[.queued_documents_locked_counts | to_entries[].value] | add // 0')
      echo "Attempt $i: queued_documents=$scheduled locked=$locked"
      if [ "$scheduled" = "0" ]; then
        echo "Queue drained"
        break
      fi
      if [ "$i" = "30" ]; then
        echo "Timeout: queue did not drain to zero (queued_documents=$scheduled)"
        exit 1
      fi
      sleep 10
    done
    echo "::endgroup::"

    # locked must be zero ($locked and $stats hold the values of the last poll above)
    if [ "$locked" != "0" ]; then
      echo "Error: expected locked count to be 0, got $locked"
      exit 1
    fi
    echo "Locked count is 0 as expected"

    # the new empty files +1 non-empty file are eligible (valid .md mime type)
    # the gap (eligible - indexed) must have grown by exactly NEW_FILES_COUNT, the empty files count
    eligible=$(echo "$stats" | jq '.eligible_files_count')
    indexed=$(echo "$stats" | jq '.vectordb_document_counts.files__default // 0')
    echo "prev: eligible=$prev_eligible indexed=$prev_indexed"
    echo "eligible_files_count=$eligible vectordb_document_counts.files__default=$indexed"
    prev_gap=$((prev_eligible - prev_indexed))
    curr_gap=$((eligible - indexed))
    gap_diff=$((curr_gap - prev_gap))
    echo "gap before=$prev_gap gap after=$curr_gap difference=$gap_diff expected=$NEW_FILES_COUNT"
    if [ "$gap_diff" -ne "$NEW_FILES_COUNT" ]; then
      echo "Error: expected the eligible-indexed gap to grow by exactly $NEW_FILES_COUNT, got $gap_diff"
      exit 1
    fi
    echo "PASS: empty markdown files counted in eligible_files_count but absent from vectordb_document_counts"
# Second memory reading of the backend process (PID from pid.txt), stored in
# after_prompt_mem.txt.
- name: Check python memory usage
  run: |
    backend_pid=$(cat pid.txt)
    ps -p "$backend_pid" -o pid,cmd,%mem,rss --sort=-%mem
    ps -p "$backend_pid" -o %mem --no-headers > after_prompt_mem.txt
# Compares the %mem reading taken before indexing with the one taken by the
# "Check python memory usage" step. Informational only: an increase is
# reported but does not fail the job.
- name: Compare memory usage and detect leak
  run: |
    initial_mem=$(cat initial_mem.txt | tr -d ' ')
    # read the file the previous step actually writes (after_prompt_mem.txt);
    # the formerly used after_scan_mem.txt is never created, which left
    # final_mem empty and made the comparison below always report "stable"
    final_mem=$(cat after_prompt_mem.txt | tr -d ' ')
    echo "Initial Memory Usage: $initial_mem%"
    echo "Memory Usage after scan: $final_mem%"
    # bc prints 1 when the comparison holds, which (( )) treats as true
    if (( $(echo "$final_mem > $initial_mem" | bc -l) )); then
      echo "Memory usage has increased during scan. Possible memory leak detected!"
    else
      echo "Memory usage during scan is stable. No memory leak detected."
    fi
# Diagnostics: all of these run even when an earlier step failed.
- name: Final dump of DB with vectordb populated
  if: always()
  run: docker exec postgres pg_dump nextcloud > /tmp/1_pgdump_nextcloud

- name: Show server logs
  if: always()
  run: cat data/nextcloud.log

- name: Show context_chat specific logs
  if: always()
  run: cat data/context_chat.log

- name: Show task processing worker logs
  if: always()
  run: |
    # -v prints a header with the file name before each worker log
    tail -v -n +1 worker?_logs || echo "No worker logs"
# Diagnostics for the two backend processes: the redirected stdout/stderr of
# each process and the log files under persistent_storage/logs.
- name: Show main app logs
  if: always()
  run: |
    cat context_chat_backend/backend_logs || echo "No main backend logs"

- name: Show main app JSON logs
  if: always()
  run: |
    tail -v -n +1 context_chat_backend/persistent_storage/logs/ccb.log* || echo "No logs in logs directory"

- name: Show embedding server logs
  if: always()
  run: |
    # fallback message names the embedding server (it previously repeated the main backend's message)
    cat context_chat_backend/em_backend_logs || echo "No embedding server logs"

- name: Show embedding server JSON logs
  if: always()
  run: |
    tail -v -n +1 context_chat_backend/persistent_storage/logs/em_server.log* || echo "No logs in logs directory"
# Publishes both database dumps as an artifact named after the matrix entry.
# NOTE(review): unlike the other actions in this job, this one is referenced by tag, not by commit SHA.
- name: Upload database dumps
  if: always()
  uses: actions/upload-artifact@v4
  with:
    name: database-dumps-${{ matrix.server-versions }}-php@${{ matrix.php-versions }}
    path: |
      /tmp/0_pgdump_nextcloud
      /tmp/1_pgdump_nextcloud

- name: Show PostgreSQL slow query logs
  if: always()
  run: docker exec postgres cat /var/log/pg_log/postgresql.log

# Prints the stats once in the default format and once as JSON.
- name: Final stats log
  if: always()
  run: |
    ./occ context_chat:stats
    ./occ context_chat:stats --json
summary:
  # This is the summary, we just avoid to rename it so that branch protection rules still match
  name: integration-test
  runs-on: ubuntu-latest-low
  needs:
    - changes
    - integration
  # must run even when the integration job failed or was skipped
  if: always()
  permissions:
    contents: none
  steps:
    - name: Summary status
      # the expression is rendered to `true` or `false` before the shell runs it:
      # fail when the sources changed and the integration job did not succeed
      run: |
        if ${{ needs.changes.outputs.src != 'false' && needs.integration.result != 'success' }}; then
          exit 1
        fi