Skip to content

Commit 184695d

Browse files
perf(importer): batch existing-id reads via git cat-file --batch
The first cut of the existing-IDs pre-pass called `git show HEAD:<file>` once per importer-owned TOML file. For a typical snapshot (~44k files), that's 44k fork+exec roundtrips which took 7+ minutes to complete on the second run. Replace with a single `git cat-file --batch` subprocess that streams blob contents in one stdin/stdout exchange. Verified against the full 44k-file snapshot — pre-pass now finishes in seconds. Also add a test verifying the "single-record-change" criterion from the plan: importing the same dataset twice with one project's Title flipped produces a commit whose diff is exactly that file. Co-Authored-By: Claude Opus 4.7 (1M context) <noreply@anthropic.com>
1 parent 6e3af63 commit 184695d

2 files changed

Lines changed: 172 additions & 20 deletions

File tree

apps/api/scripts/import-laddr/importer.ts

Lines changed: 124 additions & 20 deletions
Original file line numberDiff line numberDiff line change
@@ -791,6 +791,10 @@ function describe(err: unknown): string {
791791
* Reads from `refs/heads/<branch>` if it exists, then `refs/remotes/origin/
792792
* <branch>`, then the configured fallback. Returns an empty map if no parent
793793
* exists yet (first run).
794+
*
795+
* Implementation note: `git cat-file --batch` is used to stream blob contents
796+
* in a single subprocess rather than fork+exec per-file. Snapshots can have
797+
* 40k+ files; per-file `git show` calls take many minutes.
794798
*/
795799
async function collectExistingIds(
796800
repo: string,
@@ -814,44 +818,144 @@ async function collectExistingIds(
814818
}
815819
if (ref === null) return ids;
816820

821+
// `ls-tree -r` gives us mode + sha + filename for every file under the
822+
// commit's tree. We need both blob sha (for cat-file --batch lookup) and
823+
// path (so we know which sheet the record belongs to).
817824
let listing: string;
818825
try {
819-
const { stdout } = await git(repo, 'ls-tree', '-r', '--name-only', ref);
826+
const { stdout } = await git(repo, 'ls-tree', '-r', ref);
820827
listing = stdout;
821828
} catch {
822829
return ids;
823830
}
824831

825-
const paths = listing.split('\n').filter((p) => {
826-
if (!p.endsWith('.toml')) return false;
832+
interface Entry {
833+
readonly sha: string;
834+
readonly path: string;
835+
}
836+
const entries: Entry[] = [];
837+
for (const line of listing.split('\n')) {
838+
// Format: `<mode> <type> <sha>\t<path>`
839+
const tabIdx = line.indexOf('\t');
840+
if (tabIdx === -1) continue;
841+
const meta = line.slice(0, tabIdx).split(/\s+/);
842+
const path = line.slice(tabIdx + 1);
843+
if (meta.length < 3) continue;
844+
if (!path.endsWith('.toml')) continue;
845+
let owned = false;
827846
for (const dir of IMPORTER_OWNED_DIRS) {
828-
if (p.startsWith(`${dir}/`)) return true;
847+
if (path.startsWith(`${dir}/`)) {
848+
owned = true;
849+
break;
850+
}
829851
}
830-
return false;
831-
});
852+
if (!owned) continue;
853+
entries.push({ sha: meta[2]!, path });
854+
}
832855

833-
for (const path of paths) {
834-
const content = await readFileFromRef(repo, ref, path);
856+
if (entries.length === 0) return ids;
857+
858+
// Spawn `git cat-file --batch` once; feed it newline-separated SHAs on stdin,
859+
// parse the streamed `<sha> blob <size>\n<content>\n` responses.
860+
const blobs = await batchCatFile(repo, entries.map((e) => e.sha));
861+
for (let i = 0; i < entries.length; i++) {
862+
const content = blobs[i] ?? '';
835863
const id = extractTomlString(content, 'id');
836864
if (id) {
837-
const key = path.replace(/\.toml$/, '');
865+
const key = entries[i]!.path.replace(/\.toml$/, '');
838866
ids.byFile.set(key, id);
839867
}
840868
}
841869
return ids;
842870
}
843871

844-
async function readFileFromRef(
845-
repo: string,
846-
ref: string,
847-
path: string,
848-
): Promise<string> {
849-
try {
850-
const { stdout } = await git(repo, 'show', `${ref}:${path}`);
851-
return stdout;
852-
} catch {
853-
return '';
854-
}
872+
/**
 * Read many blobs through a single `git cat-file --batch` subprocess.
 *
 * Protocol: one object name per line goes to stdin. For each name git answers
 * with a header line `<sha> <type> <size>\n`, then `<size>` bytes of content,
 * then one trailing `\n`. An unknown name yields `<sha> missing\n` with no
 * body; that (and any other header that is not three fields) is recorded as
 * an empty string.
 *
 * All names are written up front and stdin is closed immediately. git reads
 * stdin to EOF and still flushes every response before exiting, so closing
 * early cannot truncate output — and it guarantees the child terminates even
 * if it answers fewer records than expected, instead of hanging forever.
 *
 * @param repo Working directory for the git subprocess.
 * @param shas Object names to read, in the order results are wanted.
 * @returns Blob contents decoded as UTF-8, positionally aligned with `shas`.
 *   If git exits with status 0 before answering every name, the array is
 *   shorter than `shas`; callers should treat absent entries as empty.
 * @throws Error if git cannot be spawned, reports an unparseable object size,
 *   or exits non-zero before every name has been answered.
 */
async function batchCatFile(repo: string, shas: readonly string[]): Promise<string[]> {
  if (shas.length === 0) return [];
  const { spawn } = await import('node:child_process');
  return await new Promise<string[]>((resolve, reject) => {
    const child = spawn('git', ['cat-file', '--batch'], {
      cwd: repo,
      stdio: ['pipe', 'pipe', 'pipe'],
    });

    const results: string[] = [];
    let stderrAcc = '';
    let pending: Buffer = Buffer.alloc(0);
    // `null` while waiting for a header line; otherwise the byte length of
    // the body currently being received.
    let expected: number | null = null;
    // Guards against settling twice (e.g. spawn 'error' followed by 'close').
    let settled = false;

    const fail = (err: Error): void => {
      if (settled) return;
      settled = true;
      child.kill();
      reject(err);
    };

    child.stderr.setEncoding('utf8');
    child.stderr.on('data', (chunk: string) => {
      stderrAcc += chunk;
    });

    child.stdout.on('data', (chunk: Buffer) => {
      if (settled) return;
      pending = pending.length === 0 ? chunk : Buffer.concat([pending, chunk]);
      for (;;) {
        if (expected === null) {
          const nl = pending.indexOf(0x0a);
          if (nl === -1) return; // header line not complete yet
          const header = pending.subarray(0, nl).toString('utf8');
          pending = pending.subarray(nl + 1);
          // header is `<sha> <type> <size>` or `<sha> missing`
          const parts = header.split(' ');
          if (parts.length === 3 && parts[1] !== 'missing') {
            const size = Number.parseInt(parts[2]!, 10);
            if (!Number.isSafeInteger(size) || size < 0) {
              // A bad size would desynchronise every following record.
              fail(new Error(`git cat-file --batch: unparseable header "${header}"`));
              return;
            }
            expected = size;
          } else {
            // missing — no content body
            results.push('');
          }
        } else {
          // Body plus its trailing newline must be fully buffered. Slicing on
          // byte boundaries before decoding keeps multi-byte UTF-8 intact.
          if (pending.length < expected + 1) return;
          results.push(pending.subarray(0, expected).toString('utf8'));
          pending = pending.subarray(expected + 1); // skip trailing newline
          expected = null;
        }
      }
    });

    child.on('close', (code) => {
      if (settled) return;
      settled = true;
      if (code !== 0 && results.length !== shas.length) {
        reject(new Error(`git cat-file --batch exited ${code}: ${stderrAcc}`));
      } else {
        resolve(results);
      }
    });
    child.on('error', fail);

    // If git dies before draining stdin the write fails with EPIPE. Without a
    // listener that would surface as an unhandled stream error and crash the
    // process; the 'close' handler reports the real failure instead.
    child.stdin.on('error', () => {
      // intentionally ignored — see comment above
    });

    // One write, then EOF: git's batch mode reads one object name per line.
    child.stdin.end(shas.join('\n') + '\n');
  });
}
856960

857961
function extractTomlString(content: string, key: string): string | null {

apps/api/tests/import-laddr.test.ts

Lines changed: 48 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -659,4 +659,52 @@ describe('importLaddrFromJson — orchestrator', () => {
659659
await cleanup();
660660
}
661661
});
662+
663+
it('a modified single record produces a commit whose diff is that one record', async () => {
664+
const { path: repo, cleanup } = await makeRepo();
665+
try {
666+
// First run with baseline data
667+
const first = await importLaddrFromJson({
668+
sourceHost: 'example.test',
669+
dataRepo: repo,
670+
branch: 'legacy-import',
671+
initialParent: 'empty',
672+
now: '2026-05-18T00:00:00.000Z',
673+
delayMs: 0,
674+
pageSize: 200,
675+
fetchImpl: makeFetch(mockRoutes()),
676+
});
677+
expect(first.commitHash).not.toBeNull();
678+
679+
// Second run with a single tweak: the transit-app project's Title
680+
// changed. Everything else (including UUIDs, since they're carried
681+
// forward from the first commit's tree) stays identical.
682+
const tweaked = mockRoutes();
683+
// Walk the queue and overwrite the projects response with a Title change.
684+
const projectsKey = '/projects?format=json&include=Tags%2CMemberships&limit=200&offset=0';
685+
const projectsResp = tweaked.responses.get(projectsKey)![0] as { data: Array<{ Title: string }> };
686+
projectsResp.data[0]!.Title = 'Transit App — Renamed';
687+
688+
const second = await importLaddrFromJson({
689+
sourceHost: 'example.test',
690+
dataRepo: repo,
691+
branch: 'legacy-import',
692+
initialParent: 'empty',
693+
now: '2026-05-18T00:00:00.000Z',
694+
delayMs: 0,
695+
pageSize: 200,
696+
fetchImpl: makeFetch(tweaked),
697+
});
698+
expect(second.commitHash).not.toBeNull();
699+
expect(second.noChanges).toBe(false);
700+
701+
// The diff between the two commits should touch exactly one file:
702+
// projects/100.toml.
703+
const diff = await exec('git', ['diff', '--name-only', `${first.commitHash}..${second.commitHash}`], { cwd: repo });
704+
const changed = diff.stdout.split('\n').filter(Boolean);
705+
expect(changed).toEqual(['projects/100.toml']);
706+
} finally {
707+
await cleanup();
708+
}
709+
});
662710
});

0 commit comments

Comments
 (0)