@@ -791,6 +791,10 @@ function describe(err: unknown): string {
791791 * Reads from `refs/heads/<branch>` if it exists, then `refs/remotes/origin/
792792 * <branch>`, then the configured fallback. Returns an empty map if no parent
793793 * exists yet (first run).
794+ *
795+ * Implementation note: `git cat-file --batch` is used to stream blob contents
796+ * in a single subprocess rather than fork+exec per-file. Snapshots can have
797+ * 40k+ files; per-file `git show` calls take many minutes.
794798 */
795799async function collectExistingIds (
796800 repo : string ,
@@ -814,44 +818,144 @@ async function collectExistingIds(
814818 }
815819 if ( ref === null ) return ids ;
816820
821+ // `ls-tree -r` gives us mode + sha + filename for every file under the
822+ // commit's tree. We need both blob sha (for cat-file --batch lookup) and
823+ // path (so we know which sheet the record belongs to).
817824 let listing : string ;
818825 try {
819- const { stdout } = await git ( repo , 'ls-tree' , '-r' , '--name-only' , ref ) ;
826+ const { stdout } = await git ( repo , 'ls-tree' , '-r' , ref ) ;
820827 listing = stdout ;
821828 } catch {
822829 return ids ;
823830 }
824831
825- const paths = listing . split ( '\n' ) . filter ( ( p ) => {
826- if ( ! p . endsWith ( '.toml' ) ) return false ;
832+ interface Entry {
833+ readonly sha : string ;
834+ readonly path : string ;
835+ }
836+ const entries : Entry [ ] = [ ] ;
837+ for ( const line of listing . split ( '\n' ) ) {
838+ // Format: `<mode> <type> <sha>\t<path>`
839+ const tabIdx = line . indexOf ( '\t' ) ;
840+ if ( tabIdx === - 1 ) continue ;
841+ const meta = line . slice ( 0 , tabIdx ) . split ( / \s + / ) ;
842+ const path = line . slice ( tabIdx + 1 ) ;
843+ if ( meta . length < 3 ) continue ;
844+ if ( ! path . endsWith ( '.toml' ) ) continue ;
845+ let owned = false ;
827846 for ( const dir of IMPORTER_OWNED_DIRS ) {
828- if ( p . startsWith ( `${ dir } /` ) ) return true ;
847+ if ( path . startsWith ( `${ dir } /` ) ) {
848+ owned = true ;
849+ break ;
850+ }
829851 }
830- return false ;
831- } ) ;
852+ if ( ! owned ) continue ;
853+ entries . push ( { sha : meta [ 2 ] ! , path } ) ;
854+ }
832855
833- for ( const path of paths ) {
834- const content = await readFileFromRef ( repo , ref , path ) ;
856+ if ( entries . length === 0 ) return ids ;
857+
858+ // Spawn `git cat-file --batch` once; feed it newline-separated SHAs on stdin,
859+ // parse the streamed `<sha> blob <size>\n<content>\n` responses.
860+ const blobs = await batchCatFile ( repo , entries . map ( ( e ) => e . sha ) ) ;
861+ for ( let i = 0 ; i < entries . length ; i ++ ) {
862+ const content = blobs [ i ] ?? '' ;
835863 const id = extractTomlString ( content , 'id' ) ;
836864 if ( id ) {
837- const key = path . replace ( / \. t o m l $ / , '' ) ;
865+ const key = entries [ i ] ! . path . replace ( / \. t o m l $ / , '' ) ;
838866 ids . byFile . set ( key , id ) ;
839867 }
840868 }
841869 return ids ;
842870}
843871
844- async function readFileFromRef (
845- repo : string ,
846- ref : string ,
847- path : string ,
848- ) : Promise < string > {
849- try {
850- const { stdout } = await git ( repo , 'show' , `${ ref } :${ path } ` ) ;
851- return stdout ;
852- } catch {
853- return '' ;
854- }
/**
 * Read many blobs through a single `git cat-file --batch` subprocess.
 *
 * Protocol: one object name per line goes to git's stdin. For every name git
 * answers on stdout with either
 *   - `<sha> <type> <size>\n`, then exactly `<size>` bytes of content, then a
 *     single `\n`; or
 *   - a two-field line such as `<sha> missing\n` with no body.
 *
 * @param repo Directory used as the subprocess working directory.
 * @param shas Object names to look up. Duplicates are allowed.
 * @returns One string per input SHA, in input order. Objects git reports
 *   without a body (e.g. `missing`) are returned as `''`. Content is decoded
 *   as UTF-8.
 * @throws Error if git cannot be spawned, if a header carries an unparseable
 *   size, or if the process ends before every SHA has been answered.
 */
async function batchCatFile(repo: string, shas: readonly string[]): Promise<string[]> {
  if (shas.length === 0) return [];
  const { spawn } = await import('node:child_process');
  return await new Promise<string[]>((resolve, reject) => {
    const child = spawn('git', ['cat-file', '--batch'], {
      cwd: repo,
      stdio: ['pipe', 'pipe', 'pipe'],
    });

    const results: string[] = [];
    let stderrAcc = '';
    let buf: Buffer = Buffer.alloc(0);
    // `null` while waiting for a header line; otherwise the byte length of the
    // body announced by the most recent header.
    let pendingSize: number | null = null;
    let settled = false;

    const fail = (err: Error): void => {
      if (settled) return;
      settled = true;
      reject(err);
    };

    child.stderr.setEncoding('utf8');
    child.stderr.on('data', (chunk: string) => {
      stderrAcc += chunk;
    });

    child.stdout.on('data', (chunk: Buffer) => {
      if (settled) return;
      buf = buf.length === 0 ? chunk : Buffer.concat([buf, chunk]);
      for (;;) {
        if (pendingSize === null) {
          const nl = buf.indexOf(0x0a);
          if (nl === -1) return; // header not complete yet
          const header = buf.subarray(0, nl).toString('utf8');
          buf = buf.subarray(nl + 1);
          const parts = header.split(' ');
          if (parts.length === 3 && parts[1] !== 'missing') {
            const size = Number.parseInt(parts[2] ?? '', 10);
            if (!Number.isSafeInteger(size) || size < 0) {
              // Without a trustworthy size the stream cannot be re-synchronised.
              child.kill();
              fail(new Error(`git cat-file --batch sent an unparseable header: ${header}`));
              return;
            }
            pendingSize = size;
          } else {
            // `<sha> missing` (or any other body-less reply).
            results.push('');
          }
        } else {
          // Need the whole body plus the `\n` git appends after it.
          if (buf.length < pendingSize + 1) return;
          // Slice by byte count *before* decoding so multi-byte UTF-8
          // sequences are never split.
          results.push(buf.subarray(0, pendingSize).toString('utf8'));
          buf = buf.subarray(pendingSize + 1);
          pendingSize = null;
        }
      }
    });

    // 'close' fires after the process has exited and its stdio streams have
    // closed, so every stdout chunk has been parsed by now.
    child.on('close', (code) => {
      if (settled) return;
      if (results.length === shas.length) {
        settled = true;
        resolve(results);
        return;
      }
      fail(
        new Error(
          code !== 0
            ? `git cat-file --batch exited ${code}: ${stderrAcc}`
            : `git cat-file --batch returned ${results.length} of ${shas.length} objects`,
        ),
      );
    });
    child.on('error', fail);

    // If git exits before draining stdin the write fails with EPIPE. Without a
    // listener that stream error would be thrown as an uncaught exception; the
    // 'close' / 'error' handlers above already report the failure with the
    // exit code and stderr, so nothing more is needed here.
    child.stdin.on('error', () => {});

    // Send every SHA and close stdin right away. git reads to EOF, answers
    // each line, then exits; the stdout pipe applies backpressure, so closing
    // stdin early cannot truncate output.
    child.stdin.end(shas.join('\n') + '\n');
  });
}
856960
857961function extractTomlString ( content : string , key : string ) : string | null {
0 commit comments