Skip to content

Commit 353fc4b

Browse files
committed
Add streaming for Kokoro
1 parent ba97936 commit 353fc4b

File tree

1 file changed

+54
-7
lines changed

1 file changed

+54
-7
lines changed

src/lib/kokoro-tts.js

Lines changed: 54 additions & 7 deletions
Original file line numberDiff line numberDiff line change
@@ -2,10 +2,11 @@
22

33
import {chunkText, cleanTextForTTS} from '../utils/text-cleaner.js';
44

5-
// Text splitting stream to break text into chunks
5+
// Text splitting stream to break text into chunks (enhanced for streaming)
66
export class TextSplitterStream {
77
constructor() {
88
this.chunks = [];
9+
this.pendingText = '';
910
this.closed = false;
1011
}
1112

@@ -15,19 +16,58 @@ export class TextSplitterStream {
1516
return chunkText(cleanedText);
1617
}
1718

18-
push(text) {
19-
// Simple sentence splitting for now
20-
const sentences = this.chunkText(text) || [text];
21-
this.chunks.push(...sentences);
19+
push(...texts) {
20+
// Support both single text and multiple texts like the official implementation
21+
for (const text of texts) {
22+
this.pendingText += text;
23+
24+
// Check if we have complete sentences to process
25+
const sentences = this.pendingText.split(/(?<=[.!?])\s+/);
26+
27+
// Keep the last fragment in case it's incomplete
28+
if (sentences.length > 1) {
29+
this.pendingText = sentences.pop();
30+
31+
// Process complete sentences
32+
for (const sentence of sentences) {
33+
if (sentence.trim()) {
34+
const chunks = this.chunkText(sentence) || [sentence];
35+
this.chunks.push(...chunks);
36+
}
37+
}
38+
}
39+
}
40+
}
41+
42+
flush() {
43+
// Process any remaining text without waiting for sentence completion
44+
if (this.pendingText.trim()) {
45+
const chunks = this.chunkText(this.pendingText) || [this.pendingText];
46+
this.chunks.push(...chunks);
47+
this.pendingText = '';
48+
}
2249
}
2350

2451
close() {
52+
// Flush any remaining text and close the stream
53+
this.flush();
2554
this.closed = true;
2655
}
2756

2857
async *[Symbol.asyncIterator]() {
29-
for (const chunk of this.chunks) {
30-
yield chunk;
58+
let processedIndex = 0;
59+
60+
while (!this.closed || processedIndex < this.chunks.length) {
61+
// Yield any new chunks that have been added
62+
while (processedIndex < this.chunks.length) {
63+
yield this.chunks[processedIndex];
64+
processedIndex++;
65+
}
66+
67+
// If not closed, wait a bit for more chunks
68+
if (!this.closed) {
69+
await new Promise(resolve => setTimeout(resolve, 10));
70+
}
3171
}
3272
}
3373
}
@@ -287,6 +327,10 @@ export class KokoroTTS {
287327
if (this.session && this.voiceEmbeddings[voice]) {
288328
try {
289329
const language = voice.startsWith('a') ? 'a' : 'b'; // Determine language from voice ID
330+
331+
// Get phonemes for the text chunk
332+
const phonemes = await this.phonemize(text, language);
333+
290334
const tokenIds = await this.tokenizeText(text, language);
291335
const inputIds = new BigInt64Array(tokenIds.map(id => BigInt(id)));
292336

@@ -361,8 +405,10 @@ export class KokoroTTS {
361405
}
362406
}
363407

408+
// Yield with phonemes information like the official implementation
364409
yield {
365410
text,
411+
phonemes, // Include phonemes in the output
366412
audio: new RawAudio(finalAudioData, sampleRate)
367413
};
368414
} catch (modelError) {
@@ -375,6 +421,7 @@ export class KokoroTTS {
375421
// Yield silence in case of error
376422
yield {
377423
text,
424+
phonemes: text, // Fallback phonemes
378425
audio: new RawAudio(new Float32Array(24000), 24000)
379426
};
380427
}

0 commit comments

Comments
 (0)