src/inference/plugins/tts/base.py
2,157 bytes · 63 lines · capsule://quake0day/[email protected]
raw on github
from abc import abstractmethod
from typing import AsyncIterator
import numpy as np
from inference.core.types import AudioChunk, TTSRequestConfig
from inference.plugins.base import CyberVersePlugin
class TTSPlugin(CyberVersePlugin):
    """Interface for text-to-speech plugins that stream audio from streamed text.

    Concrete plugins subclass this and provide ``synthesize_stream``.
    """

    @abstractmethod
    async def synthesize_stream(
        self,
        text_stream: AsyncIterator[str],
        request_config: TTSRequestConfig | None = None,
    ) -> AsyncIterator[AudioChunk]:
        """Turn an asynchronous stream of text into a stream of audio chunks.

        Args:
            text_stream: Asynchronous iterator yielding pieces of text to speak.
            request_config: Optional per-request TTS settings; ``None`` when the
                caller supplies none.

        Returns:
            An asynchronous iterator of ``AudioChunk`` objects, per the
            annotation. This base declaration has no implementation.
        """
        ...
class AudioRechunker:
    """Rechunk variable-length TTS audio into fixed-size chunks aligned with Avatar model.

    FlashHead: frame_num=33, motion_frames=5, effective=28
    chunk_duration = 28/25 = 1.12s = 17920 samples @ 16kHz

    Incoming samples accumulate in ``self.buffer`` (float32). ``feed`` emits
    every complete chunk that has become available, ``flush`` emits whatever
    is left zero-padded to a full chunk, and ``reset`` drops buffered samples.
    """

    def __init__(self, chunk_samples: int = 17920, sample_rate: int = 16000):
        """Create a rechunker.

        Args:
            chunk_samples: Number of samples in every emitted chunk.
            sample_rate: Sample rate reported on emitted chunks and used to
                compute their duration.

        Raises:
            ValueError: If ``chunk_samples`` or ``sample_rate`` is not positive.
        """
        # A chunk size of 0 would make feed() loop forever (slicing by zero
        # never shrinks the buffer); a rate of 0 would divide by zero later.
        if chunk_samples <= 0:
            raise ValueError(f"chunk_samples must be positive, got {chunk_samples}")
        if sample_rate <= 0:
            raise ValueError(f"sample_rate must be positive, got {sample_rate}")
        self.chunk_samples = chunk_samples
        self.sample_rate = sample_rate
        self.buffer = np.array([], dtype=np.float32)

    def _chunk_duration_ms(self) -> int:
        """Return the duration of one full chunk in whole milliseconds."""
        # Multiply before dividing so float rounding cannot truncate the
        # result one millisecond short.
        return int(self.chunk_samples * 1000 // self.sample_rate)

    def feed(self, audio: np.ndarray) -> list[AudioChunk]:
        """Append samples and return every complete chunk now available.

        Args:
            audio: Samples to append; converted to float32 before buffering.

        Returns:
            Zero or more non-final ``AudioChunk`` objects, each holding exactly
            ``chunk_samples`` float32 samples as raw bytes. Samples that do not
            fill a whole chunk stay buffered for the next call or ``flush``.
        """
        # Normalize on ingest so the buffer never gets promoted to float64.
        incoming = np.asarray(audio, dtype=np.float32)
        pending = np.concatenate([self.buffer, incoming]).astype(
            np.float32, copy=False
        )

        size = self.chunk_samples
        full_chunks = len(pending) // size
        duration_ms = self._chunk_duration_ms()

        chunks = [
            AudioChunk(
                data=pending[index * size : (index + 1) * size].tobytes(),
                sample_rate=self.sample_rate,
                duration_ms=duration_ms,
                is_final=False,
            )
            for index in range(full_chunks)
        ]

        # Copy the remainder so the large concatenated array can be released.
        self.buffer = pending[full_chunks * size :].copy()
        return chunks

    def flush(self) -> AudioChunk | None:
        """Emit any buffered remainder as a final, zero-padded chunk.

        Returns:
            A final ``AudioChunk`` of ``chunk_samples`` samples (buffered audio
            followed by zeros), or ``None`` when nothing is buffered. The
            buffer is emptied when a chunk is returned.
        """
        remaining = len(self.buffer)
        if remaining == 0:
            return None

        padded = np.zeros(self.chunk_samples, dtype=np.float32)
        padded[:remaining] = self.buffer
        self.buffer = np.array([], dtype=np.float32)
        # The padded chunk is full-length, so it reports the full chunk duration.
        return AudioChunk(
            data=padded.tobytes(),
            sample_rate=self.sample_rate,
            duration_ms=self._chunk_duration_ms(),
            is_final=True,
        )

    def reset(self) -> None:
        """Discard all buffered samples."""
        self.buffer = np.array([], dtype=np.float32)