capsule AI-native Unix-like composition layer

src/inference/plugins/tts/base.py

2,157 bytes · 63 lines · capsule://quake0day/[email protected] raw on github

from abc import abstractmethod
from typing import AsyncIterator

import numpy as np

from inference.core.types import AudioChunk, TTSRequestConfig
from inference.plugins.base import CyberVersePlugin


class TTSPlugin(CyberVersePlugin):
    """Abstract interface for text-to-speech plugins.

    Concrete plugins subclass this and implement ``synthesize_stream``.
    """

    @abstractmethod
    async def synthesize_stream(
        self,
        text_stream: AsyncIterator[str],
        request_config: TTSRequestConfig | None = None,
    ) -> AsyncIterator[AudioChunk]:
        """Turn a stream of text into a stream of audio chunks.

        Args:
            text_stream: Asynchronous iterator of text pieces to synthesize.
            request_config: Optional per-request TTS settings; ``None`` when
                the caller supplies none.

        Returns:
            An asynchronous iterator of ``AudioChunk`` objects.

        NOTE(review): this stub is a plain ``async def`` with no ``yield``;
        implementations are presumably written as async generators so that the
        call result matches the annotated return type -- confirm against the
        concrete plugins.
        """
        ...


class AudioRechunker:
    """Rechunk variable-length TTS audio into fixed-size chunks aligned with Avatar model.

    Audio passed to ``feed`` is accumulated in ``buffer``. Each time at least
    ``chunk_samples`` samples are available, one fixed-size ``AudioChunk`` is
    emitted. ``flush`` emits whatever remains, zero-padded to a full chunk.

    Default sizing, as given for FlashHead:
        frame_num=33, motion_frames=5, effective=28
        chunk_duration = 28/25 = 1.12s = 17920 samples @ 16kHz
    """

    def __init__(self, chunk_samples: int = 17920, sample_rate: int = 16000):
        """Create an empty rechunker.

        Args:
            chunk_samples: Number of samples in every emitted chunk.
            sample_rate: Sample rate in Hz reported on every emitted chunk.

        Raises:
            ValueError: If ``chunk_samples`` or ``sample_rate`` is not positive.
        """
        # A non-positive chunk size would make the drain loop in ``feed`` spin
        # forever; a zero sample rate would only fail later, on the first chunk.
        if chunk_samples <= 0:
            raise ValueError(f"chunk_samples must be positive, got {chunk_samples}")
        if sample_rate <= 0:
            raise ValueError(f"sample_rate must be positive, got {sample_rate}")
        self.chunk_samples = chunk_samples
        self.sample_rate = sample_rate
        self.buffer = np.array([], dtype=np.float32)

    def _chunk_duration_ms(self) -> int:
        """Return the duration of one full chunk in whole milliseconds.

        Uses integer floor division instead of float math: e.g. 16080 samples
        at 16000 Hz is exactly 1005 ms, whereas ``int(16080 / 16000 * 1000)``
        evaluates to 1004 because of binary floating-point rounding.
        """
        return int(self.chunk_samples * 1000 // self.sample_rate)

    def _make_chunk(self, samples: np.ndarray, is_final: bool) -> "AudioChunk":
        """Wrap exactly one chunk worth of samples in an ``AudioChunk``."""
        return AudioChunk(
            data=samples.astype(np.float32, copy=False).tobytes(),
            sample_rate=self.sample_rate,
            duration_ms=self._chunk_duration_ms(),
            is_final=is_final,
        )

    def feed(self, audio: np.ndarray) -> "list[AudioChunk]":
        """Append audio to the buffer and return every complete chunk.

        Args:
            audio: One-dimensional array of samples (any numeric dtype); it is
                converted to float32 before buffering.

        Returns:
            Zero or more chunks of exactly ``chunk_samples`` samples, each with
            ``is_final=False``. Leftover samples stay buffered.
        """
        # Cast on entry so the buffer never silently upcasts to float64; the
        # emitted bytes are float32 either way.
        samples = np.asarray(audio, dtype=np.float32)
        self.buffer = np.concatenate([self.buffer, samples])
        chunks = []
        while len(self.buffer) >= self.chunk_samples:
            chunk_data = self.buffer[: self.chunk_samples]
            self.buffer = self.buffer[self.chunk_samples :]
            chunks.append(self._make_chunk(chunk_data, is_final=False))
        return chunks

    def flush(self) -> "AudioChunk | None":
        """Emit the buffered remainder as a final, zero-padded chunk.

        Returns:
            A chunk with ``is_final=True`` whose data is the remaining samples
            followed by zeros up to ``chunk_samples``, or ``None`` when the
            buffer is empty. The reported duration is that of a full chunk,
            padding included. The buffer is empty afterwards.
        """
        if len(self.buffer) == 0:
            return None
        padded = np.zeros(self.chunk_samples, dtype=np.float32)
        padded[: len(self.buffer)] = self.buffer
        self.reset()
        return self._make_chunk(padded, is_final=True)

    def reset(self) -> None:
        """Discard any buffered samples."""
        self.buffer = np.array([], dtype=np.float32)