capsule AI-native Unix-like composition layer

src/models/SoulX-LiveAct/kokoro/__main__.py

4,089 bytes · 149 lines · capsule://quake0day/[email protected] raw on github

"""Kokoro TTS CLI
Example usage:
python3 -m kokoro --text "The sky above the port was the color of television, tuned to a dead channel." -o file.wav --debug

echo "Bom dia mundo, como vão vocês" > text.txt
python3 -m kokoro -i text.txt -l p --voice pm_alex -o audio.wav

Common issues:
pip not installed: `uv pip install pip`
(Temporary workaround while https://github.com/explosion/spaCy/issues/13747 is not fixed)

espeak not installed: `apt-get install espeak-ng`
"""

import argparse
import wave
from pathlib import Path
from typing import Generator, TYPE_CHECKING

import numpy as np
from loguru import logger

# Single-letter Kokoro language codes mapped to the language each one selects.
# Dict insertion order is preserved, so `languages` keeps the listed ordering.
_LANGUAGE_NAMES = {
    "a": "American English",
    "b": "British English",
    "h": "Hindi",
    "e": "Spanish",
    "f": "French",
    "i": "Italian",
    "p": "Brazilian Portuguese",
    "j": "Japanese",
    "z": "Mandarin Chinese",
}

# Codes offered as the valid `choices` of the --language command-line option.
languages = list(_LANGUAGE_NAMES)

if TYPE_CHECKING:
    from kokoro import KPipeline


def generate_audio(
    text: str, kokoro_language: str, voice: str, speed=1
) -> Generator["KPipeline.Result", None, None]:
    """Lazily synthesize ``text`` and yield the pipeline's results in order.

    Args:
        text: Input text; the pipeline is asked to split it on runs of
            newlines (``split_pattern=r"\\n+"``).
        kokoro_language: Language code passed to ``KPipeline`` as ``lang_code``.
        voice: Voice name forwarded to the pipeline. A warning is logged when
            it does not start with ``kokoro_language``.
        speed: Speed value forwarded to the pipeline unchanged.

    Yields:
        Every result produced by calling the pipeline on ``text``.
    """
    # Deferred import: because this is a generator, kokoro is only loaded
    # once the caller first advances it.
    from kokoro import KPipeline

    voice_matches_language = voice.startswith(kokoro_language)
    if not voice_matches_language:
        logger.warning(f"Voice {voice} is not made for language {kokoro_language}")

    synthesizer = KPipeline(lang_code=kokoro_language)
    results = synthesizer(text, voice=voice, speed=speed, split_pattern=r"\n+")
    yield from results


def generate_and_save_audio(
    output_file: Path, text: str, kokoro_language: str, voice: str, speed=1
) -> None:
    """Synthesize ``text`` and write the audio to ``output_file`` as a WAV file.

    The file is written as mono, 16-bit PCM with a 24000 Hz frame rate. Audio
    is appended result by result as ``generate_audio`` yields it; results whose
    ``audio`` is ``None`` are skipped. Each result's phonemes are logged at
    DEBUG level.

    Args:
        output_file: Destination path; it is resolved before being opened.
        text: Text to synthesize.
        kokoro_language: Language code forwarded to ``generate_audio``.
        voice: Voice name forwarded to ``generate_audio``.
        speed: Speed value forwarded to ``generate_audio``.
    """
    with wave.open(str(output_file.resolve()), "wb") as wav_file:
        wav_file.setnchannels(1)  # Mono audio
        wav_file.setsampwidth(2)  # 2 bytes per sample (16-bit audio)
        # NOTE(review): hard-coded rate; assumed to match the pipeline's
        # output sample rate -- confirm against kokoro.
        wav_file.setframerate(24000)  # Sample rate

        for result in generate_audio(
            text, kokoro_language=kokoro_language, voice=voice, speed=speed
        ):
            logger.debug(result.phonemes)
            if result.audio is None:
                continue
            # The scaling below treats samples as floats in [-1.0, 1.0].
            # Clamp first: a sample even slightly outside that range would
            # overflow the int16 cast and wrap around, which is audible as a
            # loud click. In-range samples are unaffected.
            samples = np.clip(result.audio.numpy(), -1.0, 1.0)
            audio_bytes = (samples * 32767).astype(np.int16).tobytes()
            wav_file.writeframes(audio_bytes)


def main() -> None:
    """Command-line entry point: parse options, collect text, write a WAV file.

    The text to synthesize comes from ``--text``, else from ``--input-file``,
    else from standard input. When ``--language`` is omitted, the first
    character of the voice name is used as the language code.

    Raises:
        Exception: If both ``--text`` and ``--input-file`` are supplied.
    """
    import sys

    parser = argparse.ArgumentParser()
    parser.add_argument(
        "-m",
        "--voice",
        default="af_heart",
        help="Voice to use",
    )
    parser.add_argument(
        "-l",
        "--language",
        help="Language to use (defaults to the one corresponding to the voice)",
        choices=languages,
    )
    parser.add_argument(
        "-o",
        "--output-file",
        "--output_file",
        type=Path,
        help="Path to output WAV file",
        required=True,
    )
    parser.add_argument(
        "-i",
        "--input-file",
        "--input_file",
        type=Path,
        help="Path to input text file (default: stdin)",
    )
    parser.add_argument(
        "-t",
        "--text",
        help="Text to use instead of reading from stdin",
    )
    parser.add_argument(
        "-s",
        "--speed",
        type=float,
        default=1.0,
        help="Speech speed",
    )
    parser.add_argument(
        "--debug",
        action="store_true",
        help="Print DEBUG messages to console",
    )
    args = parser.parse_args()

    # loguru's `logger.level(name)` only looks up a level definition; it does
    # not change what any sink emits, so it could never make --debug take
    # effect. Replace the configured sinks with one stderr sink whose
    # threshold follows the flag.
    logger.remove()
    logger.add(sys.stderr, level="DEBUG" if args.debug else "INFO")
    logger.debug(args)

    # Without an explicit --language, take the first character of the voice
    # name as the language code (e.g. "af_heart" -> "a").
    lang = args.language or args.voice[0]

    if args.text is not None and args.input_file is not None:
        raise Exception("You cannot specify both 'text' and 'input_file'")
    if args.text is not None:
        # `is not None` (not truthiness) so an explicit empty --text is
        # honoured instead of silently falling through to stdin.
        text = args.text
    elif args.input_file is not None:
        text = args.input_file.read_text()
    else:
        print("Press Ctrl+D to stop reading input and start generating", flush=True)
        # Read stdin verbatim; iterating it and joining with "\n" would
        # double every newline because each line already ends with one.
        text = sys.stdin.read()

    logger.debug(f"Input text: {text!r}")

    out_file: Path = args.output_file
    if out_file.suffix != ".wav":
        logger.warning("The output file name should end with .wav")
    generate_and_save_audio(
        output_file=out_file,
        text=text,
        kokoro_language=lang,
        voice=args.voice,
        speed=args.speed,
    )


# Run the CLI only when this module is executed as the main program
# (e.g. `python3 -m kokoro`), not when it is imported.
if __name__ == "__main__":
    main()