Skip to content

xil-pipeline SDK Documentation

Xilu004 Sample Voices T2S

Xilu004 Sample Voices T2S

src.xil_pipeline.XILU004_sample_voices_T2S

Audition cast voices by generating a short sample MP3 per voice.

Reads a cast configuration file and calls the configured TTS backend to produce one sample MP3 per assigned voice. Each sample says:

"I am <full_name> not yo momma"

Outputs to voice_samples/<TAG>/<backend>/<actor>.mp3 so samples from different backends sit side-by-side for direct comparison.

Usage:

xil-sample --episode S02E03 --dry-run
xil-sample --episode S02E03
xil-sample --episode S02E03 --backend gtts
xil-sample --episode S02E03 --backend chatterbox
xil-sample --episode S02E03 --force

logger `module-attribute`

logger = get_logger(__name__)

client `module-attribute`

client = ElevenLabs(api_key=os.environ.get('ELEVENLABS_API_KEY'))

VOICE_SAMPLES_DIR `module-attribute`

VOICE_SAMPLES_DIR = str(get_workspace_root() / 'voice_samples')

HAS_GTTS `module-attribute`

HAS_GTTS = True

check_elevenlabs_quota

check_elevenlabs_quota() -> int | None

Display current ElevenLabs API character usage and return remaining quota.

Source code in src/xil_pipeline/XILU004_sample_voices_T2S.py

def check_elevenlabs_quota() -> int | None:
    """Log the ElevenLabs subscription usage and report what is left.

    Fetches the current user via the module-level ``client``, logs a small
    status banner (tier, used / limit, remaining) and hands back the
    remaining character count.

    Returns:
        The number of characters still available (limit minus used), or
        ``None`` when the lookup raises ``ApiError``.
    """
    rule = "=" * 40
    try:
        subscription = client.user.get().subscription
        chars_used = subscription.character_count
        chars_limit = subscription.character_limit
        chars_left = chars_limit - chars_used

        logger.info("\n" + rule)
        logger.info("ELEVENLABS API STATUS:")
        logger.info(f"  Tier:      {subscription.tier.upper()}")
        logger.info(f"  Usage:     {chars_used:,} / {chars_limit:,} characters")
        logger.info(f"  Remaining: {chars_left:,}")
        logger.info(rule + "\n")
        return chars_left
    except ApiError as err:
        # Best effort: a failed status lookup is reported, not fatal.
        logger.warning("API Error: Unable to fetch user subscription data.")
        logger.warning(f"    Details: {err}")
        return None

has_enough_characters

has_enough_characters(text_to_generate: str) -> bool

Return True if the ElevenLabs quota can cover text_to_generate.

Source code in src/xil_pipeline/XILU004_sample_voices_T2S.py

def has_enough_characters(text_to_generate: str) -> bool:
    """Tell whether the remaining ElevenLabs quota covers *text_to_generate*.

    The cost of a request is taken to be ``len(text_to_generate)``.

    Args:
        text_to_generate: The text about to be sent to the TTS endpoint.

    Returns:
        ``True`` when the remaining characters are at least the text length,
        and also when the quota lookup raises ``ApiError`` (the check is
        skipped rather than blocking generation). ``False`` when the quota
        is too small.
    """
    try:
        account = client.user.get()
        remaining = account.subscription.character_limit - account.subscription.character_count
        required = len(text_to_generate)

        # Guard clause: bail out first when the budget is short.
        if remaining < required:
            logger.info(f" [Guard] STOP: Line requires {required} chars, but only {remaining:,} remain.")
            return False

        logger.info(f" [Guard] Quota OK: {required} required, {remaining:,} left.")
        return True
    except ApiError:
        logger.info(" [Guard] Warning: Permission 'user_read' missing. Skipping quota check.")
        return True

get_best_model_for_budget

get_best_model_for_budget() -> str

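Return the TTS model to use: eleven_v3 when more than 5,000 characters of quota remain (or when the quota check fails), otherwise eleven_flash_v2_5.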

Source code in src/xil_pipeline/XILU004_sample_voices_T2S.py

def get_best_model_for_budget() -> str:
    """Pick the ElevenLabs model that fits the remaining character quota.

    Looks up the current subscription through the module-level ``client``
    and compares the remaining characters against ``SAFE_THRESHOLD``.

    Returns:
        ``"eleven_v3"`` when more than ``SAFE_THRESHOLD`` (5000) characters
        remain, or when the quota lookup raises ``ApiError``;
        ``"eleven_flash_v2_5"`` when the remaining balance is at or below
        the threshold.
    """
    SAFE_THRESHOLD = 5000
    try:
        subscription = client.user.get().subscription
        remaining = subscription.character_limit - subscription.character_count
        if remaining > SAFE_THRESHOLD:
            logger.info(f" [Budget] Healthy Balance: {remaining:,} left. Using 'eleven_v3'.")
            return "eleven_v3"
        # At or below the threshold: fall back to the flash model.
        logger.info(f" [Budget] LOW BALANCE: {remaining:,} left. Switching to 'eleven_flash_v2_5'.")
        return "eleven_flash_v2_5"
    except ApiError:
        # Without quota information the default model is used.
        logger.info(" [Budget] API Check Failed. Defaulting to 'eleven_v3'.")
        return "eleven_v3"

get_parser

get_parser() -> argparse.ArgumentParser

Return the argument parser for xil-sample.

Source code in src/xil_pipeline/XILU004_sample_voices_T2S.py

def get_parser() -> argparse.ArgumentParser:
    """Build the command-line interface for ``xil-sample``.

    Exactly one of ``--episode`` / ``--cast`` is required (they form a
    mutually exclusive group); every other option has a default.

    Returns:
        The configured :class:`argparse.ArgumentParser`.
    """
    # Long help texts are named up front so the wiring below stays readable.
    backend_help = (
        "TTS backend for sample generation. 'elevenlabs' (default) calls the ElevenLabs API "
        "and uses the voice_id from the cast config. 'gtts' generates a flat-voice draft via "
        "Google Translate TTS at no cost (ignores voice_id). 'chatterbox' uses local GPU TTS "
        "with zero-shot voice cloning from voice_refs/<key>.wav clips. 'chatterbox-turbo' uses "
        "the Chatterbox Turbo model in the same venv-chatterbox — it renders 19 native "
        "paralinguistic tags ([angry] [fear] [surprised] [happy] [crying] [sarcastic] "
        "[whispering] [dramatic] [narration] [advertisement] [laugh] [chuckle] [sigh] [gasp] "
        "[groan] [cough] [sniff] [shush] [clear throat]; exact spelling, no plurals), strips "
        "all other tags, needs reference clips >5s, and ignores --exaggeration. Put a tag in "
        "--sample-text to audition a cue. "
        "Output lands in voice_samples/<TAG>/<backend>/ for side-by-side comparison."
    )
    chatterbox_python_help = (
        "Path to the chatterbox venv Python (default: auto-detect ./venv-chatterbox/bin/python3). "
        "Used with --backend chatterbox or chatterbox-turbo."
    )
    voice_refs_help = (
        "Directory containing <speaker_key>.wav reference clips for Chatterbox "
        "zero-shot voice cloning (default: voice_refs/). "
        "Used with --backend chatterbox or chatterbox-turbo."
    )
    exaggeration_help = (
        "Chatterbox emotion exaggeration level: 0.0 = flat, 1.0 = dramatic (default: 0.5). "
        "Used only with --backend chatterbox (ignored by chatterbox-turbo)."
    )
    sample_text_help = (
        "Override the sample text spoken by each voice. Use {name} as a placeholder "
        "for the speaker's full name. "
        "Default: \"I am {name} not yo momma\""
    )

    cli = argparse.ArgumentParser(
        prog="xil-sample",
        description="Generate a voice sample MP3 for each cast member via the chosen TTS backend.",
    )

    # Where the cast comes from: an episode tag or an explicit file.
    cast_source = cli.add_mutually_exclusive_group(required=True)
    cast_source.add_argument(
        "--episode",
        metavar="TAG",
        help="Episode tag (e.g. S02E03); derives cast config path",
    )
    cast_source.add_argument(
        "--cast",
        metavar="PATH",
        help="Explicit path to cast JSON file",
    )

    cli.add_argument("--show", default=None, help="Show name override (default: from project.json)")

    # Backend selection and backend-specific knobs.
    cli.add_argument(
        "--backend",
        choices=["elevenlabs", "gtts", "chatterbox", "chatterbox-turbo"],
        default="elevenlabs",
        help=backend_help,
    )
    cli.add_argument("--chatterbox-python", default=None, metavar="PATH", help=chatterbox_python_help)
    cli.add_argument("--voice-refs", default="voice_refs", metavar="DIR", help=voice_refs_help)
    cli.add_argument("--exaggeration", type=float, default=0.5, metavar="FLOAT", help=exaggeration_help)

    # What is spoken, and run-mode switches.
    cli.add_argument("--sample-text", default=None, metavar="TEXT", help=sample_text_help)
    cli.add_argument(
        "--dry-run",
        action="store_true",
        help="Print what would be generated without calling any TTS API",
    )
    cli.add_argument(
        "--force",
        action="store_true",
        help="Regenerate samples even if files already exist on disk",
    )
    return cli

main

main() -> None

CLI entry point for cast voice sample generation.

Source code in src/xil_pipeline/XILU004_sample_voices_T2S.py

def main() -> None:
    """CLI entry point for cast voice sample generation.

    Parses the command line, loads the cast configuration (from ``--cast``
    or derived from ``--episode``), then walks every cast member and writes
    one sample MP3 to ``<VOICE_SAMPLES_DIR>/<tag>/<backend>/<key>.mp3`` using
    the selected backend. Existing files are skipped unless ``--force`` is
    given; with ``--dry-run`` nothing is generated and the planned outputs
    are only logged. Each generated file is tagged via ``tag_mp3``.

    Raises:
        SystemExit: If ``ELEVENLABS_API_KEY`` is unset for a non-dry-run
            ElevenLabs run, if the cast config file does not exist, or if
            the Chatterbox Python interpreter cannot be found.
    """
    configure_logging()
    with run_banner():
        args = get_parser().parse_args()

        backend = args.backend

        if not args.dry_run and backend == "elevenlabs" and not os.environ.get("ELEVENLABS_API_KEY"):
            sys.exit("Error: ELEVENLABS_API_KEY environment variable is not set.")

        # Resolve cast config path
        if args.cast:
            cast_path = args.cast
        else:
            slug = resolve_slug(args.show)
            p = derive_paths(slug, args.episode)
            cast_path = p["cast"]

        if not os.path.exists(cast_path):
            logger.warning(f"Cast config not found: {cast_path}")
            raise SystemExit(1)

        with open(cast_path, encoding="utf-8") as f:
            cast_data = json.load(f)
        cast_cfg = CastConfiguration(**cast_data)
        tag = cast_cfg.tag

        out_dir = os.path.join(VOICE_SAMPLES_DIR, tag, backend)

        logger.info(f"Cast config : {cast_path}")
        logger.info(f"Episode tag : {tag}")
        logger.info(f"Backend     : {backend}")
        logger.info(f"Output dir  : {out_dir}")
        logger.info(f"Cast members: {len(cast_cfg.cast)}")
        logger.info("")

        if not args.dry_run:
            if backend == "elevenlabs":
                check_elevenlabs_quota()
            os.makedirs(out_dir, exist_ok=True)

        # Resolve chatterbox python path
        chatterbox_client: _ChatterboxClient | None = None
        if backend in ("chatterbox", "chatterbox-turbo") and not args.dry_run:
            python_path = args.chatterbox_python or os.path.join("venv-chatterbox", "bin", "python3")
            if not os.path.exists(python_path):
                sys.exit(f"Error: Chatterbox Python not found at {python_path}. "
                         "Use --chatterbox-python to specify the path.")
            chatterbox_client = _ChatterboxClient(
                python_path=python_path,
                voice_refs_dir=args.voice_refs,
                exaggeration=args.exaggeration,
                turbo=(backend == "chatterbox-turbo"),
            )

        # The template is the same for every cast member, so resolve it once.
        template = args.sample_text or "I am {name} not yo momma"

        generated = 0
        skipped_tbd = 0
        skipped_exists = 0

        try:
            for key, member in cast_cfg.cast.items():
                # ElevenLabs requires a real voice_id; free backends ignore it
                if backend == "elevenlabs" and member.voice_id == "TBD":
                    logger.info(f"  [ SKIP] {key:12s}  voice_id=TBD")
                    skipped_tbd += 1
                    continue

                out_path = os.path.join(out_dir, f"{key}.mp3")
                text = template.format(name=member.full_name)

                if not args.force and os.path.exists(out_path):
                    logger.info(f"  [EXISTS] {key:12s}  {out_path}")
                    skipped_exists += 1
                    continue

                if args.dry_run:
                    ref_note = ""
                    if backend in ("chatterbox", "chatterbox-turbo"):
                        ref = os.path.join(args.voice_refs, f"{key}.wav")
                        ref_note = f"  ref={'✓' if os.path.exists(ref) else '✗ (default voice)'}"
                    logger.info(f"  [DRY RUN] {key:12s}  ({member.full_name})  →  {out_path}{ref_note}")
                    generated += 1
                    continue

                # Stop the whole run, not just this member, once quota runs out.
                if backend == "elevenlabs" and not has_enough_characters(text):
                    logger.info(f"  [ STOP] {key:12s}  insufficient quota")
                    break

                logger.info(f"  [   GEN] {key:12s}  {member.full_name}  …")

                if backend == "gtts":
                    _gtts_generate(text, out_path)
                    tts_comment = "gtts"

                elif backend in ("chatterbox", "chatterbox-turbo"):
                    assert chatterbox_client is not None
                    chatterbox_client.generate(text, out_path, speaker_key=key)
                    tts_comment = backend

                else:  # elevenlabs
                    current_model = get_best_model_for_budget()
                    audio_stream = client.text_to_speech.convert(
                        text=text,
                        voice_id=member.voice_id,
                        model_id=current_model,
                        output_format="mp3_44100_128",
                    )
                    # Stream into a sibling ".part" file and move it into
                    # place only after the whole stream was written. An error
                    # raised while iterating the stream would otherwise leave
                    # an empty or truncated file at out_path, which the
                    # [EXISTS] check above would skip on later runs.
                    tmp_path = f"{out_path}.part"
                    try:
                        with open(tmp_path, "wb") as f:
                            for chunk in audio_stream:
                                if chunk:
                                    f.write(chunk)
                        os.replace(tmp_path, out_path)
                    finally:
                        # No-op after a successful replace; removes the
                        # partial file when streaming failed.
                        if os.path.exists(tmp_path):
                            os.remove(tmp_path)
                    tts_comment = current_model

                tag_mp3(
                    out_path,
                    title=f"Sample: {member.full_name}",
                    artist=member.full_name,
                    lyrics=text,
                    comments=tts_comment,
                )
                logger.info(f"  saved → {out_path}")
                generated += 1

        finally:
            # Always release the Chatterbox client, even if generation failed.
            if chatterbox_client is not None:
                chatterbox_client.close()

        logger.info("")
        if args.dry_run:
            logger.info(f"Dry run: {generated} would be generated, {skipped_tbd} TBD skipped.")
        else:
            logger.info(f"Done: {generated} generated, {skipped_exists} already existed, {skipped_tbd} TBD skipped.")