Skip to content

Sfx Backends

src.xil_pipeline.sfx_backends

Pluggable backends for SFX / music / ambience asset generation.

Historically, the pipeline generated every non-silence sound effect through the ElevenLabs Sound Effects API. This module introduces a thin :class:`SfxBackend` adapter so that the shared generation path in :mod:`xil_pipeline.sfx_common` no longer talks to the ElevenLabs client directly. Two backends are provided:

  • :class:ElevenLabsSfxBackend — wraps client.text_to_sound_effects.convert with stream-to-temp, atomic rename, and 429 / 5xx / network retry handling.
  • :class:AudioLDM2SfxBackend — drives a local AudioLDM 2 Large diffusion model via a persistent worker subprocess (:mod:xil_pipeline.audioldm2_worker) in a dedicated venv-audioldm2 virtualenv. Free, GPU-accelerated, no API credits.

Both expose the same minimal contract::

backend.generate_to(out_path, prompt, duration_seconds, prompt_influence)
backend.close()

Use :func:make_sfx_backend to construct the right backend from a CLI flag.

logger module-attribute

# Module-level logger used by the backends in this module for progress and retry messages.
logger = get_logger(__name__)

SfxBackend

Bases: Protocol

Minimal contract for a sound-effect generation backend.

Source code in src/xil_pipeline/sfx_backends.py
@runtime_checkable
class SfxBackend(Protocol):
    """Minimal contract for a sound-effect generation backend.

    This is a structural interface: any object that provides a ``name``
    attribute plus ``generate_to`` and ``close`` methods satisfies it, with
    no inheritance required. Because the protocol is ``@runtime_checkable``,
    ``isinstance(obj, SfxBackend)`` is permitted; such a check only verifies
    that the members exist, not that their signatures match.
    """

    # Short identifier for the backend (the concrete backends in this module
    # use "elevenlabs" and "audioldm2").
    name: str

    def generate_to(
        self,
        out_path: str,
        prompt: str,
        duration_seconds: float,
        prompt_influence: float,
    ) -> None:
        """Generate audio for *prompt* and write it to *out_path*.

        Args:
            out_path: Destination file path for the generated audio.
            prompt: Text description of the sound to generate.
            duration_seconds: Requested clip length, in seconds.
            prompt_influence: Prompt-adherence setting. Backends are free to
                ignore it (the AudioLDM 2 backend in this module does).
        """
        ...

    def close(self) -> None:
        """Release any resources (subprocess, sockets). No-op for stateless backends."""
        ...

name instance-attribute

name: str

generate_to

generate_to(out_path: str, prompt: str, duration_seconds: float, prompt_influence: float) -> None

Generate audio for prompt and write it to out_path.

Source code in src/xil_pipeline/sfx_backends.py
def generate_to(
    self,
    out_path: str,
    prompt: str,
    duration_seconds: float,
    prompt_influence: float,
) -> None:
    """Generate audio for *prompt* and write it to *out_path*.

    Args:
        out_path: Destination file path for the generated audio.
        prompt: Text description of the sound to generate.
        duration_seconds: Requested clip length, in seconds.
        prompt_influence: Prompt-adherence setting; implementations may
            accept it purely for interface compatibility.
    """
    # Protocol stub: concrete backends supply the implementation.
    ...

close

close() -> None

Release any resources (subprocess, sockets). No-op for stateless backends.

Source code in src/xil_pipeline/sfx_backends.py
def close(self) -> None:
    """Release any resources (subprocess, sockets). No-op for stateless backends."""
    # Protocol stub: concrete backends supply the implementation.
    ...

ElevenLabsSfxBackend

SFX backend backed by the ElevenLabs Sound Effects API.

Wraps client.text_to_sound_effects.convert with the streaming download, atomic temp-file rename, and retry behaviour that previously lived inline in :func:xil_pipeline.sfx_common.ensure_shared_sfx. Rate-limit (429), server (5xx), and network transport errors are retried with linear backoff (10s, 20s, …); at most five attempts are made in total, i.e. up to four retries.

Source code in src/xil_pipeline/sfx_backends.py
class ElevenLabsSfxBackend:
    """Sound-effect backend that calls the ElevenLabs Sound Effects API.

    Audio returned by ``client.text_to_sound_effects.convert`` is streamed
    into a temporary file created next to the destination and then moved
    into place with ``os.replace``, so an interrupted download never leaves
    a partial file at the destination path.

    ``ApiError`` responses with status 429 or >= 500, and any
    ``httpx.TransportError``, are retried. At most five attempts are made in
    total, sleeping 10s, 20s, 30s and 40s between them; any other error, or
    a failure on the final attempt, propagates to the caller.
    """

    name = "elevenlabs"

    def __init__(self, client) -> None:
        """Remember *client*; it may be ``None`` until audio is requested."""
        self._client = client

    @staticmethod
    def _retry_reason(exc):
        """Return a short log label when *exc* is retryable, else ``None``."""
        # Only ApiError carries an HTTP status; transport errors have none.
        status = exc.status_code if isinstance(exc, ApiError) else None
        if status == 429:
            return "429 rate limited"
        if status is not None and status >= 500:
            return f"{status} server error"
        if isinstance(exc, httpx.TransportError):
            return f"network error ({type(exc).__name__})"
        return None

    def generate_to(
        self,
        out_path: str,
        prompt: str,
        duration_seconds: float,
        prompt_influence: float,
    ) -> None:
        """Generate a sound effect for *prompt* and store it at *out_path*.

        Args:
            out_path: Destination file; replaced atomically on success.
            prompt: Text passed to the API as ``text``.
            duration_seconds: Forwarded to the API unchanged.
            prompt_influence: Forwarded to the API unchanged.

        Raises:
            ValueError: If the backend was constructed with ``client=None``.
            ApiError, httpx.TransportError: When the error is not retryable
                or the final attempt fails.
        """
        if self._client is None:
            raise ValueError(
                "ElevenLabs client is required to generate SFX "
                "(set ELEVENLABS_API_KEY or use --sfx-backend audioldm2)."
            )
        logger.info("   [api] text-to-sound-effects → %r (%.1fs)", prompt, duration_seconds)

        attempt_limit = 5
        backoff_step = 10
        staged = None  # path of the temp file currently awaiting rename, if any
        try:
            for attempt_no in range(1, attempt_limit + 1):
                try:
                    chunks = self._client.text_to_sound_effects.convert(
                        text=prompt,
                        duration_seconds=duration_seconds,
                        prompt_influence=prompt_influence,
                    )
                    # Stage in the destination directory so os.replace stays
                    # on one filesystem.
                    handle, staged = tempfile.mkstemp(
                        dir=os.path.dirname(out_path) or ".", suffix=".tmp"
                    )
                    with os.fdopen(handle, "wb") as sink:
                        for piece in chunks:
                            if not piece:
                                continue
                            sink.write(piece)
                    os.replace(staged, out_path)
                    staged = None
                    logger.info("   [api] saved %s", os.path.basename(out_path))
                    return
                except (ApiError, httpx.TransportError) as err:
                    # Drop the partial download before deciding what to do.
                    if staged is not None:
                        with contextlib.suppress(FileNotFoundError):
                            os.unlink(staged)
                        staged = None
                    label = self._retry_reason(err)
                    if label is None or attempt_no >= attempt_limit:
                        raise
                    pause = backoff_step * attempt_no  # linear backoff
                    logger.warning(
                        "[%s] — retrying in %ds (attempt %d/%d)",
                        label, pause, attempt_no, attempt_limit,
                    )
                    time.sleep(pause)
        finally:
            # Covers exceptions the handler above does not catch (e.g. OSError).
            if staged is not None:
                with contextlib.suppress(FileNotFoundError):
                    os.unlink(staged)

    def close(self) -> None:
        """Do nothing: the ElevenLabs client belongs to the caller."""
        return

name class-attribute instance-attribute

name = 'elevenlabs'

__init__

__init__(client) -> None
Source code in src/xil_pipeline/sfx_backends.py
def __init__(self, client) -> None:
    """Store the ElevenLabs client used for generation.

    Args:
        client: Object whose ``text_to_sound_effects.convert`` is called by
            ``generate_to``. ``None`` is accepted here; ``generate_to`` then
            raises ``ValueError`` when invoked.
    """
    self._client = client

generate_to

generate_to(out_path: str, prompt: str, duration_seconds: float, prompt_influence: float) -> None
Source code in src/xil_pipeline/sfx_backends.py
def generate_to(
    self,
    out_path: str,
    prompt: str,
    duration_seconds: float,
    prompt_influence: float,
) -> None:
    """Generate a sound effect for *prompt* and store it at *out_path*.

    The API response is streamed into a temporary file in the destination
    directory and moved into place with ``os.replace`` once complete.
    ``ApiError`` with status 429 or >= 500 and ``httpx.TransportError`` are
    retried; at most five attempts are made, sleeping 10s, 20s, 30s, 40s
    between them.

    Raises:
        ValueError: If no ElevenLabs client was supplied.
        ApiError, httpx.TransportError: When the error is not retryable or
            the final attempt fails.
    """
    if self._client is None:
        raise ValueError(
            "ElevenLabs client is required to generate SFX "
            "(set ELEVENLABS_API_KEY or use --sfx-backend audioldm2)."
        )
    logger.info("   [api] text-to-sound-effects → %r (%.1fs)", prompt, duration_seconds)

    attempt_limit = 5
    backoff_step = 10
    staged = None  # temp file awaiting rename, if any
    try:
        for attempt_no in range(1, attempt_limit + 1):
            try:
                chunks = self._client.text_to_sound_effects.convert(
                    text=prompt,
                    duration_seconds=duration_seconds,
                    prompt_influence=prompt_influence,
                )
                handle, staged = tempfile.mkstemp(
                    dir=os.path.dirname(out_path) or ".", suffix=".tmp"
                )
                with os.fdopen(handle, "wb") as sink:
                    for piece in chunks:
                        if not piece:
                            continue
                        sink.write(piece)
                os.replace(staged, out_path)
                staged = None
                logger.info("   [api] saved %s", os.path.basename(out_path))
                return
            except (ApiError, httpx.TransportError) as err:
                # Remove the partial download before classifying the error.
                if staged is not None:
                    with contextlib.suppress(FileNotFoundError):
                        os.unlink(staged)
                    staged = None

                # Only ApiError carries an HTTP status code.
                status = err.status_code if isinstance(err, ApiError) else None
                if status == 429:
                    label = "429 rate limited"
                elif status is not None and status >= 500:
                    label = f"{status} server error"
                elif isinstance(err, httpx.TransportError):
                    label = f"network error ({type(err).__name__})"
                else:
                    label = None  # not retryable

                if label is None or attempt_no >= attempt_limit:
                    raise
                pause = backoff_step * attempt_no  # linear backoff
                logger.warning(
                    "[%s] — retrying in %ds (attempt %d/%d)",
                    label, pause, attempt_no, attempt_limit,
                )
                time.sleep(pause)
    finally:
        # Covers exceptions the handler above does not catch (e.g. OSError).
        if staged is not None:
            with contextlib.suppress(FileNotFoundError):
                os.unlink(staged)

close

close() -> None
Source code in src/xil_pipeline/sfx_backends.py
def close(self) -> None:
    """Release backend resources; a no-op for this backend."""
    # The ElevenLabs client is owned by the caller; nothing to release here.
    return

AudioLDM2SfxBackend

SFX backend backed by a local AudioLDM 2 Large diffusion model.

Adherence to the prompt is governed by guidance/steps (configured at construction), so the ElevenLabs-specific prompt_influence argument is accepted for interface compatibility but ignored.

Source code in src/xil_pipeline/sfx_backends.py
class AudioLDM2SfxBackend:
    """Sound-effect backend that delegates to a local AudioLDM 2 worker client.

    Prompt adherence for this backend is controlled by the ``guidance`` /
    ``steps`` settings chosen when the worker client is built, so the
    ElevenLabs-style ``prompt_influence`` argument is accepted only to keep
    the interface uniform and is otherwise unused.
    """

    name = "audioldm2"

    def __init__(self, client: _AudioLDM2Client) -> None:
        """Keep a reference to the worker *client* used for generation."""
        self._client = client

    def generate_to(
        self,
        out_path: str,
        prompt: str,
        duration_seconds: float,
        prompt_influence: float,  # noqa: ARG002 — AudioLDM 2 uses guidance_scale instead
    ) -> None:
        """Ask the worker client to render *prompt* into *out_path*.

        Args:
            out_path: Destination file path handed to the worker client.
            prompt: Text description of the sound to generate.
            duration_seconds: Requested clip length, in seconds.
            prompt_influence: Ignored by this backend.
        """
        logger.info("   [audioldm2] generating → %r (%.1fs)", prompt, duration_seconds)
        worker = self._client
        # Note the argument order expected by the client: prompt first, then path.
        worker.generate(prompt, out_path, duration_seconds)
        saved_name = os.path.basename(out_path)
        logger.info("   [audioldm2] saved %s", saved_name)

    def close(self) -> None:
        """Close the underlying worker client."""
        worker = self._client
        worker.close()

name class-attribute instance-attribute

name = 'audioldm2'

__init__

__init__(client: _AudioLDM2Client) -> None
Source code in src/xil_pipeline/sfx_backends.py
def __init__(self, client: _AudioLDM2Client) -> None:
    """Store the AudioLDM 2 worker client.

    Args:
        client: Object whose ``generate`` and ``close`` methods this backend
            delegates to.
    """
    self._client = client

generate_to

generate_to(out_path: str, prompt: str, duration_seconds: float, prompt_influence: float) -> None
Source code in src/xil_pipeline/sfx_backends.py
def generate_to(
    self,
    out_path: str,
    prompt: str,
    duration_seconds: float,
    prompt_influence: float,  # noqa: ARG002 — AudioLDM 2 uses guidance_scale instead
) -> None:
    """Generate audio for *prompt* via the worker client and write it to *out_path*.

    Args:
        out_path: Destination file path handed to the worker client.
        prompt: Text description of the sound to generate.
        duration_seconds: Requested clip length, in seconds.
        prompt_influence: Accepted for interface compatibility; unused here.
    """
    logger.info("   [audioldm2] generating → %r (%.1fs)", prompt, duration_seconds)
    # The client takes the prompt first and the output path second.
    self._client.generate(prompt, out_path, duration_seconds)
    logger.info("   [audioldm2] saved %s", os.path.basename(out_path))

close

close() -> None
Source code in src/xil_pipeline/sfx_backends.py
def close(self) -> None:
    """Release backend resources by closing the underlying worker client."""
    self._client.close()

make_sfx_backend

make_sfx_backend(name: str, client=None, *, audioldm2_python: str | None = None, device: str = 'cuda', guidance: float = 3.5, steps: int = 200, negative_prompt: str = 'low quality, noise') -> SfxBackend

Construct an :class:SfxBackend for the given backend name.

Parameters:

  • name (str) –

    "elevenlabs" or "audioldm2".

  • client

    ElevenLabs client (used only for "elevenlabs").

  • audioldm2_python (str | None, default: None ) –

    Explicit path to the venv-audioldm2 Python; auto-detected when None.

  • device (str, default: 'cuda' ) –

    "cuda" (default) or "cpu" for AudioLDM 2.

  • guidance (float, default: 3.5 ) –

    AudioLDM 2 guidance_scale.

  • steps (int, default: 200 ) –

    AudioLDM 2 num_inference_steps.

  • negative_prompt (str, default: 'low quality, noise' ) –

    AudioLDM 2 negative prompt.

Returns:

  • SfxBackend –

    A ready-to-use backend instance.

Source code in src/xil_pipeline/sfx_backends.py
def make_sfx_backend(
    name: str,
    client=None,
    *,
    audioldm2_python: str | None = None,
    device: str = "cuda",
    guidance: float = 3.5,
    steps: int = 200,
    negative_prompt: str = "low quality, noise",
) -> SfxBackend:
    """Build the :class:`SfxBackend` selected by *name*.

    Args:
        name: Backend selector, either ``"elevenlabs"`` or ``"audioldm2"``.
        client: ElevenLabs client; only consulted for ``"elevenlabs"``.
        audioldm2_python: Explicit path to the venv-audioldm2 Python
            interpreter; when ``None`` the path is auto-detected.
        device: Device for AudioLDM 2, ``"cuda"`` (default) or ``"cpu"``.
        guidance: AudioLDM 2 ``guidance_scale``.
        steps: AudioLDM 2 ``num_inference_steps``.
        negative_prompt: AudioLDM 2 negative prompt.

    Returns:
        A ready-to-use backend instance.

    Raises:
        ValueError: If *name* is not one of the supported backends.
    """
    if name == "elevenlabs":
        return ElevenLabsSfxBackend(client)
    if name != "audioldm2":
        raise ValueError(f"Unknown sfx backend: {name!r}")

    # Resolve the interpreter before the worker client is constructed.
    interpreter = _find_audioldm2_python(audioldm2_python)
    worker_options = {
        "python_path": interpreter,
        "device": device,
        "guidance": guidance,
        "steps": steps,
        "negative_prompt": negative_prompt,
    }
    return AudioLDM2SfxBackend(_AudioLDM2Client(**worker_options))