Audio pipeline spec template
Reference template for the audio pipeline. Every encode strategy and every generate option is documented inline. Copy this into ~/.config/mm/pipelines/audio/{mode}.yaml (replacing {mode} with fast or accurate) and edit only the fields you want to override — omitted keys fall back to the built-in defaults.
Source: python/mm/pipelines/audio/spec.yaml.template
# Audio pipeline spec — all available strategies and their options.
#
# Copy this file to ~/.config/mm/pipelines/audio/{mode}.yaml
# and customise the values you need. Omitted strategy_opts fall back
# to their defaults. The generate block is optional — leave it out
# for encode-only (transcript-only) pipelines.
# Pipeline kind this spec applies to.
kind: audio
# Pipeline mode: fast | accurate
mode: fast
# ── encode ──────────────────────────────────────────────────────────
encode:
  # ── transcribe (default) ──────────────────────────────────
  # Transcribes audio via the modular backend system. By default
  # uses the VLM Run gateway (OpenAI-compatible). Override base_url
  # to point at localhost or OpenAI directly.
  strategy: transcribe
  # model: nvidia/parakeet-tdt-0.6b-v3  # encoder model (default for gateway)
  # backend: openai                     # "openai" | "mlx" | "ctranslate2"
  # Per-strategy options are nested under strategy_opts. Uncomment only the
  # ones you want to override; with every option commented out the key is
  # empty (null) and omitted options fall back to their defaults.
  strategy_opts:
    # language: auto     # language code or "auto"
    # audio_speed: 2.0   # playback speed multiplier
    # base_url: null     # custom OpenAI-compatible endpoint
    # api_key: null      # API key for that endpoint

  # ── base64 ────────────────────────────────────────────────
  # Sends raw audio as a base64 input_audio part (no transcription).
  # Best for models with native audio understanding.
  # strategy: base64
  # strategy_opts:
  #   format: mp3        # mp3 | wav | flac | ogg | m4a | aac | opus | webm
  #                      # default: inferred from file extension

  # ── gemini ────────────────────────────────────────────────
  # Passes audio as Gemini inline_data Part(s). Long files are
  # automatically chunked.
  # strategy: gemini
  # strategy_opts:
  #   max_seconds: 120   # max chunk length in seconds
  #   overlap: 10        # overlap between chunks in seconds

  # Custom post-processing transform (optional).
  # NOTE(review): placed at the encode level because it appears inside the
  # encode section — confirm against the spec schema before uncommenting.
  # pyfunc: null
# ── generate (optional — omit for encode-only) ──────────────────
generate:
  # Prompt template. {filename} and {transcript} are placeholders —
  # presumably substituted by the pipeline at run time (verify against the
  # consumer). Folded style (>-) joins the lines below into a single string
  # separated by spaces, with no trailing newline.
  prompt: >-
    Summarize this audio recording ({filename}) based on the transcript.
    Provide a concise summary (~80 words) covering the main topics discussed.
    Transcript:
    {transcript}
  max_tokens: 512
  # model: null              # pin a specific LLM model (overrides profile)
  # temperature: null        # sampling temperature (null = model default)
  # json_mode: false         # request JSON-formatted response
  # think: false             # enable extended thinking
  # reasoning_effort: none   # none | low | medium | high
  # extra_body: {}           # provider-specific pass-through kwargs