Video pipeline spec template
Reference template for the video pipeline. Every encode strategy and every generate option is documented inline. Copy this into ~/.config/mm/pipelines/video/{mode}.yaml (replacing {mode} with fast or accurate) and edit only the fields you want to override — omitted keys fall back to the built-in defaults.
Source: python/mm/pipelines/video/spec.yaml.template
# Video pipeline spec — all available strategies and their options.
#
# Copy this file to ~/.config/mm/pipelines/video/{mode}.yaml
# and customise the values you need. Omitted strategy_opts fall back
# to their defaults.
#
# Many visual strategies have a "-w-transcript" variant that prepends
# an audio transcript to the visual output. Transcript kwargs
# (language, audio_speed, audio_model, audio_backend) live in
# strategy_opts.
# Pipeline kind this spec applies to (this template covers the video pipeline).
kind: video
# Pipeline mode. The spec is installed as {mode}.yaml, so this value
# presumably has to match the file name — TODO confirm.
mode: fast # fast | accurate
# ── encode ──────────────────────────────────────────────────────────
encode:
  # Exactly one `strategy` (with its `strategy_opts`) may be active at a
  # time; the alternatives are kept commented out as ready-to-use
  # examples. Both keys are children of `encode`, and every option is a
  # child of `strategy_opts`.

  # ── frames ──────────────────────────────────────────────────────
  # Uniform frame extraction at a given FPS.
  # strategy: frames
  # strategy_opts:
  #   fps: 1.0                      # frames per second to sample
  #   max_width: 1024               # frame resize width (px)
  #   max_frames_per_message: 16    # frames per Message batch

  # ── frames-w-transcript ─────────────────────────────────────────
  # Same as frames but with audio transcript prepended.
  strategy: frames-w-transcript
  strategy_opts:
    fps: 1.0
    max_width: 1024
    max_frames_per_message: 16
    # language: auto                # language code or "auto"
    # audio_speed: 2.0              # playback speed multiplier
    # audio_model: nvidia/parakeet-tdt-0.6b-v3  # transcription model
    # audio_backend: openai         # "openai" | "mlx" | "ctranslate2"
# ── mosaic ──────────────────────────────────────────────────────
# Tiles extracted frames into mosaic grids. Uses scene detection
# (PySceneDetect) when available, else uniform sampling.
# strategy: mosaic
# strategy_opts:
# tile_cols: 4 # mosaic grid columns
# tile_rows: 4 # mosaic grid rows
# thumb_width: 160 # per-frame thumbnail width (px)
# num_mosaics: 8 # max number of mosaic images
# num_frames: 128 # total frames to sample
# ── mosaic-w-transcript ─────────────────────────────────────────
# Same as mosaic but with audio transcript prepended.
# strategy: mosaic-w-transcript
# strategy_opts:
# tile_cols: 4
# tile_rows: 4
# thumb_width: 160
# num_mosaics: 8
# num_frames: 128
# language: auto
# audio_speed: 2.0
# ── chunks ──────────────────────────────────────────────────────
# Overlapping time-based chunks with extracted frames per chunk.
# strategy: chunks
# strategy_opts:
# chunk_duration: 60 # seconds per chunk
# overlap: 5 # overlap between chunks (s)
# max_width: 1024 # frame resize width (px)
# frames_per_chunk: 16 # frames extracted per chunk
# ── clips ───────────────────────────────────────────────────────
# Base64-encode video clips of uniform duration. Useful for
# models with native video input support.
# strategy: clips
# strategy_opts:
# duration: 120 # clip length in seconds (120 = default chunk size)
# overlap: 10 # overlap between clips in seconds
# max_size_mb: null # skip chunks exceeding this size (MB)
# ── clips-w-transcript ──────────────────────────────────────────
# Same as clips but with audio transcript prepended.
# strategy: clips-w-transcript
# strategy_opts:
# duration: 120
# overlap: 10
# max_size_mb: null
# language: auto
# audio_speed: 2.0
# ── keyframes ───────────────────────────────────────────────────
# Extract I-frames (keyframes) only — much faster than uniform
# sampling for capturing scene changes.
# strategy: keyframes
# strategy_opts:
# max_keyframes: null # cap keyframes (null = all)
# max_width: 1024 # frame resize width (px)
# max_keyframes_per_message: 16 # keyframes per Message batch
# ── keyframes-w-transcript ──────────────────────────────────────
# Same as keyframes but with audio transcript prepended.
# strategy: keyframes-w-transcript
# strategy_opts:
# max_keyframes: null
# max_width: 1024
# max_keyframes_per_message: 16
# language: auto
# audio_speed: 2.0
# ── shots ───────────────────────────────────────────────────────
# Scene-aware: detect shots via PySceneDetect, extract frames per shot.
# strategy: shots
# strategy_opts:
# threshold: 27.0 # scene detection threshold (higher = fewer shots)
# max_frames_per_shot: 8 # max frames per shot
# max_width: 1024 # frame resize width (px)
# ── shots-w-transcript ──────────────────────────────────────────
# Same as shots but with audio transcript prepended.
# strategy: shots-w-transcript
# strategy_opts:
# threshold: 27.0
# max_frames_per_shot: 8
# max_width: 1024
# language: auto
# audio_speed: 2.0
# ── shot-mosaic ─────────────────────────────────────────────────
# Scene-aware: detect shots, tile frames into one mosaic per shot.
# strategy: shot-mosaic
# strategy_opts:
# threshold: 27.0 # scene detection threshold
# tile_cols: 4 # mosaic grid columns
# tile_rows: 4 # mosaic grid rows
# thumb_width: 160 # per-frame thumbnail width (px)
# ── shot-mosaic-w-transcript ────────────────────────────────────
# Same as shot-mosaic but with audio transcript prepended.
# strategy: shot-mosaic-w-transcript
# strategy_opts:
# threshold: 27.0
# tile_cols: 4
# tile_rows: 4
# thumb_width: 160
# language: auto
# audio_speed: 2.0
# ── summary ─────────────────────────────────────────────────────
# Adaptive N-frame visual summary with scene-detection spread.
# Good for long videos where a compact overview is needed.
# strategy: summary
# strategy_opts:
# num_frames: 12 # frames in the summary
# use_scene_detection: true # try scene detection first
# max_width: 1024 # frame resize width (px)
# ── summary-w-transcript ────────────────────────────────────────
# Same as summary but with audio transcript prepended.
# strategy: summary-w-transcript
# strategy_opts:
# num_frames: 12
# use_scene_detection: true
# max_width: 1024
# language: auto
# audio_speed: 2.0
# ── transcript ──────────────────────────────────────────────────
# Audio-only: transcribe the audio track, no visual frames.
# strategy: transcript
# strategy_opts:
# language: auto # language code or "auto"
# audio_speed: 2.0 # playback speed multiplier
# ── captions ────────────────────────────────────────────────────
# Extract embedded subtitles (SRT/VTT/SSA). Falls back to
# Whisper transcription when no subtitle streams are found.
# strategy: captions
# strategy_opts:
# subtitle_stream: 0 # which subtitle stream index
# fallback_to_whisper: true # use Whisper if no subtitles
# language: auto
# audio_speed: 2.0
# ── gemini ──────────────────────────────────────────────────────
# Pass the video directly as a Gemini inline_data Part.
# No strategy_opts — the raw file is sent as-is.
# strategy: gemini
# ── gemini-chunked ──────────────────────────────────────────────
# Duration-based chunking for Gemini inline_data.
# strategy: gemini-chunked
# strategy_opts:
# max_seconds: 120 # max chunk length (s)
# overlap: 10 # overlap between chunks (s)
# Custom post-processing transform (optional).
# pyfunc: null
# ── generate ────────────────────────────────────────────────────────
generate:
  # Prompt for the generate step. Its wording refers to an audio
  # transcript followed by sampled frames, which matches the
  # frames-w-transcript encode strategy; reword it if you switch to a
  # strategy that yields different inputs.
  #
  # Written as a literal block scalar (|-) so the Markdown skeleton
  # below reaches the model line by line; a folded scalar (>-) would
  # join the headings and list items into running text.
  #
  # NOTE(review): the doubled braces look like str.format-style escapes
  # for literal { } — confirm against the prompt renderer before
  # changing them.
  prompt: |-
    Analyze this video.
    You have been provided with the audio transcript (with timestamps)
    followed by sampled video frames. Use both modalities to produce a
    comprehensive analysis.
    Provide a detailed visual and audio analysis (~200 words), up to 10
    keyword tags, and describe each major scene or segment.
    Use this format:
    ## Summary
    {{detailed analysis}}
    ## Tags
    - tag1
    - tag2
    ...
    ## Scenes
    - Scene 1: {{description}}
    - Scene 2: {{description}}
    ...
  max_tokens: 1536
  # Optional overrides — uncomment (keeping this indentation) to set them.
  # model: null               # pin a specific LLM model (overrides profile)
  # temperature: null         # sampling temperature (null = model default)
  # json_mode: false          # request JSON-formatted response
  # think: false              # enable extended thinking
  # reasoning_effort: none    # none | low | medium | high
  # extra_body: {}            # provider-specific pass-through kwargs