Video pipeline spec template

Reference template for the video pipeline. Every encode strategy and every generate option is documented inline. Copy this into ~/.config/mm/pipelines/video/{mode}.yaml (replacing {mode} with fast or accurate) and edit only the fields you want to override — omitted keys fall back to the built-in defaults.
Source: python/mm/pipelines/video/spec.yaml.template
# Video pipeline spec — all available strategies and their options.
#
# Copy this file to ~/.config/mm/pipelines/video/{mode}.yaml
# and customise the values you need.  Omitted strategy_opts fall back
# to their defaults.
#
# Many visual strategies have a "-w-transcript" variant that prepends
# an audio transcript before the visual output.  Transcript kwargs
# (language, audio_speed, audio_model, audio_backend) live in
# strategy_opts.

# Pipeline kind this spec configures.
kind: video
# Pipeline mode; allowed values are listed inline.
# NOTE(review): presumably should match the {mode} part of the file name
# this spec is saved under — confirm.
mode: fast        # fast | accurate

# ── encode ──────────────────────────────────────────────────────────

encode:
  # Encode settings: one `strategy` name plus its `strategy_opts`.
  # This template leaves the frames-w-transcript pair uncommented; every
  # other strategy is shown commented out with its options.
  # Keep exactly one strategy / strategy_opts pair uncommented: repeating
  # a key inside the same mapping is invalid YAML, and many parsers
  # silently keep only the last occurrence.

  # ── frames ──────────────────────────────────────────────────────
  # Uniform frame extraction at a given FPS.
  # strategy: frames
  # strategy_opts:
  #   fps: 1.0                        # frames per second to sample
  #   max_width: 1024                 # frame resize width (px)
  #   max_frames_per_message: 16      # frames per Message batch

  # ── frames-w-transcript ─────────────────────────────────────────
  # Same as frames but with audio transcript prepended.
  strategy: frames-w-transcript
  strategy_opts:
    fps: 1.0
    max_width: 1024
    max_frames_per_message: 16
    # Transcript options; uncomment a line to override it (omitted
    # strategy_opts fall back to their defaults).
    # language: auto                        # language code or "auto"
    # audio_speed: 2.0                      # playback speed multiplier
    # audio_model: nvidia/parakeet-tdt-0.6b-v3  # transcription model
    # audio_backend: openai                 # "openai" | "mlx" | "ctranslate2"

  # ── mosaic ──────────────────────────────────────────────────────
  # Tiles extracted frames into mosaic grids.  Uses scene detection
  # (PySceneDetect) when available, else uniform sampling.
  # strategy: mosaic
  # strategy_opts:
  #   tile_cols: 4                    # mosaic grid columns
  #   tile_rows: 4                    # mosaic grid rows
  #   thumb_width: 160                # per-frame thumbnail width (px)
  #   num_mosaics: 8                  # max number of mosaic images
  #   num_frames: 128                 # total frames to sample

  # ── mosaic-w-transcript ─────────────────────────────────────────
  # Same as mosaic but with audio transcript prepended.
  # strategy: mosaic-w-transcript
  # strategy_opts:
  #   tile_cols: 4
  #   tile_rows: 4
  #   thumb_width: 160
  #   num_mosaics: 8
  #   num_frames: 128
  #   language: auto
  #   audio_speed: 2.0

  # ── chunks ──────────────────────────────────────────────────────
  # Overlapping time-based chunks with extracted frames per chunk.
  # strategy: chunks
  # strategy_opts:
  #   chunk_duration: 60              # seconds per chunk
  #   overlap: 5                      # overlap between chunks (s)
  #   max_width: 1024                 # frame resize width (px)
  #   frames_per_chunk: 16            # frames extracted per chunk

  # ── clips ───────────────────────────────────────────────────────
  # Base64-encode video clips of uniform duration.  Useful for
  # models with native video input support.
  # strategy: clips
  # strategy_opts:
  #   duration: 120                   # clip length in seconds (120 = default chunk size)
  #   overlap: 10                     # overlap between clips in seconds
  #   max_size_mb: null               # skip chunks exceeding this size (MB)

  # ── clips-w-transcript ──────────────────────────────────────────
  # Same as clips but with audio transcript prepended.
  # strategy: clips-w-transcript
  # strategy_opts:
  #   duration: 120
  #   overlap: 10
  #   max_size_mb: null
  #   language: auto
  #   audio_speed: 2.0

  # ── keyframes ───────────────────────────────────────────────────
  # Extract I-frames (keyframes) only — much faster than uniform
  # sampling for capturing scene changes.
  # strategy: keyframes
  # strategy_opts:
  #   max_keyframes: null             # cap keyframes (null = all)
  #   max_width: 1024                 # frame resize width (px)
  #   max_keyframes_per_message: 16   # keyframes per Message batch

  # ── keyframes-w-transcript ──────────────────────────────────────
  # Same as keyframes but with audio transcript prepended.
  # strategy: keyframes-w-transcript
  # strategy_opts:
  #   max_keyframes: null
  #   max_width: 1024
  #   max_keyframes_per_message: 16
  #   language: auto
  #   audio_speed: 2.0

  # ── shots ───────────────────────────────────────────────────────
  # Scene-aware: detect shots via PySceneDetect, extract frames per shot.
  # strategy: shots
  # strategy_opts:
  #   threshold: 27.0                 # scene detection threshold (higher = fewer shots)
  #   max_frames_per_shot: 8          # max frames per shot
  #   max_width: 1024                 # frame resize width (px)

  # ── shots-w-transcript ──────────────────────────────────────────
  # Same as shots but with audio transcript prepended.
  # strategy: shots-w-transcript
  # strategy_opts:
  #   threshold: 27.0
  #   max_frames_per_shot: 8
  #   max_width: 1024
  #   language: auto
  #   audio_speed: 2.0

  # ── shot-mosaic ─────────────────────────────────────────────────
  # Scene-aware: detect shots, tile frames into one mosaic per shot.
  # strategy: shot-mosaic
  # strategy_opts:
  #   threshold: 27.0                 # scene detection threshold
  #   tile_cols: 4                    # mosaic grid columns
  #   tile_rows: 4                    # mosaic grid rows
  #   thumb_width: 160                # per-frame thumbnail width (px)

  # ── shot-mosaic-w-transcript ────────────────────────────────────
  # Same as shot-mosaic but with audio transcript prepended.
  # strategy: shot-mosaic-w-transcript
  # strategy_opts:
  #   threshold: 27.0
  #   tile_cols: 4
  #   tile_rows: 4
  #   thumb_width: 160
  #   language: auto
  #   audio_speed: 2.0

  # ── summary ─────────────────────────────────────────────────────
  # Adaptive N-frame visual summary with scene-detection spread.
  # Good for long videos where a compact overview is needed.
  # strategy: summary
  # strategy_opts:
  #   num_frames: 12                  # frames in the summary
  #   use_scene_detection: true       # try scene detection first
  #   max_width: 1024                 # frame resize width (px)

  # ── summary-w-transcript ────────────────────────────────────────
  # Same as summary but with audio transcript prepended.
  # strategy: summary-w-transcript
  # strategy_opts:
  #   num_frames: 12
  #   use_scene_detection: true
  #   max_width: 1024
  #   language: auto
  #   audio_speed: 2.0

  # ── transcript ──────────────────────────────────────────────────
  # Audio-only: transcribe the audio track, no visual frames.
  # strategy: transcript
  # strategy_opts:
  #   language: auto                        # language code or "auto"
  #   audio_speed: 2.0                      # playback speed multiplier

  # ── captions ────────────────────────────────────────────────────
  # Extract embedded subtitles (SRT/VTT/SSA).  Falls back to
  # Whisper transcription when no subtitle streams are found.
  # strategy: captions
  # strategy_opts:
  #   subtitle_stream: 0              # which subtitle stream index
  #   fallback_to_whisper: true       # use Whisper if no subtitles
  #   language: auto
  #   audio_speed: 2.0

  # ── gemini ──────────────────────────────────────────────────────
  # Pass the video directly as a Gemini inline_data Part.
  # No strategy_opts — the raw file is sent as-is.
  # strategy: gemini

  # ── gemini-chunked ──────────────────────────────────────────────
  # Duration-based chunking for Gemini inline_data.
  # strategy: gemini-chunked
  # strategy_opts:
  #   max_seconds: 120                # max chunk length (s)
  #   overlap: 10                     # overlap between chunks (s)

  # Custom post-processing transform (optional).
  # pyfunc: null

# ── generate ────────────────────────────────────────────────────────

generate:
  # Instruction text for the generate step.
  # A literal block scalar (|-) is used so every line break below is kept:
  # the "Use this format" skeleton needs one heading or list item per
  # line, and a folded scalar (>-) would join those lines with spaces
  # (e.g. "## Tags - tag1 - tag2 ...").  The trailing newline is stripped.
  # NOTE(review): the doubled braces look like escapes for a
  # str.format-style renderer — confirm before changing them.
  prompt: |-
    Analyze this video.

    You have been provided with the audio transcript (with timestamps)
    followed by sampled video frames. Use both modalities to produce a
    comprehensive analysis.

    Provide a detailed visual and audio analysis (~200 words), up to 10
    keyword tags, and describe each major scene or segment.

    Use this format:
    ## Summary
    {{detailed analysis}}

    ## Tags
    - tag1
    - tag2
    ...

    ## Scenes
    - Scene 1: {{description}}
    - Scene 2: {{description}}
    ...
  max_tokens: 1536
  # Optional settings; uncomment a line to override it.
  # model: null              # pin a specific LLM model (overrides profile)
  # temperature: null        # sampling temperature (null = model default)
  # json_mode: false         # request JSON-formatted response
  # think: false             # enable extended thinking
  # reasoning_effort: none   # none | low | medium | high
  # extra_body: {}           # provider-specific pass-through kwargs