ahmetoner · charnesp · Feb 23, 2025 · Mar 4, 2025 · Mar 4, 2025 · Mar 4, 2025
diff --git a/.gitignore b/.gitignore
@@ -42,3 +42,5 @@ pip-wheel-metadata
 poetry/core/*
 
 public
+.devcontainer
+.devcontainer/.env
diff --git a/Dockerfile b/Dockerfile
@@ -1,4 +1,44 @@
-FROM onerahmet/ffmpeg:n7.1 AS ffmpeg
+FROM debian:bookworm-slim AS ffmpeg
+
+ARG FFMPEG_VERSION=n7.1
+
+RUN export DEBIAN_FRONTEND=noninteractive \
+    && apt-get -qq update \
+    && apt-get -qq install --no-install-recommends \
+    build-essential \
+    git \
+    pkg-config \
+    yasm \
+    ca-certificates \
+    && rm -rf /var/lib/apt/lists/*
+
+RUN git clone https://github.com/FFmpeg/FFmpeg.git --depth 1 --branch $FFMPEG_VERSION --single-branch /FFmpeg
+
+WORKDIR /FFmpeg
+
+RUN PATH="$HOME/bin:$PATH" PKG_CONFIG_PATH="$HOME/ffmpeg_build/lib/pkgconfig" ./configure \
+      --prefix="$HOME/ffmpeg_build" \
+      --pkg-config-flags="--static" \
+      --extra-cflags="-I$HOME/ffmpeg_build/include" \
+      --extra-ldflags="-L$HOME/ffmpeg_build/lib" \
+      --extra-libs="-lpthread -lm" \
+      --ld="g++" \
+      --bindir="$HOME/bin" \
+      --disable-doc \
+      --disable-htmlpages \
+      --disable-podpages \
+      --disable-txtpages \
+      --disable-network \
+      --disable-autodetect \
+      --disable-hwaccels \
+      --enable-ffprobe \
+      --disable-ffplay \
+      --enable-filter=copy \
+      --enable-protocol=file \
+      --enable-small && \
+    PATH="$HOME/bin:$PATH" make -j$(nproc) && \
+    make install && \
+    hash -r
 
 FROM swaggerapi/swagger-ui:v5.9.1 AS swagger-ui
 
@@ -15,7 +55,8 @@ ENV PATH="${PATH}:${POETRY_VENV}/bin"
 WORKDIR /app
 
 COPY . /app
-COPY --from=ffmpeg /usr/local/bin/ffmpeg /usr/local/bin/ffmpeg
+COPY --from=ffmpeg /root/bin/ffmpeg /usr/local/bin/ffmpeg
+COPY --from=ffmpeg /root/bin/ffprobe /usr/local/bin/ffprobe
 COPY --from=swagger-ui /usr/share/nginx/html/swagger-ui.css swagger-ui-assets/swagger-ui.css
 COPY --from=swagger-ui /usr/share/nginx/html/swagger-ui-bundle.js swagger-ui-assets/swagger-ui-bundle.js
 

diff --git a/Dockerfile.gpu b/Dockerfile.gpu
@@ -1,4 +1,44 @@
-FROM onerahmet/ffmpeg:n7.1 AS ffmpeg
+FROM debian:bookworm-slim AS ffmpeg
+
+ARG FFMPEG_VERSION=n7.1
+
+RUN export DEBIAN_FRONTEND=noninteractive \
+    && apt-get -qq update \
+    && apt-get -qq install --no-install-recommends \
+    build-essential \
+    git \
+    pkg-config \
+    yasm \
+    ca-certificates \
+    && rm -rf /var/lib/apt/lists/*
+
+RUN git clone https://github.com/FFmpeg/FFmpeg.git --depth 1 --branch $FFMPEG_VERSION --single-branch /FFmpeg
+
+WORKDIR /FFmpeg
+
+RUN PATH="$HOME/bin:$PATH" PKG_CONFIG_PATH="$HOME/ffmpeg_build/lib/pkgconfig" ./configure \
+      --prefix="$HOME/ffmpeg_build" \
+      --pkg-config-flags="--static" \
+      --extra-cflags="-I$HOME/ffmpeg_build/include" \
+      --extra-ldflags="-L$HOME/ffmpeg_build/lib" \
+      --extra-libs="-lpthread -lm" \
+      --ld="g++" \
+      --bindir="$HOME/bin" \
+      --disable-doc \
+      --disable-htmlpages \
+      --disable-podpages \
+      --disable-txtpages \
+      --disable-network \
+      --disable-autodetect \
+      --disable-hwaccels \
+      --enable-ffprobe \
+      --disable-ffplay \
+      --enable-filter=copy \
+      --enable-protocol=file \
+      --enable-small && \
+    PATH="$HOME/bin:$PATH" make -j$(nproc) && \
+    make install && \
+    hash -r
 
 FROM swaggerapi/swagger-ui:v5.9.1 AS swagger-ui
 
@@ -37,6 +77,7 @@ RUN poetry install --no-root
 
 COPY . .
 COPY --from=ffmpeg /usr/local/bin/ffmpeg /usr/local/bin/ffmpeg
+COPY --from=ffmpeg /root/bin/ffprobe /usr/local/bin/ffprobe
 COPY --from=swagger-ui /usr/share/nginx/html/swagger-ui.css swagger-ui-assets/swagger-ui.css
 COPY --from=swagger-ui /usr/share/nginx/html/swagger-ui-bundle.js swagger-ui-assets/swagger-ui-bundle.js
 

diff --git a/app/utils.py b/app/utils.py
@@ -1,9 +1,11 @@
 import json
 import os
+import io
 from dataclasses import asdict
 from typing import BinaryIO, TextIO
 
 import ffmpeg
+from pydub import AudioSegment
 import numpy as np
 from faster_whisper.utils import format_timestamp
 
@@ -94,7 +96,7 @@ def write_result(self, result: dict, file: TextIO):
         json.dump(result, file)
 
 
-def load_audio(file: BinaryIO, encode=True, sr: int = CONFIG.SAMPLE_RATE):
+def load_audio(file: BinaryIO, encode=True, sr: int = CONFIG.SAMPLE_RATE, use_ffmpeg: bool = False):
     """
     Open an audio file object and read as mono waveform, resampling as necessary.
     Modified from https://github.com/openai/whisper/blob/main/whisper/audio.py to accept a file object
@@ -106,22 +108,53 @@ def load_audio(file: BinaryIO, encode=True, sr: int = CONFIG.SAMPLE_RATE):
         If true, encode audio stream to WAV before sending to whisper
     sr: int
         The sample rate to resample the audio if necessary
+    use_ffmpeg: bool
+        Only consulted when `encode` is true: if True, decode via the ffmpeg CLI; if False (the default), decode via pydub.
     Returns
     -------
     A NumPy array containing the audio waveform, in float32 dtype.
     """
+
     if encode:
-        try:
-            # This launches a subprocess to decode audio while down-mixing and resampling as necessary.
-            # Requires the ffmpeg CLI and `ffmpeg-python` package to be installed.
-            out, _ = (
-                ffmpeg.input("pipe:", threads=0)
-                .output("-", format="s16le", acodec="pcm_s16le", ac=1, ar=sr)
-                .run(cmd="ffmpeg", capture_stdout=True, capture_stderr=True, input=file.read())
-            )
-        except ffmpeg.Error as e:
-            raise RuntimeError(f"Failed to load audio: {e.stderr.decode()}") from e
+        if use_ffmpeg:
+            try:
+                # This launches a subprocess to decode audio while down-mixing and resampling as necessary.
+                # Requires the ffmpeg CLI and `ffmpeg-python` package to be installed.
+                out, _ = (
+                    ffmpeg.input("pipe:", threads=0)
+                    .output("-", format="s16le", acodec="pcm_s16le", ac=1, ar=sr)
+                    .run(cmd="ffmpeg", capture_stdout=True, capture_stderr=True, input=file.read())
+                )
+                samples = np.frombuffer(out, np.int16).flatten()
+
+            except ffmpeg.Error as e:
+                raise RuntimeError(f"Failed to load audio: {e.stderr.decode()}") from e
+
+        else:
+            try:
+                # Read audio file with pydub
+                audio = AudioSegment.from_file(io.BytesIO(file.read()))
+                # Resample to the requested rate only when the source rate differs
+                if audio.frame_rate != sr:
+                    audio = audio.set_frame_rate(sr)
+                # Down-mix to mono and force 16-bit samples so the /32768.0 scaling below is valid
+                audio = audio.set_channels(1).set_sample_width(2)
+                # Convert audio to numpy array
+                samples = np.array(audio.get_array_of_samples())
+
+            except Exception as e:
+                raise RuntimeError(f"Failed to load audio: {e}") from e
+
     else:
         out = file.read()
+        samples = np.frombuffer(out, np.int16).flatten()
+
+
+    # Convert samples to float32
+    samples = samples.astype(np.float32)
+    # Normalize the sample values
+    samples = samples / 32768.0
+
+    return samples
+
 
-    return np.frombuffer(out, np.int16).flatten().astype(np.float32) / 32768.0
diff --git a/pyproject.toml b/pyproject.toml
@@ -22,6 +22,7 @@ fastapi = "^0.115.8"
 uvicorn = { extras = ["standard"], version = "^0.34.0" }
 python-multipart = "^0.0.20"
 ffmpeg-python = "^0.2.0"
+pydub = "^0.25.1"
 numpy = "<2.0.0"
 openai-whisper = "^20240930"
 faster-whisper = "^1.1.0"