Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
2 changes: 2 additions & 0 deletions .gitignore
Original file line number Diff line number Diff line change
Expand Up @@ -42,3 +42,5 @@ pip-wheel-metadata
poetry/core/*

public
.devcontainer
.devcontainer/.env
45 changes: 43 additions & 2 deletions Dockerfile
Original file line number Diff line number Diff line change
@@ -1,4 +1,44 @@
FROM onerahmet/ffmpeg:n7.1 AS ffmpeg
# Build stage "ffmpeg": compiles ffmpeg and ffprobe from source on Debian bookworm-slim.
FROM debian:bookworm-slim AS ffmpeg

# Git branch/tag of the FFmpeg repository to check out and build.
ARG FFMPEG_VERSION=n7.1

# Install the build toolchain and remove the apt package lists in the same layer.
RUN export DEBIAN_FRONTEND=noninteractive \
&& apt-get -qq update \
&& apt-get -qq install --no-install-recommends \
build-essential \
git \
pkg-config \
yasm \
ca-certificates \
&& rm -rf /var/lib/apt/lists/*

# Shallow, single-branch clone of only the requested FFmpeg ref into /FFmpeg.
RUN git clone https://github.com/FFmpeg/FFmpeg.git --depth 1 --branch $FFMPEG_VERSION --single-branch /FFmpeg

WORKDIR /FFmpeg

# Configure, build and install in one layer.
# - Binaries are installed to $HOME/bin (--bindir); libraries/headers go under $HOME/ffmpeg_build (--prefix).
# - Documentation, network support, auto-detected external libraries, hwaccels and ffplay are disabled.
# - ffprobe is explicitly enabled and --enable-small is set.
# Later stages are expected to copy the resulting binaries out of $HOME/bin.
RUN PATH="$HOME/bin:$PATH" PKG_CONFIG_PATH="$HOME/ffmpeg_build/lib/pkgconfig" ./configure \
--prefix="$HOME/ffmpeg_build" \
--pkg-config-flags="--static" \
--extra-cflags="-I$HOME/ffmpeg_build/include" \
--extra-ldflags="-L$HOME/ffmpeg_build/lib" \
--extra-libs="-lpthread -lm" \
--ld="g++" \
--bindir="$HOME/bin" \
--disable-doc \
--disable-htmlpages \
--disable-podpages \
--disable-txtpages \
--disable-network \
--disable-autodetect \
--disable-hwaccels \
--enable-ffprobe \
--disable-ffplay \
--enable-filter=copy \
--enable-protocol=file \
--enable-small && \
PATH="$HOME/bin:$PATH" make -j$(nproc) && \
make install && \
hash -r

FROM swaggerapi/swagger-ui:v5.9.1 AS swagger-ui

Expand All @@ -15,7 +55,8 @@ ENV PATH="${PATH}:${POETRY_VENV}/bin"
WORKDIR /app

COPY . /app
COPY --from=ffmpeg /usr/local/bin/ffmpeg /usr/local/bin/ffmpeg
COPY --from=ffmpeg /root/bin/ffmpeg /usr/local/bin/ffmpeg
COPY --from=ffmpeg /root/bin/ffprobe /usr/local/bin/ffprobe
COPY --from=swagger-ui /usr/share/nginx/html/swagger-ui.css swagger-ui-assets/swagger-ui.css
COPY --from=swagger-ui /usr/share/nginx/html/swagger-ui-bundle.js swagger-ui-assets/swagger-ui-bundle.js

Expand Down
43 changes: 42 additions & 1 deletion Dockerfile.gpu
Original file line number Diff line number Diff line change
@@ -1,4 +1,44 @@
FROM onerahmet/ffmpeg:n7.1 AS ffmpeg
# Build stage "ffmpeg": compiles ffmpeg and ffprobe from source on Debian bookworm-slim.
FROM debian:bookworm-slim AS ffmpeg

# Git branch/tag of the FFmpeg repository to check out and build.
ARG FFMPEG_VERSION=n7.1

# Install the build toolchain and remove the apt package lists in the same layer.
RUN export DEBIAN_FRONTEND=noninteractive \
&& apt-get -qq update \
&& apt-get -qq install --no-install-recommends \
build-essential \
git \
pkg-config \
yasm \
ca-certificates \
&& rm -rf /var/lib/apt/lists/*

# Shallow, single-branch clone of only the requested FFmpeg ref into /FFmpeg.
RUN git clone https://github.com/FFmpeg/FFmpeg.git --depth 1 --branch $FFMPEG_VERSION --single-branch /FFmpeg

WORKDIR /FFmpeg

# Configure, build and install in one layer.
# - Binaries are installed to $HOME/bin (--bindir); libraries/headers go under $HOME/ffmpeg_build (--prefix).
# - Documentation, network support, auto-detected external libraries, hwaccels and ffplay are disabled.
# - ffprobe is explicitly enabled and --enable-small is set.
# Later stages are expected to copy the resulting binaries out of $HOME/bin.
RUN PATH="$HOME/bin:$PATH" PKG_CONFIG_PATH="$HOME/ffmpeg_build/lib/pkgconfig" ./configure \
--prefix="$HOME/ffmpeg_build" \
--pkg-config-flags="--static" \
--extra-cflags="-I$HOME/ffmpeg_build/include" \
--extra-ldflags="-L$HOME/ffmpeg_build/lib" \
--extra-libs="-lpthread -lm" \
--ld="g++" \
--bindir="$HOME/bin" \
--disable-doc \
--disable-htmlpages \
--disable-podpages \
--disable-txtpages \
--disable-network \
--disable-autodetect \
--disable-hwaccels \
--enable-ffprobe \
--disable-ffplay \
--enable-filter=copy \
--enable-protocol=file \
--enable-small && \
PATH="$HOME/bin:$PATH" make -j$(nproc) && \
make install && \
hash -r

FROM swaggerapi/swagger-ui:v5.9.1 AS swagger-ui

Expand Down Expand Up @@ -37,6 +77,7 @@ RUN poetry install --no-root

COPY . .
# The ffmpeg build stage installs its binaries with --bindir="$HOME/bin", i.e. /root/bin.
# Nothing is placed in /usr/local/bin of that stage, so both tools must be copied from /root/bin.
COPY --from=ffmpeg /root/bin/ffmpeg /usr/local/bin/ffmpeg
COPY --from=ffmpeg /root/bin/ffprobe /usr/local/bin/ffprobe
# Static Swagger UI assets served by the application.
COPY --from=swagger-ui /usr/share/nginx/html/swagger-ui.css swagger-ui-assets/swagger-ui.css
COPY --from=swagger-ui /usr/share/nginx/html/swagger-ui-bundle.js swagger-ui-assets/swagger-ui-bundle.js

Expand Down
57 changes: 45 additions & 12 deletions app/utils.py
Original file line number Diff line number Diff line change
@@ -1,9 +1,11 @@
import json
import os
import io
from dataclasses import asdict
from typing import BinaryIO, TextIO

import ffmpeg
from pydub import AudioSegment
import numpy as np
from faster_whisper.utils import format_timestamp

Expand Down Expand Up @@ -94,7 +96,7 @@ def write_result(self, result: dict, file: TextIO):
json.dump(result, file)


def load_audio(file: BinaryIO, encode=True, sr: int = CONFIG.SAMPLE_RATE, use_ffmpeg: bool = False):
    """
    Open an audio file object and read as mono waveform, resampling as necessary.
    Modified from https://github.com/openai/whisper/blob/main/whisper/audio.py to accept a file object

    Parameters
    ----------
    file: BinaryIO
        The audio file like object. It is read to the end exactly once.
    encode: bool
        If true, decode the audio stream to mono signed 16-bit PCM at `sr` before
        sending it to whisper. If false, the bytes are interpreted as-is as raw
        signed 16-bit PCM samples (no decoding, down-mixing, or resampling).
    sr: int
        The sample rate to resample the audio if necessary
    use_ffmpeg: bool
        If True, use ffmpeg to load audio. If False, use pydub.
        Only consulted when `encode` is true.

    Returns
    -------
    A NumPy array containing the audio waveform, in float32 dtype, scaled by
    1/32768 from its 16-bit integer samples.

    Raises
    ------
    RuntimeError
        If the selected decoder (ffmpeg or pydub) fails to load the audio.
    """
    data = file.read()

    if not encode:
        # Caller promises raw 16-bit PCM; reinterpret the bytes directly.
        samples = np.frombuffer(data, np.int16).flatten()
    elif use_ffmpeg:
        try:
            # This launches a subprocess to decode audio while down-mixing and resampling as necessary.
            # Requires the ffmpeg CLI and `ffmpeg-python` package to be installed.
            out, _ = (
                ffmpeg.input("pipe:", threads=0)
                .output("-", format="s16le", acodec="pcm_s16le", ac=1, ar=sr)
                .run(cmd="ffmpeg", capture_stdout=True, capture_stderr=True, input=data)
            )
        except ffmpeg.Error as e:
            raise RuntimeError(f"Failed to load audio: {e.stderr.decode()}") from e
        samples = np.frombuffer(out, np.int16).flatten()
    else:
        try:
            audio = AudioSegment.from_file(io.BytesIO(data))
            # Resample to the target rate, then down-mix to mono (same order as before).
            if audio.frame_rate != sr:
                audio = audio.set_frame_rate(sr)
            audio = audio.set_channels(1)
            # Force 16-bit samples: the normalisation below divides by 32768, which is
            # only correct for int16 data. Sources decoded at another sample width
            # would otherwise end up far outside (or far inside) the [-1, 1) range.
            audio = audio.set_sample_width(2)
            samples = np.array(audio.get_array_of_samples(), dtype=np.int16)
        except Exception as e:
            # Keep the underlying reason and the exception chain so decoding failures
            # can be diagnosed, matching the ffmpeg branch.
            raise RuntimeError(f"Failed to load audio: {e}") from e

    # Convert to float32 and normalise the int16 range.
    return samples.astype(np.float32) / 32768.0
1 change: 1 addition & 0 deletions pyproject.toml
Original file line number Diff line number Diff line change
Expand Up @@ -22,6 +22,7 @@ fastapi = "^0.115.8"
uvicorn = { extras = ["standard"], version = "^0.34.0" }
python-multipart = "^0.0.20"
ffmpeg-python = "^0.2.0"
pydub = "^0.25.1"
numpy = "<2.0.0"
openai-whisper = "^20240930"
faster-whisper = "^1.1.0"
Expand Down