From ddfbffe8f3a08a04edfe16e2c3446c11742d8e13 Mon Sep 17 00:00:00 2001 From: Georgi Sundberg <44953283+Chugarah@users.noreply.github.com> Date: Mon, 31 Mar 2025 06:33:00 +0200 Subject: [PATCH] =?UTF-8?q?=F0=9F=93=A6=20Multi-Format=20Export=20Support?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit - 🚀 Added simultaneous export for all transcript formats (`txt`, `srt`, `vtt`, `tsv`, `json`) - 🗂️ Packaged outputs into a single ZIP file for easy download - 🔄 Updated workflows & Docker setup to support the new feature - 📖 Updated README with relevant changes --- Dockerfile.gpu | 3 + README.md | 3 + app/asr_models/faster_whisper_engine.py | 42 ++++-- app/asr_models/mbain_whisperx_engine.py | 18 ++- app/asr_models/openai_whisper_engine.py | 18 ++- app/utils.py | 126 ++++++++++++++++-- app/webservice.py | 48 ++++++- docker-compose-cpu.yml | 17 +++ docker-compose.yml | 35 ++++- example.env | 25 ++++ whisper-asr-webservice-main/.dockerignore | 3 + .../.github/FUNDING.yml | 4 + .../.github/workflows/docker-publish.yml | 46 +++++++ .../.github/workflows/documentation.yml | 27 ++++ whisper-asr-webservice-main/.gitignore | 44 ++++++ 15 files changed, 427 insertions(+), 32 deletions(-) create mode 100644 docker-compose-cpu.yml create mode 100644 example.env create mode 100644 whisper-asr-webservice-main/.dockerignore create mode 100644 whisper-asr-webservice-main/.github/FUNDING.yml create mode 100644 whisper-asr-webservice-main/.github/workflows/docker-publish.yml create mode 100644 whisper-asr-webservice-main/.github/workflows/documentation.yml create mode 100644 whisper-asr-webservice-main/.gitignore diff --git a/Dockerfile.gpu b/Dockerfile.gpu index 903b7b3..e7deff6 100644 --- a/Dockerfile.gpu +++ b/Dockerfile.gpu @@ -15,6 +15,9 @@ RUN export DEBIAN_FRONTEND=noninteractive \ python${PYTHON_VERSION}-venv \ python3-pip \ libcudnn8 \ + libcudnn8-dev \ + # Make sure to install all required libcudnn components + libcudnn8-samples \ python3-pip \ && rm -rf /var/lib/apt/lists/* diff --git a/README.md b/README.md index e8cd150..7c40172 100644 --- a/README.md +++ b/README.md @@ -49,6 +49,7 @@ docker run -d -p 9000:9000 \ - Multiple ASR engines support (OpenAI Whisper, Faster Whisper, WhisperX) - Multiple output formats (text, JSON, VTT, SRT, TSV) +- Support for outputting all formats simultaneously with a single request - Word-level timestamps support - Voice activity detection (VAD) filtering - Speaker diarization (with WhisperX) @@ -90,3 +91,5 @@ After starting the service, visit `http://localhost:9000` or `http://0.0.0.0:900 ## Credits - This software uses libraries from the [FFmpeg](http://ffmpeg.org) project under the [LGPLv2.1](http://www.gnu.org/licenses/old-licenses/lgpl-2.1.html) + + diff --git a/app/asr_models/faster_whisper_engine.py b/app/asr_models/faster_whisper_engine.py index d7c3c44..ecba45b 100644 --- a/app/asr_models/faster_whisper_engine.py +++ b/app/asr_models/faster_whisper_engine.py @@ -1,4 +1,5 @@ import time +import os from io import StringIO from threading import Thread from typing import BinaryIO, Union @@ -8,7 +9,7 @@ from app.asr_models.asr_model import ASRModel from app.config import CONFIG -from app.utils import ResultWriter, WriteJSON, WriteSRT, WriteTSV, WriteTXT, WriteVTT +from app.utils import ResultWriter, WriteJSON, WriteSRT, WriteTSV, WriteTXT, WriteVTT, WriteAll class FasterWhisperASR(ASRModel): @@ -59,10 +60,23 @@ def transcribe( text = text + segment.text result = {"language": options_dict.get("language", info.language), 
"segments": segments, "text": text} + # Store the output directory and audio path for the "all" option + self.output_dir = os.environ.get("OUTPUT_DIR", "/tmp") + self.audio_path = os.environ.get("AUDIO_FILENAME", "audio") + + # For "all" output format, create and return the zip bytes + if output == "all": + writer = WriteAll(self.output_dir) + zip_bytes = writer.create_zip_bytes(result) + # Create a generator that yields the bytes + def bytes_generator(): + yield zip_bytes + return bytes_generator() + + # For other formats, write to StringIO and return that output_file = StringIO() self.write_result(result, output_file, output) output_file.seek(0) - return output_file def language_detection(self, audio): @@ -84,13 +98,25 @@ def language_detection(self, audio): return detected_lang_code, detected_language_confidence def write_result(self, result: dict, file: BinaryIO, output: Union[str, None]): + """ + Write the transcription result to the specified output format. + + For 'all' format, this function is not directly used as the transcribe method + handles it with create_zip_bytes. + For other formats, writes directly to the provided file object. + """ + # Initialize the appropriate writer class based on the output format if output == "srt": - WriteSRT(ResultWriter).write_result(result, file=file) + writer_class = WriteSRT elif output == "vtt": - WriteVTT(ResultWriter).write_result(result, file=file) + writer_class = WriteVTT elif output == "tsv": - WriteTSV(ResultWriter).write_result(result, file=file) + writer_class = WriteTSV elif output == "json": - WriteJSON(ResultWriter).write_result(result, file=file) - else: - WriteTXT(ResultWriter).write_result(result, file=file) + writer_class = WriteJSON + else: # Default to txt + writer_class = WriteTXT + + # Create a ResultWriter instance and write to the file + writer = writer_class(self.output_dir) + writer.write_result(result, file=file) diff --git a/app/asr_models/mbain_whisperx_engine.py b/app/asr_models/mbain_whisperx_engine.py index 87494a3..d5c1799 100644 --- a/app/asr_models/mbain_whisperx_engine.py +++ b/app/asr_models/mbain_whisperx_engine.py @@ -1,4 +1,5 @@ import time +import os from io import StringIO from threading import Thread from typing import BinaryIO, Union @@ -9,6 +10,7 @@ from app.asr_models.asr_model import ASRModel from app.config import CONFIG +from app.utils import WriteAll class WhisperXASR(ASRModel): @@ -85,10 +87,24 @@ def transcribe( result = whisperx.assign_word_speakers(diarize_segments, result) result["language"] = language + # Store the output directory and audio path for the "all" option + self.output_dir = os.environ.get("OUTPUT_DIR", "/tmp") + self.audio_path = os.environ.get("AUDIO_FILENAME", "audio") + + # For "all" output format, create and return the zip bytes + if output == "all": + # Import WriteAll from app.utils if needed + writer = WriteAll(self.output_dir) + zip_bytes = writer.create_zip_bytes(result) + # Create a generator that yields the bytes + def bytes_generator(): + yield zip_bytes + return bytes_generator() + + # For other formats, write to StringIO and return that output_file = StringIO() self.write_result(result, output_file, output) output_file.seek(0) - return output_file def language_detection(self, audio): diff --git a/app/asr_models/openai_whisper_engine.py b/app/asr_models/openai_whisper_engine.py index 655d682..efd13e0 100644 --- a/app/asr_models/openai_whisper_engine.py +++ b/app/asr_models/openai_whisper_engine.py @@ -1,4 +1,5 @@ import time +import os from io import StringIO from 
threading import Thread from typing import BinaryIO, Union @@ -9,6 +10,7 @@ from app.asr_models.asr_model import ASRModel from app.config import CONFIG +from app.utils import WriteAll class OpenAIWhisperASR(ASRModel): @@ -49,10 +51,24 @@ def transcribe( with self.model_lock: result = self.model.transcribe(audio, **options_dict) + # Store the output directory and audio path for the "all" option + self.output_dir = os.environ.get("OUTPUT_DIR", "/tmp") + self.audio_path = os.environ.get("AUDIO_FILENAME", "audio") + + # For "all" output format, create and return the zip bytes + if output == "all": + # Import WriteAll from app.utils if needed + writer = WriteAll(self.output_dir) + zip_bytes = writer.create_zip_bytes(result) + # Create a generator that yields the bytes + def bytes_generator(): + yield zip_bytes + return bytes_generator() + + # For other formats, write to StringIO and return that output_file = StringIO() self.write_result(result, output_file, output) output_file.seek(0) - return output_file def language_detection(self, audio): diff --git a/app/utils.py b/app/utils.py index ddc8a99..85f1f75 100644 --- a/app/utils.py +++ b/app/utils.py @@ -1,5 +1,7 @@ import json import os +import io +import zipfile from dataclasses import asdict from typing import BinaryIO, TextIO @@ -32,7 +34,9 @@ class WriteTXT(ResultWriter): def write_result(self, result: dict, file: TextIO): for segment in result["segments"]: - print(segment.text.strip(), file=file, flush=True) + # Handle both segment as dict and as object + text = segment["text"] if isinstance(segment, dict) else segment.text + print(text.strip(), file=file, flush=True) class WriteVTT(ResultWriter): @@ -41,9 +45,19 @@ class WriteVTT(ResultWriter): def write_result(self, result: dict, file: TextIO): print("WEBVTT\n", file=file) for segment in result["segments"]: + # Handle both segment as dict and as object + if isinstance(segment, dict): + start = segment["start"] + end = segment["end"] + text = segment["text"] + else: + start = segment.start + end = segment.end + text = segment.text + print( - f"{format_timestamp(segment.start)} --> {format_timestamp(segment.end)}\n" - f"{segment.text.strip().replace('-->', '->')}\n", + f"{format_timestamp(start)} --> {format_timestamp(end)}\n" + f"{text.strip().replace('-->', '->')}\n", file=file, flush=True, ) @@ -54,12 +68,22 @@ class WriteSRT(ResultWriter): def write_result(self, result: dict, file: TextIO): for i, segment in enumerate(result["segments"], start=1): + # Handle both segment as dict and as object + if isinstance(segment, dict): + start = segment["start"] + end = segment["end"] + text = segment["text"] + else: + start = segment.start + end = segment.end + text = segment.text + # write srt lines print( f"{i}\n" - f"{format_timestamp(segment.start, always_include_hours=True, decimal_marker=',')} --> " - f"{format_timestamp(segment.end, always_include_hours=True, decimal_marker=',')}\n" - f"{segment.text.strip().replace('-->', '->')}\n", + f"{format_timestamp(start, always_include_hours=True, decimal_marker=',')} --> " + f"{format_timestamp(end, always_include_hours=True, decimal_marker=',')}\n" + f"{text.strip().replace('-->', '->')}\n", file=file, flush=True, ) @@ -80,9 +104,19 @@ class WriteTSV(ResultWriter): def write_result(self, result: dict, file: TextIO): print("start", "end", "text", sep="\t", file=file) for segment in result["segments"]: - print(round(1000 * segment.start), file=file, end="\t") - print(round(1000 * segment.end), file=file, end="\t") - 
print(segment.text.strip().replace("\t", " "), file=file, flush=True) + # Handle both segment as dict and as object + if isinstance(segment, dict): + start = segment["start"] + end = segment["end"] + text = segment["text"] + else: + start = segment.start + end = segment.end + text = segment.text + + print(round(1000 * start), file=file, end="\t") + print(round(1000 * end), file=file, end="\t") + print(text.strip().replace("\t", " "), file=file, flush=True) class WriteJSON(ResultWriter): @@ -90,10 +124,82 @@ class WriteJSON(ResultWriter): def write_result(self, result: dict, file: TextIO): if "segments" in result: - result["segments"] = [asdict(segment) for segment in result["segments"]] + # Check if segments are already dictionaries or need to be converted + if result["segments"] and not isinstance(result["segments"][0], dict): + result["segments"] = [asdict(segment) for segment in result["segments"]] json.dump(result, file) +class WriteAll: + """ + Write a transcript to multiple files in all supported formats. + """ + + def __init__(self, output_dir: str): + self.output_dir = output_dir + self.writers = { + "txt": WriteTXT(output_dir), + "vtt": WriteVTT(output_dir), + "srt": WriteSRT(output_dir), + "tsv": WriteTSV(output_dir), + "json": WriteJSON(output_dir) + } + + def __call__(self, result: dict, audio_path: str): + for format_name, writer in self.writers.items(): + try: + writer(result, audio_path) + except Exception as e: + print(f"Error in {format_name} writer: {str(e)}") + # Continue with other formats even if one fails + + def create_zip_bytes(self, result: dict): + """ + Create a zip file in memory and return its bytes. + This creates a valid zip file with all transcript formats. + """ + # Create a new in-memory zip file + buffer = io.BytesIO() + + try: + # Open the zip file for writing + with zipfile.ZipFile(buffer, 'w', zipfile.ZIP_DEFLATED) as zip_file: + # Write each format to the zip file + formats = { + "txt": WriteTXT, + "vtt": WriteVTT, + "srt": WriteSRT, + "tsv": WriteTSV, + "json": WriteJSON + } + + for format_name, writer_class in formats.items(): + try: + # Create a buffer for this format's content + output = io.StringIO() + + # Write the result to the buffer + writer = writer_class(self.output_dir) + writer.write_result(result, output) + + # Get the text content and add it to the zip + content = output.getvalue().encode('utf-8') # Convert string to bytes + zip_file.writestr(f"transcript.{format_name}", content) + + except Exception as e: + print(f"Error adding {format_name} to zip: {str(e)}") + # Continue with other formats + + # Reset the buffer position and get the zip bytes + buffer.seek(0) + return buffer.read() + + except Exception as e: + print(f"Error creating zip file: {str(e)}") + # Return an empty buffer if zip creation fails + return b"" + + def load_audio(file: BinaryIO, encode=True, sr: int = CONFIG.SAMPLE_RATE): """ Open an audio file object and read as mono waveform, resampling as necessary. 
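
The new WriteAll.create_zip_bytes above builds every transcript format in memory: each writer renders into a StringIO, the text is encoded to UTF-8, and the entries are collected into a single ZIP held in a BytesIO buffer, so nothing needs to be written to the output directory. What follows is a minimal, self-contained sketch of that packaging step; the _txt and _tsv helpers are simplified stand-ins for the WriteTXT/WriteTSV writers above, and only the zipfile/BytesIO mechanics mirror the patch.

import io
import zipfile


def _txt(result: dict, out: io.StringIO) -> None:
    # Simplified stand-in for WriteTXT.write_result
    for seg in result["segments"]:
        print(seg["text"].strip(), file=out)


def _tsv(result: dict, out: io.StringIO) -> None:
    # Simplified stand-in for WriteTSV.write_result
    print("start", "end", "text", sep="\t", file=out)
    for seg in result["segments"]:
        print(round(1000 * seg["start"]), round(1000 * seg["end"]),
              seg["text"].strip().replace("\t", " "), sep="\t", file=out)


def zip_transcripts(result: dict) -> bytes:
    # Render each format into a StringIO, then pack the UTF-8 bytes into one
    # in-memory ZIP -- the same shape as WriteAll.create_zip_bytes.
    buffer = io.BytesIO()
    with zipfile.ZipFile(buffer, "w", zipfile.ZIP_DEFLATED) as zf:
        for ext, write in {"txt": _txt, "tsv": _tsv}.items():
            text_buf = io.StringIO()
            write(result, text_buf)
            zf.writestr(f"transcript.{ext}", text_buf.getvalue().encode("utf-8"))
    buffer.seek(0)
    return buffer.read()


if __name__ == "__main__":
    demo = {"segments": [{"start": 0.0, "end": 1.5, "text": "hello world"}]}
    print(len(zip_transcripts(demo)), "bytes of zip data")

On the engine side, the returned bytes are wrapped in a one-shot bytes_generator so the "all" path keeps the same return shape the other formats already use, while webservice.py can pull the archive out with a single next(result) call.
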
diff --git a/app/webservice.py b/app/webservice.py index 8f4fa6a..78321c2 100644 --- a/app/webservice.py +++ b/app/webservice.py @@ -8,9 +8,10 @@ import uvicorn from fastapi import FastAPI, File, Query, UploadFile, applications from fastapi.openapi.docs import get_swagger_ui_html -from fastapi.responses import RedirectResponse, StreamingResponse +from fastapi.responses import RedirectResponse, StreamingResponse, Response, FileResponse from fastapi.staticfiles import StaticFiles from whisper import tokenizer +import tempfile from app.config import CONFIG from app.factory.asr_model_factory import ASRModelFactory @@ -86,8 +87,14 @@ async def asr( description="Max speakers in this file", include_in_schema=(True if CONFIG.ASR_ENGINE == "whisperx" else False), ), - output: Union[str, None] = Query(default="txt", enum=["txt", "vtt", "srt", "tsv", "json"]), + output: Union[str, None] = Query(default="txt", enum=["txt", "vtt", "srt", "tsv", "json", "all"]), ): + # Set environment variables for output directory and audio filename if needed for "all" output + if output == "all": + os.environ["OUTPUT_DIR"] = CONFIG.TEMP_DIR if hasattr(CONFIG, "TEMP_DIR") else "/tmp" + os.environ["AUDIO_FILENAME"] = audio_file.filename + + # Process the audio file with the ASR model result = asr_model.transcribe( load_audio(audio_file.file, encode), task, @@ -98,13 +105,40 @@ async def asr( {"diarize": diarize, "min_speakers": min_speakers, "max_speakers": max_speakers}, output, ) + + # For "all" output format (zip file) + if output == "all": + # Get the bytes from the generator + zip_bytes = next(result) + + # Create a temporary file to save the zip + with tempfile.NamedTemporaryFile(delete=False, suffix=".zip") as temp_file: + temp_file.write(zip_bytes) + temp_path = temp_file.name + + # Create a nice filename for the download + base_filename = os.path.basename(audio_file.filename) + download_filename = f"{os.path.splitext(base_filename)[0]}_transcripts.zip" + + # Use FastAPI's FileResponse to serve the file + return FileResponse( + path=temp_path, + filename=download_filename, + media_type="application/zip", + headers={"Asr-Engine": CONFIG.ASR_ENGINE} + ) + + # For other formats, continue using StreamingResponse + # Set the appropriate content type based on output format + content_type = "text/plain" + if output == "json": + content_type = "application/json" + + # Return the streaming response for text-based formats return StreamingResponse( result, - media_type="text/plain", - headers={ - "Asr-Engine": CONFIG.ASR_ENGINE, - "Content-Disposition": f'attachment; filename="{quote(audio_file.filename)}.{output}"', - }, + media_type=content_type, + headers={"Asr-Engine": CONFIG.ASR_ENGINE} ) diff --git a/docker-compose-cpu.yml b/docker-compose-cpu.yml new file mode 100644 index 0000000..5fb912b --- /dev/null +++ b/docker-compose-cpu.yml @@ -0,0 +1,17 @@ +version: "3.4" + +services: + whisper-asr-webservice: + build: + context: . + dockerfile: Dockerfile + environment: + - ASR_MODEL=base + ports: + - "9000:9000" + volumes: + - ./app:/app/app + - cache-whisper:/root/.cache + +volumes: + cache-whisper: \ No newline at end of file diff --git a/docker-compose.yml b/docker-compose.yml index 2fab6cb..bc8f326 100644 --- a/docker-compose.yml +++ b/docker-compose.yml @@ -1,17 +1,42 @@ -version: "3.4" - services: whisper-asr-webservice: build: context: . 
- dockerfile: Dockerfile - environment: - - ASR_MODEL=base + dockerfile: Dockerfile.gpu + restart: unless-stopped + env_file: .env ports: - "9000:9000" + dns: + - 172.20.0.2 # CoreDNS server IP volumes: - ./app:/app/app + - ./data:/data/whisper - cache-whisper:/root/.cache + - huggingface-cache:/root/.cache/huggingface + networks: + - monitoring + - xinference-network + - core-dns_core-network + - whisper-network + deploy: + resources: + reservations: + devices: + - driver: nvidia + count: 1 + capabilities: [gpu] + +networks: + whisper-network: + driver: bridge + monitoring: + driver: bridge + xinference-network: + driver: bridge + core-dns_core-network: + external: true volumes: cache-whisper: + huggingface-cache: \ No newline at end of file diff --git a/example.env b/example.env new file mode 100644 index 0000000..d4026cd --- /dev/null +++ b/example.env @@ -0,0 +1,25 @@ +# Whisper ASR Webservice Environment Configuration + +# Model configuration +ASR_ENGINE=whisperx +ASR_MODEL=large-v3 + +# Performance tuning +COMPUTE_TYPE=float32 +NUM_WORKERS=4 +BATCH_SIZE=16 +BEAM_SIZE=4 + +# Whisper X Settings +SUBTITLE_MAX_LINE_WIDTH=1000 +SUBTITLE_MAX_LINE_COUNT=2 +SUBTITLE_HIGHLIGHT_WORDS=true + + +# System settings +# 16000 the optimal for Whisper to work with Audio files +# 24000k seens ti be working for now but incase of issues lower it to 16000 +SAMPLE_RATE=24000 +HF_HOME=/root/.cache/huggingface +CPU_THREADS=4 + diff --git a/whisper-asr-webservice-main/.dockerignore b/whisper-asr-webservice-main/.dockerignore new file mode 100644 index 0000000..59e2f83 --- /dev/null +++ b/whisper-asr-webservice-main/.dockerignore @@ -0,0 +1,3 @@ +.git +.venv +venv \ No newline at end of file diff --git a/whisper-asr-webservice-main/.github/FUNDING.yml b/whisper-asr-webservice-main/.github/FUNDING.yml new file mode 100644 index 0000000..b558be2 --- /dev/null +++ b/whisper-asr-webservice-main/.github/FUNDING.yml @@ -0,0 +1,4 @@ +# These are supported funding model platforms + +github: [ahmetoner] +custom: ['https://bmc.link/ahmetoner'] diff --git a/whisper-asr-webservice-main/.github/workflows/docker-publish.yml b/whisper-asr-webservice-main/.github/workflows/docker-publish.yml new file mode 100644 index 0000000..571d2b8 --- /dev/null +++ b/whisper-asr-webservice-main/.github/workflows/docker-publish.yml @@ -0,0 +1,46 @@ +name: Publish Docker Image +on: + push: + tags: + - '*' + branches: + - debug + +env: + DOCKER_USER: ${{secrets.DOCKER_USER}} + DOCKER_PASSWORD: ${{secrets.DOCKER_PASSWORD}} + REPO_NAME: ${{secrets.REPO_NAME}} +jobs: + build: + runs-on: ubuntu-latest + strategy: + matrix: + include: + - docker_file: Dockerfile + platforms: linux/arm64,linux/amd64 + - docker_file: Dockerfile.gpu + tag_extension: -gpu + platforms: linux/amd64 + steps: + - name: Checkout + uses: actions/checkout@v3 + - name: Set up QEMU + uses: docker/setup-qemu-action@v1 + - name: Set up Docker Buildx + uses: docker/setup-buildx-action@v1 + - name: Login to DockerHub + uses: docker/login-action@v1 + with: + username: ${{ secrets.DOCKER_USER }} + password: ${{ secrets.DOCKER_PASSWORD }} + - name: Build and Publish the Docker debug image + if: github.ref == 'refs/heads/debug' + run: | + DOCKER_IMAGE_DEBUG=$DOCKER_USER/$REPO_NAME:debug${{ matrix.tag_extension }} + docker buildx build . 
--no-cache --platform=${{ matrix.platforms }} -t "${DOCKER_IMAGE_DEBUG}" -f ${{ matrix.docker_file }} --push + - name: Build and Publish the Docker image + if: github.ref != 'refs/heads/debug' + run: | + DOCKER_IMAGE_LATEST=$DOCKER_USER/$REPO_NAME:latest${{ matrix.tag_extension }} + DOCKER_IMAGE_VERSION=$DOCKER_USER/$REPO_NAME:$GITHUB_REF_NAME${{ matrix.tag_extension }} + docker buildx build . --no-cache --platform=${{ matrix.platforms }} -t "${DOCKER_IMAGE_LATEST}" -t "${DOCKER_IMAGE_VERSION}" -f ${{ matrix.docker_file }} --push diff --git a/whisper-asr-webservice-main/.github/workflows/documentation.yml b/whisper-asr-webservice-main/.github/workflows/documentation.yml new file mode 100644 index 0000000..03db5a8 --- /dev/null +++ b/whisper-asr-webservice-main/.github/workflows/documentation.yml @@ -0,0 +1,27 @@ +name: Documentation +on: + push: + tags: + - '*' + branches: + - docs +permissions: + contents: write +jobs: + deploy: + runs-on: ubuntu-latest + if: github.event.repository.fork == false + steps: + - uses: actions/checkout@v4 + - uses: actions/setup-python@v4 + with: + python-version: 3.x + - run: echo "cache_id=$(date --utc '+%V')" >> $GITHUB_ENV + - uses: actions/cache@v3 + with: + key: mkdocs-material-${{ env.cache_id }} + path: .cache + restore-keys: | + mkdocs-material- + - run: pip install mkdocs-material pymdown-extensions + - run: mkdocs gh-deploy --force diff --git a/whisper-asr-webservice-main/.gitignore b/whisper-asr-webservice-main/.gitignore new file mode 100644 index 0000000..35e5869 --- /dev/null +++ b/whisper-asr-webservice-main/.gitignore @@ -0,0 +1,44 @@ +*.pyc + +# Packages +*.egg +!/tests/**/*.egg +/*.egg-info +/dist/* +build +_build +.cache +*.so +venv + +# Installer logs +pip-log.txt + +# Unit test / coverage reports +.coverage +.pytest_cache + +.DS_Store +.idea/* +.python-version +.vscode/* + +/test.py +/test_*.* + +/setup.cfg +MANIFEST.in +/setup.py +/docs/site/* +/tests/fixtures/simple_project/setup.py +/tests/fixtures/project_with_extras/setup.py +.mypy_cache + +.venv +/releases/* +pip-wheel-metadata +/poetry.toml + +poetry/core/* + +public
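
For reference, a hedged client-side sketch of the new output=all flow. It assumes the service is reachable at http://localhost:9000 on the /asr route (the handler shown in app/webservice.py; the route decorator is not part of this diff) and that the third-party requests package is installed. Parameter and form-field names follow the signature in this patch; on the server side the download name is built from the uploaded file's basename plus _transcripts.zip.

import requests  # assumed third-party dependency, not part of this repo


def download_all_transcripts(audio_path: str, out_path: str) -> str:
    # POST the audio and ask for every format at once; the service answers
    # with application/zip produced by WriteAll.create_zip_bytes.
    with open(audio_path, "rb") as f:
        resp = requests.post(
            "http://localhost:9000/asr",  # assumed host/port/route from the README
            params={"task": "transcribe", "output": "all", "encode": "true"},
            files={"audio_file": f},
            timeout=600,
        )
    resp.raise_for_status()
    with open(out_path, "wb") as out:
        out.write(resp.content)
    return out_path


if __name__ == "__main__":
    # The server would name this download sample_transcripts.zip for sample.wav;
    # here we simply pick a local filename for the saved archive.
    print(download_all_transcripts("sample.wav", "sample_transcripts.zip"))
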