From 547846df89310e84b3427f4fa04090fa3de57d99 Mon Sep 17 00:00:00 2001 From: Bolor Date: Fri, 6 Mar 2026 13:23:36 -0800 Subject: [PATCH 1/4] adding scorer without pydub --- pyrit/score/audio_transcript_scorer.py | 127 +++++++++++++------------ 1 file changed, 67 insertions(+), 60 deletions(-) diff --git a/pyrit/score/audio_transcript_scorer.py b/pyrit/score/audio_transcript_scorer.py index b0d0ad2a92..31b6290671 100644 --- a/pyrit/score/audio_transcript_scorer.py +++ b/pyrit/score/audio_transcript_scorer.py @@ -1,8 +1,11 @@ # Copyright (c) Microsoft Corporation. # Licensed under the MIT license. +from ast import If import logging import os +import shutil +import subprocess import tempfile import uuid from abc import ABC @@ -16,6 +19,24 @@ logger = logging.getLogger(__name__) +def _check_ffmpeg_installed() -> bool: + """ + Check if ffmpeg is installed and available on PATH. + FFmpeg is required for scoring audio content in videos + + Returns: + bool: True if ffmpeg is installed, False otherwise. + """ + + if shutil.which("ffmpeg") is None: + # raise RuntimeError( + # "ffmpeg is required for audio processing but was not found on PATH. " + # "Install it via: apt install ffmpeg / brew install ffmpeg / " + # "https://ffmpeg.org/download.html" + # ) + return False + return True + class AudioTranscriptHelper(ABC): # noqa: B024 """ Abstract base class for audio scorers that process audio by transcribing and scoring the text. @@ -29,7 +50,6 @@ class AudioTranscriptHelper(ABC): # noqa: B024 _DEFAULT_SAMPLE_RATE = 16000 # 16kHz - Azure Speech optimal rate _DEFAULT_CHANNELS = 1 # Mono - Azure Speech prefers mono _DEFAULT_SAMPLE_WIDTH = 2 # 16-bit audio (2 bytes per sample) - _DEFAULT_EXPORT_PARAMS = ["-acodec", "pcm_s16le"] # 16-bit PCM for best compatibility def __init__( self, @@ -173,23 +193,30 @@ def _ensure_wav_format(self, audio_path: str) -> str: str: Path to WAV file (original if already WAV, or converted temporary file). Raises: - ModuleNotFoundError: If pydub is not installed. + RuntimeError: If ffmpeg is not installed. """ - try: - from pydub import AudioSegment - except ModuleNotFoundError as e: - logger.error("Could not import pydub. Install it via 'pip install pydub'") - raise e - - audio = AudioSegment.from_file(audio_path) - audio = ( - audio.set_frame_rate(self._DEFAULT_SAMPLE_RATE) - .set_channels(self._DEFAULT_CHANNELS) - .set_sample_width(self._DEFAULT_SAMPLE_WIDTH) - ) + + if not _check_ffmpeg_installed(): + raise RuntimeError( + "ffmpeg is required for audio processing but was not found on PATH. " + "Install it via: apt install ffmpeg / brew install ffmpeg / " + "https://ffmpeg.org/download.html" + ) + with tempfile.NamedTemporaryFile(suffix=".wav", delete=False) as temp_wav: - audio.export(temp_wav.name, format="wav") - return temp_wav.name + output_path = temp_wav.name + subprocess.run( + [ + "ffmpeg", "-i", audio_path, + "-ar", str(self._DEFAULT_SAMPLE_RATE), + "-ac", str(self._DEFAULT_CHANNELS), + "-acodec", "pcm_s16le", # 16-bit PCM + output_path, "-y", + ], + check=True, + capture_output=True, + ) + return output_path def _extract_audio_from_video(self, video_path: str) -> Optional[str]: """ @@ -203,7 +230,7 @@ def _extract_audio_from_video(self, video_path: str) -> Optional[str]: or returns None if extraction fails. Raises: - ModuleNotFoundError: If pydub/ffmpeg is not installed. + RuntimeError: If ffmpeg is not installed. """ return AudioTranscriptHelper.extract_audio_from_video(video_path) @@ -220,55 +247,35 @@ def extract_audio_from_video(video_path: str) -> Optional[str]: or returns None if extraction fails. Raises: - ModuleNotFoundError: If pydub/ffmpeg is not installed. + RuntimeError: If ffmpeg is not installed. """ - try: - from pydub import AudioSegment - except ModuleNotFoundError as e: - logger.error("Could not import pydub. Install it via 'pip install pydub'") - raise e + if not _check_ffmpeg_installed(): + raise RuntimeError( + "ffmpeg is required for audio processing but was not found on PATH. " + "Install it via: apt install ffmpeg / brew install ffmpeg / " + "https://ffmpeg.org/download.html" + ) try: - # Extract audio from video using pydub (requires ffmpeg) logger.info(f"Extracting audio from video: {video_path}") - audio = AudioSegment.from_file(video_path) + with tempfile.NamedTemporaryFile(suffix="_video_audio.wav", delete=False) as temp_audio: + output_path = temp_audio.name + subprocess.run( + [ + "ffmpeg", "-i", video_path, + "-ar", str(AudioTranscriptHelper._DEFAULT_SAMPLE_RATE), + "-ac", str(AudioTranscriptHelper._DEFAULT_CHANNELS), + "-acodec", "pcm_s16le", # 16-bit PCM + output_path, "-y", + ], + check=True, + capture_output=True, + ) logger.info( - f"Audio extracted: duration={len(audio)}ms, channels={audio.channels}, " - f"sample_width={audio.sample_width}, frame_rate={audio.frame_rate}" + f"Audio exported to: {output_path} " + f"(rate={AudioTranscriptHelper._DEFAULT_SAMPLE_RATE}Hz, mono)" ) - - # Optimize for Azure Speech recognition: - # Azure Speech works best with 16kHz mono audio (same as Azure TTS output) - if audio.frame_rate != AudioTranscriptHelper._DEFAULT_SAMPLE_RATE: - logger.info( - f"Resampling audio from {audio.frame_rate}Hz to {AudioTranscriptHelper._DEFAULT_SAMPLE_RATE}Hz" - ) - audio = audio.set_frame_rate(AudioTranscriptHelper._DEFAULT_SAMPLE_RATE) - - # Ensure 16-bit audio - if audio.sample_width != AudioTranscriptHelper._DEFAULT_SAMPLE_WIDTH: - logger.info( - f"Converting sample width from {audio.sample_width * 8}-bit" - f" to {AudioTranscriptHelper._DEFAULT_SAMPLE_WIDTH * 8}-bit" - ) - audio = audio.set_sample_width(AudioTranscriptHelper._DEFAULT_SAMPLE_WIDTH) - - # Convert to mono (Azure Speech prefers mono) - if audio.channels > AudioTranscriptHelper._DEFAULT_CHANNELS: - logger.info(f"Converting from {audio.channels} channels to mono") - audio = audio.set_channels(AudioTranscriptHelper._DEFAULT_CHANNELS) - - # Create temporary WAV file with PCM encoding for best compatibility - with tempfile.NamedTemporaryFile(suffix="_video_audio.wav", delete=False) as temp_audio: - audio.export( - temp_audio.name, - format="wav", - parameters=AudioTranscriptHelper._DEFAULT_EXPORT_PARAMS, - ) - logger.info( - f"Audio exported to: {temp_audio.name} (duration={len(audio)}ms, rate={audio.frame_rate}Hz, mono)" - ) - return temp_audio.name + return output_path except Exception as e: logger.warning(f"Failed to extract audio from video {video_path}: {e}") return None From 9f4d569769fb47dc7ff877600cd0115c83656aa0 Mon Sep 17 00:00:00 2001 From: Bolor Date: Sat, 7 Mar 2026 09:41:11 -0800 Subject: [PATCH 2/4] getting rid of pydub --- pyrit/score/audio_transcript_scorer.py | 12 +----------- 1 file changed, 1 insertion(+), 11 deletions(-) diff --git a/pyrit/score/audio_transcript_scorer.py b/pyrit/score/audio_transcript_scorer.py index 31b6290671..cb97be1404 100644 --- a/pyrit/score/audio_transcript_scorer.py +++ b/pyrit/score/audio_transcript_scorer.py @@ -1,7 +1,6 @@ # Copyright (c) Microsoft Corporation. # Licensed under the MIT license. -from ast import If import logging import os import shutil @@ -22,20 +21,11 @@ def _check_ffmpeg_installed() -> bool: """ Check if ffmpeg is installed and available on PATH. - FFmpeg is required for scoring audio content in videos Returns: bool: True if ffmpeg is installed, False otherwise. """ - - if shutil.which("ffmpeg") is None: - # raise RuntimeError( - # "ffmpeg is required for audio processing but was not found on PATH. " - # "Install it via: apt install ffmpeg / brew install ffmpeg / " - # "https://ffmpeg.org/download.html" - # ) - return False - return True + return shutil.which("ffmpeg") is not None class AudioTranscriptHelper(ABC): # noqa: B024 """ From 4780255ec35d42831a87ce990bb184a0e49356d3 Mon Sep 17 00:00:00 2001 From: Bolor Date: Sat, 7 Mar 2026 11:20:44 -0800 Subject: [PATCH 3/4] precommit --- pyrit/score/audio_transcript_scorer.py | 41 ++++++++++++++++---------- 1 file changed, 25 insertions(+), 16 deletions(-) diff --git a/pyrit/score/audio_transcript_scorer.py b/pyrit/score/audio_transcript_scorer.py index cb97be1404..8e2630d20b 100644 --- a/pyrit/score/audio_transcript_scorer.py +++ b/pyrit/score/audio_transcript_scorer.py @@ -27,6 +27,7 @@ def _check_ffmpeg_installed() -> bool: """ return shutil.which("ffmpeg") is not None + class AudioTranscriptHelper(ABC): # noqa: B024 """ Abstract base class for audio scorers that process audio by transcribing and scoring the text. @@ -185,23 +186,28 @@ def _ensure_wav_format(self, audio_path: str) -> str: Raises: RuntimeError: If ffmpeg is not installed. """ - if not _check_ffmpeg_installed(): raise RuntimeError( "ffmpeg is required for audio processing but was not found on PATH. " "Install it via: apt install ffmpeg / brew install ffmpeg / " "https://ffmpeg.org/download.html" ) - + with tempfile.NamedTemporaryFile(suffix=".wav", delete=False) as temp_wav: output_path = temp_wav.name subprocess.run( [ - "ffmpeg", "-i", audio_path, - "-ar", str(self._DEFAULT_SAMPLE_RATE), - "-ac", str(self._DEFAULT_CHANNELS), - "-acodec", "pcm_s16le", # 16-bit PCM - output_path, "-y", + "ffmpeg", + "-i", + audio_path, + "-ar", + str(self._DEFAULT_SAMPLE_RATE), + "-ac", + str(self._DEFAULT_CHANNELS), + "-acodec", + "pcm_s16le", # 16-bit PCM + output_path, + "-y", ], check=True, capture_output=True, @@ -252,19 +258,22 @@ def extract_audio_from_video(video_path: str) -> Optional[str]: output_path = temp_audio.name subprocess.run( [ - "ffmpeg", "-i", video_path, - "-ar", str(AudioTranscriptHelper._DEFAULT_SAMPLE_RATE), - "-ac", str(AudioTranscriptHelper._DEFAULT_CHANNELS), - "-acodec", "pcm_s16le", # 16-bit PCM - output_path, "-y", + "ffmpeg", + "-i", + video_path, + "-ar", + str(AudioTranscriptHelper._DEFAULT_SAMPLE_RATE), + "-ac", + str(AudioTranscriptHelper._DEFAULT_CHANNELS), + "-acodec", + "pcm_s16le", # 16-bit PCM + output_path, + "-y", ], check=True, capture_output=True, ) - logger.info( - f"Audio exported to: {output_path} " - f"(rate={AudioTranscriptHelper._DEFAULT_SAMPLE_RATE}Hz, mono)" - ) + logger.info(f"Audio exported to: {output_path} (rate={AudioTranscriptHelper._DEFAULT_SAMPLE_RATE}Hz, mono)") return output_path except Exception as e: logger.warning(f"Failed to extract audio from video {video_path}: {e}") From f80b6ba998f476ee26915af793277715697dadbc Mon Sep 17 00:00:00 2001 From: Bolor Date: Wed, 11 Mar 2026 15:10:28 -0700 Subject: [PATCH 4/4] replace with pyav --- pyproject.toml | 1 + pyrit/score/audio_transcript_scorer.py | 142 +++++++++++++------------ 2 files changed, 76 insertions(+), 67 deletions(-) diff --git a/pyproject.toml b/pyproject.toml index ed9ab048ed..5e3d4ddced 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -135,6 +135,7 @@ speech = [ # all includes all functional dependencies excluding the ones from the "dev" extra all = [ "accelerate>=1.7.0", + "av>=14.0.0", "azure-ai-ml>=1.27.1", "azure-cognitiveservices-speech>=1.44.0", "azureml-mlflow>=1.60.0", diff --git a/pyrit/score/audio_transcript_scorer.py b/pyrit/score/audio_transcript_scorer.py index 8e2630d20b..5d28cfb911 100644 --- a/pyrit/score/audio_transcript_scorer.py +++ b/pyrit/score/audio_transcript_scorer.py @@ -3,13 +3,13 @@ import logging import os -import shutil -import subprocess import tempfile import uuid from abc import ABC from typing import Optional +import av + from pyrit.memory import CentralMemory from pyrit.models import MessagePiece, Score from pyrit.prompt_converter import AzureSpeechAudioToTextConverter @@ -18,14 +18,74 @@ logger = logging.getLogger(__name__) -def _check_ffmpeg_installed() -> bool: +def _is_compliant_wav(input_path: str, *, sample_rate: int, channels: int) -> bool: + """ + Check if the audio file is already a compliant WAV with the target format. + + Args: + input_path (str): Path to the audio file. + sample_rate (int): Expected sample rate in Hz. + channels (int): Expected number of channels. + + Returns: + bool: True if the file is already compliant, False otherwise. + """ + try: + with av.open(input_path) as container: + if not container.streams.audio: + return False + stream = container.streams.audio[0] + codec_name = stream.codec_context.name + is_pcm_s16 = codec_name == "pcm_s16le" + is_correct_rate = stream.rate == sample_rate + is_correct_channels = stream.channels == channels + return is_pcm_s16 and is_correct_rate and is_correct_channels + except Exception: + return False + + +def _audio_to_wav(input_path: str, *, sample_rate: int, channels: int) -> str: """ - Check if ffmpeg is installed and available on PATH. + Convert any audio or video file to a normalised PCM WAV using PyAV. + + If the input is already a compliant WAV (correct sample rate, channels, and codec), + returns the original path without re-encoding. + + Args: + input_path (str): Source audio or video file. + sample_rate (int): Target sample rate in Hz. + channels (int): Target number of channels (1 = mono). Returns: - bool: True if ffmpeg is installed, False otherwise. + str: Path to the WAV file (original if compliant, otherwise a temporary file). """ - return shutil.which("ffmpeg") is not None + # Skip conversion if already compliant + if _is_compliant_wav(input_path, sample_rate=sample_rate, channels=channels): + logger.debug(f"Audio file already compliant, skipping conversion: {input_path}") + return input_path + + layout = "mono" if channels == 1 else "stereo" + with tempfile.NamedTemporaryFile(suffix=".wav", delete=False) as tmp: + output_path = tmp.name + + with av.open(input_path) as in_container: + with av.open(output_path, "w", format="wav") as out_container: + out_stream = out_container.add_stream("pcm_s16le", rate=sample_rate, layout=layout) + resampler = av.AudioResampler(format="s16", layout=layout, rate=sample_rate) + + for frame in in_container.decode(audio=0): + for out_frame in resampler.resample(frame): + for packet in out_stream.encode(out_frame): + out_container.mux(packet) + + for out_frame in resampler.resample(None): + for packet in out_stream.encode(out_frame): + out_container.mux(packet) + + for packet in out_stream.encode(None): + out_container.mux(packet) + + return output_path class AudioTranscriptHelper(ABC): # noqa: B024 @@ -160,7 +220,7 @@ async def _transcribe_audio_async(self, audio_path: str) -> str: logger.info(f"Audio transcription: WAV file size = {file_size} bytes") try: - converter = AzureSpeechAudioToTextConverter() + converter = AzureSpeechAudioToTextConverter(use_entra_auth=True) logger.info("Audio transcription: Starting Azure Speech transcription...") result = await converter.convert_async(prompt=wav_path, input_type="audio_path") logger.info(f"Audio transcription: Result = '{result.output_text}'") @@ -182,37 +242,12 @@ def _ensure_wav_format(self, audio_path: str) -> str: Returns: str: Path to WAV file (original if already WAV, or converted temporary file). - - Raises: - RuntimeError: If ffmpeg is not installed. """ - if not _check_ffmpeg_installed(): - raise RuntimeError( - "ffmpeg is required for audio processing but was not found on PATH. " - "Install it via: apt install ffmpeg / brew install ffmpeg / " - "https://ffmpeg.org/download.html" - ) - - with tempfile.NamedTemporaryFile(suffix=".wav", delete=False) as temp_wav: - output_path = temp_wav.name - subprocess.run( - [ - "ffmpeg", - "-i", - audio_path, - "-ar", - str(self._DEFAULT_SAMPLE_RATE), - "-ac", - str(self._DEFAULT_CHANNELS), - "-acodec", - "pcm_s16le", # 16-bit PCM - output_path, - "-y", - ], - check=True, - capture_output=True, + return _audio_to_wav( + audio_path, + sample_rate=self._DEFAULT_SAMPLE_RATE, + channels=self._DEFAULT_CHANNELS, ) - return output_path def _extract_audio_from_video(self, video_path: str) -> Optional[str]: """ @@ -224,9 +259,6 @@ def _extract_audio_from_video(self, video_path: str) -> Optional[str]: Returns: str: a path to the extracted audio file (WAV format) or returns None if extraction fails. - - Raises: - RuntimeError: If ffmpeg is not installed. """ return AudioTranscriptHelper.extract_audio_from_video(video_path) @@ -241,37 +273,13 @@ def extract_audio_from_video(video_path: str) -> Optional[str]: Returns: str: a path to the extracted audio file (WAV format) or returns None if extraction fails. - - Raises: - RuntimeError: If ffmpeg is not installed. """ - if not _check_ffmpeg_installed(): - raise RuntimeError( - "ffmpeg is required for audio processing but was not found on PATH. " - "Install it via: apt install ffmpeg / brew install ffmpeg / " - "https://ffmpeg.org/download.html" - ) - try: logger.info(f"Extracting audio from video: {video_path}") - with tempfile.NamedTemporaryFile(suffix="_video_audio.wav", delete=False) as temp_audio: - output_path = temp_audio.name - subprocess.run( - [ - "ffmpeg", - "-i", - video_path, - "-ar", - str(AudioTranscriptHelper._DEFAULT_SAMPLE_RATE), - "-ac", - str(AudioTranscriptHelper._DEFAULT_CHANNELS), - "-acodec", - "pcm_s16le", # 16-bit PCM - output_path, - "-y", - ], - check=True, - capture_output=True, + output_path = _audio_to_wav( + video_path, + sample_rate=AudioTranscriptHelper._DEFAULT_SAMPLE_RATE, + channels=AudioTranscriptHelper._DEFAULT_CHANNELS, ) logger.info(f"Audio exported to: {output_path} (rate={AudioTranscriptHelper._DEFAULT_SAMPLE_RATE}Hz, mono)") return output_path