
File: Broken/Externals/Whisper.py

Broken.Externals.Whisper

BrokenWhisper

Bases: ExternalModelsBase, ExternalTorchBase

Source code in Broken/Externals/Whisper.py
class BrokenWhisper(ExternalModelsBase, ExternalTorchBase):
    class Model(str, BrokenEnum):
        Tiny         = "tiny"
        TinyEN       = "tiny.en"
        Base         = "base"
        BaseEN       = "base.en"
        Small        = "small"
        SmallEN      = "small.en"
        SmallDistEN  = "distil-small.en"
        Medium       = "medium"
        MediumEN     = "medium.en"
        MediumDistEN = "distil-medium.en"
        LargeV1      = "large-v1"
        LargeV2      = "large-v2"
        LargeV3      = "large-v3"
        Large        = "large"
        LargeDist2   = "distil-large-v2"
        LargeDist3   = "distil-large-v3"

    model: Annotated[Model, Option("--model", "-m",
        help="[bold green](🟢 Basic)[/] Model to use for Transcription [green](tiny, base, small, medium, large)[/]")] = \
        Field(Model.LargeV2)

    lowvram: Annotated[bool, Option("--lowvram", "-l",
        help="[bold green](🟢 Basic)[/] Use INT8 instead of FP16 for low VRAM GPUs")] = \
        Field(False)

    def _load_model(self):
        self.load_torch()
        install(packages="faster_whisper")

        # Copy PyPI libcudnn to avoid setting LD_LIBRARY_PATH
        if BrokenPlatform.OnLinux:
            for target in ("libcudnn_ops_infer.so.8", "libcudnn_cnn_infer.so.8"):
                if (libcudnn := Path(f"/usr/lib/{target}")).exists():
                    continue
                for site_packages in site.getsitepackages():
                    if (pycudnn := Path(site_packages)/f"nvidia/cudnn/lib/{target}").exists():
                        log.warning(f"Running FasterWhisper might fail, as ({libcudnn}) doesn't exist")
                        log.warning(f"• Luckily, we can copy it from {pycudnn}")
                        shell("sudo", "cp", pycudnn, libcudnn, confirm=True)
                        break
                else:
                    raise RuntimeError(f"{target} not found in any site-packages")

        # Finally load the model
        log.info(f"Loading OpenAI Whisper model ({self.model.value})")

        self._model = WhisperModel(
            model_size_or_path=self.model.value,
            download_root=(BROKEN.DIRECTORIES.CACHE/"Whisper"),
            compute_type=("int8" if self.lowvram else "float16"),
        )

    def transcribe(self,
        audio: Union[str, Path, numpy.ndarray],
        *,
        reference: Optional[str]=None
    ) -> Spoken:
        if isinstance(audio, str) or isinstance(audio, Path):
            if not (audio := BrokenPath.get(audio)).exists():
                raise RuntimeError(f"Audio file doesn't exist: {audio}")
            audio = str(audio)

        self.load_model()
        spoken = Spoken()

        with Halo(f"Transcribing audio with Whisper model ({self.model}).."):
            for segment in self._model.transcribe(
                audio=audio,
                word_timestamps=True,
                initial_prompt=reference
            )[0]:
                spoken.sentences[(segment.start) : (segment.end + 0.001)] = segment.text.strip()

                for word in segment.words:
                    spoken.words[(word.start) : (word.end + 0.001)] = word.word.strip()
        del self._model
        return spoken
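
A minimal end-to-end sketch, not part of the source: it assumes BrokenWhisper can be constructed with its fields as keyword arguments (it is a pydantic model) and that a local audio file exists at the hypothetical path below.

from Broken.Externals.Whisper import BrokenWhisper

# Hypothetical usage: pick a smaller English-only model and transcribe a file
whisper = BrokenWhisper(model=BrokenWhisper.Model.SmallEN)
spoken = whisper.transcribe("speech.wav")  # "speech.wav" is a placeholder path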

Model

Bases: str, BrokenEnum

Source code in Broken/Externals/Whisper.py
class Model(str, BrokenEnum):
    Tiny         = "tiny"
    TinyEN       = "tiny.en"
    Base         = "base"
    BaseEN       = "base.en"
    Small        = "small"
    SmallEN      = "small.en"
    SmallDistEN  = "distil-small.en"
    Medium       = "medium"
    MediumEN     = "medium.en"
    MediumDistEN = "distil-medium.en"
    LargeV1      = "large-v1"
    LargeV2      = "large-v2"
    LargeV3      = "large-v3"
    Large        = "large"
    LargeDist2   = "distil-large-v2"
    LargeDist3   = "distil-large-v3"
Tiny
Tiny = 'tiny'
TinyEN
TinyEN = 'tiny.en'
Base
Base = 'base'
BaseEN
BaseEN = 'base.en'
Small
Small = 'small'
SmallEN
SmallEN = 'small.en'
SmallDistEN
SmallDistEN = 'distil-small.en'
Medium
Medium = 'medium'
MediumEN
MediumEN = 'medium.en'
MediumDistEN
MediumDistEN = 'distil-medium.en'
LargeV1
LargeV1 = 'large-v1'
LargeV2
LargeV2 = 'large-v2'
LargeV3
LargeV3 = 'large-v3'
Large
Large = 'large'
LargeDist2
LargeDist2 = 'distil-large-v2'
LargeDist3
LargeDist3 = 'distil-large-v3'

model

model: Annotated[
    Model,
    Option(
        "--model",
        "-m",
        help="[bold green](🟢 Basic)[/] Model to use for Transcription [green](tiny, base, small, medium, large)[/]",
    ),
] = Field(Model.LargeV2)

lowvram

lowvram: Annotated[
    bool,
    Option(
        "--lowvram",
        "-l",
        help="[bold green](🟢 Basic)[/] Use INT8 instead of FP16 for low VRAM GPUs",
    ),
] = Field(False)
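
As _load_model shows above, lowvram only switches the compute_type passed to WhisperModel from "float16" to "int8". A short configuration sketch, assuming the fields are settable at construction time:

whisper = BrokenWhisper(
    model=BrokenWhisper.Model.Base,
    lowvram=True,  # loads the model with compute_type="int8" instead of "float16"
)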

transcribe

transcribe(
    audio: Union[str, Path, numpy.ndarray],
    *,
    reference: Optional[str] = None
) -> Spoken
Source code in Broken/Externals/Whisper.py
def transcribe(self,
    audio: Union[str, Path, numpy.ndarray],
    *,
    reference: Optional[str]=None
) -> Spoken:
    if isinstance(audio, str) or isinstance(audio, Path):
        if not (audio := BrokenPath.get(audio)).exists():
            raise RuntimeError(f"Audio file doesn't exist: {audio}")
        audio = str(audio)

    self.load_model()
    spoken = Spoken()

    with Halo(f"Transcribing audio with Whisper model ({self.model}).."):
        for segment in self._model.transcribe(
            audio=audio,
            word_timestamps=True,
            initial_prompt=reference
        )[0]:
            spoken.sentences[(segment.start) : (segment.end + 0.001)] = segment.text.strip()

            for word in segment.words:
                spoken.words[(word.start) : (word.end + 0.001)] = word.word.strip()
    del self._model
    return spoken
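
The returned Spoken object holds one interval per segment in sentences and one per word in words, as the slice assignments above show; the reference argument is forwarded to faster_whisper as initial_prompt. A small sketch of reading the result in order, assuming the intervaltree package's Interval objects (begin, end, data):

spoken = whisper.transcribe("speech.wav", reference="Names or jargon expected in the audio")

# Iterate sentences chronologically (sorted() orders intervals by their begin time)
for interval in sorted(spoken.sentences):
    print(f"{interval.begin:7.2f}s {interval.end:7.2f}s  {interval.data}")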

Spoken

Bases: BaseModel

Source code in Broken/Externals/Whisper.py
class Spoken(BaseModel):
    model_config = ConfigDict(arbitrary_types_allowed=True)
    sentences: IntervalTree[Segment] = Field(default_factory=IntervalTree)
    words: IntervalTree[Word] = Field(default_factory=IntervalTree)

model_config

model_config = ConfigDict(arbitrary_types_allowed=True)

sentences

sentences: IntervalTree[Segment] = Field(
    default_factory=IntervalTree
)

words

words: IntervalTree[Word] = Field(
    default_factory=IntervalTree
)
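
Because sentences and words are IntervalTrees keyed by time, point lookups are direct. A minimal sketch, assuming the intervaltree package's point query tree[t], which returns the intervals containing t:

# Which word is being spoken at t = 12.5 seconds? (hypothetical timestamp)
for interval in spoken.words[12.5]:
    print(interval.data, interval.begin, interval.end)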