There’s a speech_recognition module in Python that transcribes an audio file — since ffmpeg can convert a video file to mp3, that means you can also use Python to transcribe a video file.
# requires pocketsphinx from CMU if using sphinx for speech to text recognition
import os
import speech_recognition as sr
import ffmpeg
strFFMPEGBinaryLocation = 'c:/tmp/ffmpeg/bin/ffmpeg.exe'
strCurrentDirectory = os.getcwd()
strInputVideo = "\"Z:/Path To/My Video/file.MP4\""
strOutputFileName = "converted.wav"
# Convert mp4 to wav file
strffmpeg_convert_mp4_to_wav = f'{strFFMPEGBinaryLocation} -i {strInputVideo} {strCurrentDirectory}/{strOutputFileName}'
os.system(strffmpeg_convert_mp4_to_wav)
# Run converted wav file through speech recognizer
r = sr.Recognizer()
audio = sr.AudioFile(f'{strCurrentDirectory}/{strOutputFileName}')
with audio as source:
#audio = r.record(source, 90) # Would need API key to process longer audio?
#text = r.recognize_google(audio)
audio = r.record(source)
text = r.recognize_sphinx(audio)
print(text)