Convert Video to an SRT Subtitle File with Google Speech-to-Text, Cloud Storage, and a Jupyter Notebook

Renee LIN
3 min readOct 20, 2021

--

I used the SpeechRecognition library to transcribe speech into text, but found that the audio length it accepts is limited by Google (SpeechRecognition uses Google's API under the hood). Because I want to transcribe longer videos, I decided to use Google's API directly. Besides, this way you get a plain-text file as well as an SRT file.

Preparation

Open a Google Cloud account and set up the following:

1. Cloud Storage — the audio file is saved here.

2. Enable the Speech-to-Text API and generate a user key. You might need to watch the official tutorial to figure it out: "Level Up — Automated Subtitles with AI", https://www.youtube.com/watch?v=uBzp5xGSZ6o

3. I use Colab too, so I save my API key and Jupyter notebook on Google Drive.

# Install/upgrade the Google Cloud client libraries used in this notebook.
%pip install --upgrade google-cloud-speech
%pip install --upgrade google-cloud-storage
# Mount Google Drive, because the API JSON key is stored there.
# Skip this step if you use a local environment.
from google.colab import drive
drive.mount('/content/gdrive')
# Ensure the folder holding the key file is accessible.
!ls /content/gdrive/'My Drive'/'Colab Notebooks'/temp
# Point GOOGLE_APPLICATION_CREDENTIALS at the key file
# (refer to Google's tutorial for how to create and save the key).
import os
os.environ["GOOGLE_APPLICATION_CREDENTIALS"]="/content/gdrive/My Drive/Colab Notebooks/temp/temp_speech.json"
# Ensure the path is set correctly.
!echo $GOOGLE_APPLICATION_CREDENTIALS

Connect to the cloud storage

# Authenticate the Colab user before using the gcloud/gsutil commands below.
from google.colab import auth
from google.cloud import storage
auth.authenticate_user()
# Project ID handed to `gcloud config set project`.
project_id = 'prac-259701'
!gcloud config set project {project_id}
# Quick check that gsutil can list storage for this project.
!gsutil ls
# Bucket name used by the connection test (the same name appears in the
# gs:// URI of the audio file).
bucket_name = 'sage_ff14'

Test whether the connection works

# Test: check that we can connect to Cloud Storage.
from google.cloud import storage
# Same project and bucket values as in the previous cell, repeated here.
project_id = 'prac-259701'
bucket_name = 'sage_ff14'
def list_blobs(bucket_name):
    """Print the name of every blob found in ``bucket_name``.

    The storage client is created for the module-level ``project_id``.
    """
    # Note: Client.list_blobs requires at least package version 1.17.0.
    client = storage.Client(project_id)
    for item in client.list_blobs(bucket_name):
        print(item.name)
# Run the connection check: prints the blob names in the configured bucket.
list_blobs(bucket_name)

Now we can start using the service

# `srt` is used below to build the subtitle entries and compose the .srt file.
!pip install srt
import srt
from google.cloud import speech
# Parameters about the audio file; these are sent in the recognition config.
sample_rate_hertz = 44100
language_code = "en-US"
audio_channel_count = 2
encoding = 'LINEAR16'
# Base name (without extension) of the generated .srt and .txt files.
out_file = "subtitle"
# Character count above which break_sentences starts a new subtitle.
max_chars = 40

Below is Google's official code for requesting the Speech-to-Text service; I only made a few small adjustments.

1. Function to call the service
def long_running_recognize(uri):
    """Transcribe a long audio file from Cloud Storage into subtitle entries.

    Uses asynchronous (long-running) speech recognition and blocks until the
    operation has finished.

    Args:
        uri: URI of the audio file in GCS, e.g. gs://[BUCKET]/[FILE].

    Returns:
        A list of srt.Subtitle entries, built by break_sentences from the
        first alternative of every result in the response.
    """
    client = speech.SpeechClient()

    # The audio properties come from the module-level parameters. Word time
    # offsets are requested because break_sentences reads each word's
    # start_time and end_time.
    config = {
        "enable_word_time_offsets": True,
        "enable_automatic_punctuation": True,
        "sample_rate_hertz": sample_rate_hertz,
        "language_code": language_code,
        "audio_channel_count": audio_channel_count,
        "encoding": encoding,
    }

    # Use the `uri` argument: the previous code read the notebook-global
    # `storage_uri` here, so the value passed by the caller was ignored.
    operation = client.long_running_recognize(
        config=config,
        audio={"uri": uri},
    )
    response = operation.result()

    subs = []
    for result in response.results:
        # First alternative is the most probable result.
        subs = break_sentences(subs, result.alternatives[0])

    print("Transcribing finished")
    return subs

2. Break the long text into sentences, because the SRT file needs them

def break_sentences(subs, alternative):
    """Split one recognition alternative into subtitle entries.

    Words are accumulated into one subtitle until a word contains ``.``,
    ``!`` or ``?``, the accumulated word characters exceed the module-level
    ``max_chars``, or a word that is not the first of the subtitle contains
    ``,``. Words left over after the last such break are emitted as a final
    subtitle instead of being dropped.

    Args:
        subs: List of srt.Subtitle entries built so far; extended in place.
        alternative: Recognition alternative whose ``words`` each expose
            ``word``, ``start_time`` and ``end_time``.

    Returns:
        The same ``subs`` list; new entries are numbered on from its length
        at the time of the call.
    """
    idx = len(subs) + 1
    words = []       # stripped words of the subtitle being built
    charcount = 0    # word characters so far (separating spaces not counted)
    start = None
    end = None

    for w in alternative.words:
        if not words:
            # First word in the sentence: record the start time.
            start = w.start_time
        end = w.end_time
        charcount += len(w.word)
        words.append(w.word.strip())

        ends_sentence = "." in w.word or "!" in w.word or "?" in w.word
        # A comma only breaks the line when it is not on the first word.
        comma_break = "," in w.word and len(words) > 1

        if ends_sentence or charcount > max_chars or comma_break:
            # Joining the words avoids the leading space that string
            # concatenation used to put in front of every subtitle.
            subs.append(srt.Subtitle(
                index=idx,
                start=start,
                end=end,
                content=srt.make_legal_content(" ".join(words)),
            ))
            idx += 1
            words = []
            charcount = 0

    if words:
        # Trailing words that never hit a break condition would otherwise be
        # lost; emit them, ending at the last word's end time.
        subs.append(srt.Subtitle(
            index=idx,
            start=start,
            end=end,
            content=srt.make_legal_content(" ".join(words)),
        ))

    return subs

3. Save the files

def write_srt(subs, path=None):
    """Write the subtitles to an SRT file.

    Args:
        subs: Subtitle entries accepted by ``srt.compose``.
        path: Destination file. Defaults to the module-level ``out_file``
            with a ``.srt`` extension.

    Returns:
        None.
    """
    srt_file = out_file + ".srt" if path is None else path
    print("Writing {} subtitles to: {}".format(language_code, srt_file))
    # The context manager closes the file even if composing or writing fails.
    # UTF-8 is set explicitly so non-ASCII transcripts do not depend on the
    # platform's default encoding.
    with open(srt_file, 'w', encoding="utf-8") as f:
        # srt.compose returns a single string, so write it in one call
        # rather than passing it to writelines().
        f.write(srt.compose(subs))
def write_txt(subs, path=None):
    """Write the subtitle text to a plain-text file, one subtitle per line.

    Args:
        subs: Iterable of entries exposing a string ``content`` attribute.
        path: Destination file. Defaults to the module-level ``out_file``
            with a ``.txt`` extension.

    Returns:
        None.
    """
    txt_file = out_file + ".txt" if path is None else path
    print("Writing text to: {}".format(txt_file))
    # The context manager closes the file even if a write fails. UTF-8 is
    # set explicitly so non-ASCII transcripts do not depend on the platform's
    # default encoding.
    with open(txt_file, 'w', encoding="utf-8") as f:
        f.writelines(s.content.strip() + "\n" for s in subs)

Put in the file address, and the files we need will be generated.

# Assemble the gs:// URI of the uploaded audio file and show it.
storage_uri = ''.join(['gs://', 'sage_ff14', '/', 'converted-sage.wav'])
print(storage_uri)

# Transcribe the audio, then save the result as both .srt and .txt.
subs = long_running_recognize(storage_uri)
write_srt(subs)
write_txt(subs)

The complete notebook is here https://github.com/reneelin1712/autoTranslation/blob/efbf264225bc97ca0d408a5dee4077a769d2732c/video2srt_with_google_speech_to_text.ipynb

I need to look into Mozilla DeepSpeech in the future, since the Google API is not free: it only allows 60 minutes of free audio recognition.

--

--

Renee LIN
Renee LIN

Written by Renee LIN

Passionate about web dev and data analysis. Huge FFXIV fan. Interested in health data now.

No responses yet