# This Python script performs AI text-to-speech synthesis using ElevenLabs. It takes a list of prompts as input and generates an individual .mp3 file for each prompt. It also automatically performs forced alignment on the output speech using WebMAUS, giving you accompanying TextGrid files with word-level, syllable-level, and phoneme-level annotations.
# If you want to get started using this script, please see our how-to-generate-speech guide for hands-on instructions.
# You can also download the script as a .py file.
### Hans Rutger Bosker, Radboud University
### Date: 11 November 2024, run in Python 3.11.5 on Windows 11
### License: CC BY 4.0
import requests # to send HTTP requests
import json # for parsing the JSON data that is received from Elevenlabs
import base64 # for audio
import os # for changing the working directory and removing files
import pandas as pd # for handling input from an Excel file
import xml.etree.ElementTree as ET # to parse the XML response from webmaus
import tgt # to parse and access TextGrids
# IMPORTANT: pandas often uses openpyxl to read Excel files, so make sure to have the 'openpyxl' module installed as well!
# Set the working directory to the location of the script file, so that the
# relative input/output paths used by this script resolve next to the script
script_dir = os.path.dirname(os.path.abspath(__file__))
os.chdir(script_dir)

# Path to the input file
input_excel_path = "input.xlsx"

# Voice ID to use
VOICE_ID = "9BWtsMINqrJLrRacOk9x"  # Aria

# Construct the URL for the Text-to-Speech API request.
# The "with-timestamps" endpoint returns JSON holding both the audio and
# character-level timing information.
tts_url = f"https://api.elevenlabs.io/v1/text-to-speech/{VOICE_ID}/with-timestamps"

# Your personal API Key for Elevenlabs
XI_API_KEY = "paste_your_own_API_key_here"

# HTTP headers sent with every Text-to-Speech request
headers = {
    "Accept": "application/json",
    "xi-api-key": XI_API_KEY,
    "Content-Type": "application/json",
}

# WebMAUSBasic parameters
# NOTE(review): endpoint restored from the BAS Web Services documentation;
# confirm it is still the current service address.
webmaus_url = "https://clarin.phonetik.uni-muenchen.de/BASWebServices/services/runMAUSBasic"
webmaus_language = 'eng-US'

# runPho2Syl parameters
# NOTE(review): endpoint restored from the BAS Web Services documentation;
# confirm it is still the current service address.
runpho2syl_url = "https://clarin.phonetik.uni-muenchen.de/BASWebServices/services/runPho2Syl"
runpho2syl_input_tier = "MAU"
runpho2syl_wsync = "yes"
runpho2syl_outsym = "sampa"
runpho2syl_output_format = "tg"  # Output format: TextGrid
# Function to convert JSON to TextGrid format (for alignment data)
def json_to_textgrid(json_data, output_path):
    """Write character-level alignment data to a long-format TextGrid file.

    The resulting TextGrid holds a single interval tier named "Characters"
    with one interval per character.

    Args:
        json_data: Mapping with three parallel lists under the keys
            "characters", "character_start_times_seconds" and
            "character_end_times_seconds".
        output_path: Path of the TextGrid file to create; an existing file
            is overwritten.

    Raises:
        KeyError: If one of the three keys is missing from json_data.
        IndexError: If the lists are empty, or the timing lists are shorter
            than the character list.
    """
    characters = json_data["characters"]
    start_times = json_data["character_start_times_seconds"]
    end_times = json_data["character_end_times_seconds"]

    # The TextGrid spans from the onset of the first character
    # to the offset of the last one
    xmin = start_times[0]
    xmax = end_times[-1]

    # Collect the pieces in a list and join once at the end
    parts = [
        'File type = "ooTextFile"\nObject class = "TextGrid"\n\n',
        f'xmin = {xmin}\n',
        f'xmax = {xmax}\n',
        'tiers? <exists>\n',
        'size = 1\n',
        'item []:\n',
        ' item [1]:\n',
        ' class = "IntervalTier"\n',
        ' name = "Characters"\n',
        f' xmin = {xmin}\n',
        f' xmax = {xmax}\n',
        f' intervals: size = {len(characters)}\n',
    ]
    for j, char in enumerate(characters):
        # Double quotes inside a TextGrid label are escaped by doubling them
        text = char.replace('"', '""')
        parts.append(f' intervals [{j + 1}]:\n')
        parts.append(f' xmin = {start_times[j]}\n')
        parts.append(f' xmax = {end_times[j]}\n')
        parts.append(f' text = "{text}"\n')

    # Write to the output file. UTF-8 is set explicitly so that non-ASCII
    # characters do not depend on the platform's default encoding.
    with open(output_path, 'w', encoding='utf-8') as file:
        file.write(''.join(parts))
    #print(f"TextGrid file written to {output_path}")
# Function to parse and combine TextGrid files with TextGridTools (tgt)
def combine_textgrids_tgt(textgrid_path_1, textgrid_path_2, output_path):
    """Merge the tiers of two TextGrid files and save them as one TextGrid.

    Both input files are read completely before anything is written, so
    output_path may be the same file as one of the inputs.

    Args:
        textgrid_path_1: Path of the first TextGrid file.
        textgrid_path_2: Path of the second TextGrid file.
        output_path: Path of the combined TextGrid file (short format).
    """
    # Load the TextGrid files using TextGridTools
    textgrids = [
        tgt.read_textgrid(textgrid_path_1),
        tgt.read_textgrid(textgrid_path_2),
    ]

    # Merge all tiers into a new TextGrid.
    # NOTE(review): this relies on tgt's default handling of tiers that share
    # a name across the inputs - confirm against the tgt documentation.
    combined_tg = tgt.util.merge_textgrids(textgrids)

    # Write the combined TextGrid to the output file
    tgt.write_to_file(combined_tg, output_path, format='short')
    print(f"TextGrid file saved as {output_path}")
# Read the input Excel file using pandas
# (each row needs a "filename" and a "prompt" column)
df = pd.read_excel(input_excel_path)

# Main loop for processing the data: one row = one prompt = one set of output files
for _, row in df.iterrows():
    item = row["filename"]
    prompt = row["prompt"]

    # Data payload for the API request
    tts_payload = {
        "text": prompt,
        "model_id": "eleven_turbo_v2",
        "voice_settings": {
            "stability": 0.5,
            "similarity_boost": 0.75,
            "style": 0.0,
            "use_speaker_boost": True,
        },
    }

    # Make the POST request to the TTS API with headers and data
    response = requests.post(tts_url, headers=headers, json=tts_payload, stream=True)
    if response.status_code != 200:
        print(f"Error encountered for prompt '{prompt}', status: {response.status_code}, content: {response.text}")
        # Without a successful response there is no audio to save or align,
        # so move on to the next prompt
        continue

    response_dict = json.loads(response.content.decode("utf-8"))

    # Decode the base64-encoded audio data
    audio_bytes = base64.b64decode(response_dict["audio_base64"])

    # Save the input text to a text file using 'item' as the filename.
    # UTF-8 is set explicitly so the transcript does not depend on the
    # platform's default encoding.
    output_text_path = f"{item}.txt"
    with open(output_text_path, "w", encoding="utf-8") as outfile:
        outfile.write(prompt)
    print(f"Text file saved as {output_text_path}")

    # Save the audio file using 'item' as the filename
    output_audio_path = f"{item}.mp3"
    with open(output_audio_path, 'wb') as f:
        f.write(audio_bytes)
    print(f"Audio file saved as {output_audio_path}")

    # ??? Do you want to save the character-based alignment data as JSON? Then use this:
    #output_json_path = f"{item}.json"
    #with open(output_json_path, "w") as outfile:
    #    outfile.write(json.dumps(response_dict['alignment'], indent=4))
    #print(f"Alignment data saved as {output_json_path}")

    # Now convert JSON to TextGrid...
    # ...this file is later overwritten by the combined TextGrid, once the
    # forced-alignment data from WebMAUSBasic have been added
    output_textgrid_path_1 = f"{item}.TextGrid"
    json_to_textgrid(response_dict['alignment'], output_textgrid_path_1)

    # Step 1: Run WebMAUSBasic on the mp3 and txt pairs.
    # Both files are opened in binary mode so they are uploaded byte-for-byte.
    with open(output_audio_path, 'rb') as audio_file, open(output_text_path, 'rb') as text_file:
        response = requests.post(
            webmaus_url,
            files={'SIGNAL': audio_file, 'TEXT': text_file},
            data={'LANGUAGE': webmaus_language, 'OUTFORMAT': 'bpf'},
        )
    if response.status_code != 200:
        print(f"Failed to process the WebMAUS request. Status code: {response.status_code}")
        continue

    # Parse the XML response; findtext() yields None instead of raising
    # when the <downloadLink> element is absent
    download_link = ET.fromstring(response.content).findtext(".//downloadLink")
    if not download_link:
        print("Download link not found in the response.")
        continue

    # Download the BPF file using the download link
    output_filename_webmaus = f"{item}_webmaus.par"
    download_response = requests.get(download_link)
    if download_response.status_code != 200:
        print("Failed to download the WebMAUS BPF file.")
        continue
    with open(output_filename_webmaus, 'wb') as f:
        f.write(download_response.content)
    #print(f"WebMAUS BPF file downloaded as {output_filename_webmaus}")

    # Step 2: Use the .par BPF file as input to runPho2Syl
    with open(output_filename_webmaus, 'rb') as bpf_file:
        response_pho2syl = requests.post(
            runpho2syl_url,
            files={'i': bpf_file},
            data={
                'wsync': runpho2syl_wsync,
                'lng': webmaus_language,
                'tier': runpho2syl_input_tier,
                'outsym': runpho2syl_outsym,
                'oform': runpho2syl_output_format,
            },
        )

    if response_pho2syl.status_code == 200:
        # Parse the XML response to find download link for runPho2Syl
        download_link_pho2syl = ET.fromstring(response_pho2syl.content).findtext(".//downloadLink")
        if download_link_pho2syl:
            # Download the runPho2Syl output file
            output_filename_pho2syl = f"{item}_sylls.TextGrid"
            download_response_pho2syl = requests.get(download_link_pho2syl)
            if download_response_pho2syl.status_code == 200:
                with open(output_filename_pho2syl, 'wb') as f:
                    f.write(download_response_pho2syl.content)

                # Combine both TextGrid files using TextGridTools (tgt)
                output_combined_textgrid_path = f"{item}.TextGrid"
                combine_textgrids_tgt(output_textgrid_path_1, output_filename_pho2syl, output_combined_textgrid_path)

                # Remove the intermediate output files containing only partial output
                # NOTE(review): assumed to be the syllable TextGrid here and the
                # .par file below - confirm this is the intended clean-up.
                os.remove(output_filename_pho2syl)
            else:
                print("Failed to download the runPho2Syl output file.")
        else:
            print("Download link not found in the runPho2Syl response.")
    else:
        print(f"Failed to process the runPho2Syl request. Status code: {response_pho2syl.status_code}")

    # Remove the intermediate output files containing only partial output
    os.remove(output_filename_webmaus)
# End of script