Elevenlabs: Generate speech | SPEAC

This Python script performs AI text-to-speech synthesis using Elevenlabs. It takes a list of prompts as input and generates individual .mp3 files for each prompt. It also automatically performs forced-alignment on the output speech using WebMAUS, giving you accompanying TextGrid files with word-level, syllable-level, and phoneme-level annotations.
If you want to get started using this script, please see our how-to guide on generating speech for hands-on instructions.
You can also download the script as a .py file.
################################################################################
### Hans Rutger Bosker, Radboud University
### HansRutger.Bosker@donders.ru.nl
### Date: 11 November 2024, run in Python 3.11.5 on Windows 11
### License: CC BY 4.0
################################################################################

######################################
# IMPORTING LIBRARIES
######################################

import requests # to send HTTP requests 
import json # for parsing the JSON data that is received from Elevenlabs
import base64 # for audio
import os # for changing the working directory and removing files
import pandas as pd # for handling input from an Excel file
import xml.etree.ElementTree as ET # to parse the XML response from webmaus
import tgt # to parse and access TextGrids
# IMPORTANT: pandas often uses openpyxl to read Excel files, so make sure to have the 'openpyxl' module installed as well!



######################################
# SET PARAMETERS
######################################

# Set the working directory to the location of the script file, so that the
# relative input and output paths used below resolve next to the script
script_dir = os.path.dirname(os.path.abspath(__file__))
os.chdir(script_dir)

# Path to the input file
input_excel_path = "input.xlsx"

# Voice ID to use
VOICE_ID = "9BWtsMINqrJLrRacOk9x"  # Aria

# Construct the URL for the Text-to-Speech API request
tts_url = f"https://api.elevenlabs.io/v1/text-to-speech/{VOICE_ID}/with-timestamps"

# Your personal API Key for Elevenlabs.
# Preferably set the ELEVENLABS_API_KEY environment variable so the key never
# ends up in a shared or version-controlled copy of this script; if the
# variable is not set, the value pasted below is used instead.
XI_API_KEY = os.environ.get("ELEVENLABS_API_KEY", "paste_your_own_API_key_here")
headers = {
    "Accept": "application/json",
    "xi-api-key": XI_API_KEY,
    "Content-Type": "application/json"
}

# WebMAUSBasic parameters
webmaus_url = "https://clarin.phonetik.uni-muenchen.de/BASWebServices/services/runMAUSBasic"
webmaus_language = 'eng-US'

# runPho2Syl parameters
runpho2syl_url = "https://clarin.phonetik.uni-muenchen.de/BASWebServices/services/runPho2Syl"
runpho2syl_input_tier = "MAU"
runpho2syl_wsync = "yes"
runpho2syl_outsym = "sampa"
runpho2syl_output_format = "tg"  # Output format: TextGrid



######################################
# DEFINE CUSTOM FUNCTIONS
######################################

# Function to convert JSON to TextGrid format (for alignment data)
def json_to_textgrid(json_data, output_path):
    """Write character-level alignment data as a Praat TextGrid text file.

    The file contains a single interval tier named "Characters" with one
    interval per character. The TextGrid spans from the first start time to
    the last end time.

    Args:
        json_data: Mapping with three index-aligned lists under the keys
            "characters", "character_start_times_seconds" and
            "character_end_times_seconds".
        output_path: Path of the TextGrid file to create; an existing file
            is overwritten.

    Raises:
        KeyError: If one of the three keys is missing from json_data.
        IndexError: If the timing lists are empty or shorter than the
            list of characters.
    """
    characters = json_data["characters"]
    start_times = json_data["character_start_times_seconds"]
    end_times = json_data["character_end_times_seconds"]

    xmin = start_times[0]
    xmax = end_times[-1]

    # Collect the lines first and join once, instead of growing one string
    # with += for every interval.
    lines = [
        'File type = "ooTextFile"',
        'Object class = "TextGrid"',
        '',
        f'xmin = {xmin}',
        f'xmax = {xmax}',
        'tiers? <exists>',
        'size = 1',
        'item []:',
        '    item [1]:',
        '        class = "IntervalTier"',
        '        name = "Characters"',
        f'        xmin = {xmin}',
        f'        xmax = {xmax}',
        f'        intervals: size = {len(characters)}',
    ]

    for j, char in enumerate(characters):
        # A double quote inside a TextGrid string is escaped by doubling it
        text = char.replace('"', '""')
        lines.append(f'        intervals [{j + 1}]:')
        lines.append(f'            xmin = {start_times[j]}')
        lines.append(f'            xmax = {end_times[j]}')
        lines.append(f'            text = "{text}"')

    # Write as UTF-8 explicitly: the platform default (e.g. cp1252 on Windows)
    # cannot represent every character and would not match a UTF-8 reader.
    with open(output_path, 'w', encoding='utf-8') as file:
        file.write('\n'.join(lines) + '\n')

# Function to parse and combine TextGrid files with TextGridTools (tgt)
def combine_textgrids_tgt(textgrid_path_1, textgrid_path_2, output_path):
    """Merge two TextGrid files into one and save the result.

    Both input files are read with TextGridTools, merged via
    tgt.util.merge_textgrids (called with ignore_duplicates=True), and the
    merged TextGrid is written to output_path in tgt's 'short' format.
    A confirmation line is printed once the file has been written.

    Args:
        textgrid_path_1: Path of the first TextGrid file to read.
        textgrid_path_2: Path of the second TextGrid file to read.
        output_path: Path the merged TextGrid is written to.
    """
    # Read both inputs up front, in the order given
    source_paths = (textgrid_path_1, textgrid_path_2)
    loaded_grids = [tgt.read_textgrid(path) for path in source_paths]

    merged_grid = tgt.util.merge_textgrids(textgrids=loaded_grids, ignore_duplicates=True)

    # Save the merged result
    tgt.write_to_file(merged_grid, output_path, format='short')
    print(f"TextGrid file saved as {output_path}")





######################################
# START GENERATING SPEECH + TEXTGRIDS
######################################

# Maximum number of seconds to wait on any single HTTP request, so that one
# unresponsive server cannot stall the whole batch indefinitely
REQUEST_TIMEOUT_SECONDS = 300

# Set to True to also save the raw character-level alignment data returned by
# Elevenlabs as '<filename>.json'
SAVE_ALIGNMENT_JSON = False


def _extract_download_link(xml_bytes):
    """Return the text of the first <downloadLink> element of an XML reply.

    Returns None when the reply is not well-formed XML, when it contains no
    <downloadLink> element, or when that element is empty, so that callers
    can handle every "no link" case with one truthiness check.
    """
    try:
        root = ET.fromstring(xml_bytes)
    except ET.ParseError:
        return None
    # findtext() returns None for a missing element, whereas find(...).text
    # would raise AttributeError before the caller could report the problem.
    return root.findtext(".//downloadLink") or None


def _syllabify_and_merge(item, bpf_path, characters_textgrid_path):
    """Send a BPF file to runPho2Syl and merge the returned TextGrid into '<item>.TextGrid'.

    The downloaded runPho2Syl TextGrid is stored temporarily as
    '<item>_sylls.TextGrid', merged with the character-level TextGrid, and
    removed again. Failures reported by the service are printed and end the
    function early; requests.RequestException propagates to the caller.
    """
    with open(bpf_path, 'rb') as bpf_file:
        response_pho2syl = requests.post(
            runpho2syl_url,
            files={'i': bpf_file},
            data={
                'wsync': runpho2syl_wsync,
                'lng': webmaus_language,
                'tier': runpho2syl_input_tier,
                'outsym': runpho2syl_outsym,
                'oform': runpho2syl_output_format,
            },
            timeout=REQUEST_TIMEOUT_SECONDS,
        )

    if response_pho2syl.status_code != 200:
        print(f"Failed to process the runPho2Syl request. Status code: {response_pho2syl.status_code}")
        return

    download_link_pho2syl = _extract_download_link(response_pho2syl.content)
    if not download_link_pho2syl:
        print("Download link not found in the runPho2Syl response.")
        return

    download_response_pho2syl = requests.get(download_link_pho2syl, timeout=REQUEST_TIMEOUT_SECONDS)
    if download_response_pho2syl.status_code != 200:
        print("Failed to download the runPho2Syl output file.")
        return

    output_filename_pho2syl = f"{item}_sylls.TextGrid"
    with open(output_filename_pho2syl, 'wb') as f:
        f.write(download_response_pho2syl.content)

    try:
        # The combined TextGrid replaces the character-only one: both inputs
        # are read before the output file is written.
        combine_textgrids_tgt(characters_textgrid_path, output_filename_pho2syl, f"{item}.TextGrid")
    finally:
        # Remove the intermediate output file containing only partial output
        os.remove(output_filename_pho2syl)


def _run_forced_alignment(item, audio_path, text_path, characters_textgrid_path):
    """Align one audio/text pair with WebMAUSBasic, then hand the result to runPho2Syl.

    The WebMAUS output (BPF format) is stored temporarily as
    '<item>_webmaus.par' and removed once runPho2Syl has been attempted.
    Failures reported by the service are printed and end the function early;
    requests.RequestException propagates to the caller.
    """
    # The prompt file is written as UTF-8 by the main loop, so read it as such
    with open(audio_path, 'rb') as audio_file, open(text_path, 'r', encoding='utf-8') as text_file:
        response = requests.post(
            webmaus_url,
            files={'SIGNAL': audio_file, 'TEXT': text_file},
            data={'LANGUAGE': webmaus_language, 'OUTFORMAT': 'bpf'},
            timeout=REQUEST_TIMEOUT_SECONDS,
        )

    if response.status_code != 200:
        print(f"Failed to process the WebMAUS request. Status code: {response.status_code}")
        return

    download_link = _extract_download_link(response.content)
    if not download_link:
        print("Download link not found in the response.")
        return

    download_response = requests.get(download_link, timeout=REQUEST_TIMEOUT_SECONDS)
    if download_response.status_code != 200:
        print("Failed to download the WebMAUS BPF file.")
        return

    output_filename_webmaus = f"{item}_webmaus.par"
    with open(output_filename_webmaus, 'wb') as f:
        f.write(download_response.content)

    try:
        # Step 2: Use the .par BPF file as input to runPho2Syl
        _syllabify_and_merge(item, output_filename_webmaus, characters_textgrid_path)
    finally:
        # Remove the intermediate output file containing only partial output
        os.remove(output_filename_webmaus)


# Read the input Excel file using pandas; the 'filename' and 'prompt' columns are used
df = pd.read_excel(input_excel_path)

# Main loop for processing the data
for _, row in df.iterrows():
    item = row["filename"]
    prompt = row["prompt"]

    # Data payload for the API request
    payload = {
        "text": prompt,
        "model_id": "eleven_turbo_v2",
        "voice_settings": {
            "stability": 0.5,
            "similarity_boost": 0.75,
            "style": 0.0,
            "use_speaker_boost": True
        }
    }

    # Make the POST request to the TTS API with headers and data.
    # A network failure for one prompt should not abort the remaining prompts.
    try:
        response = requests.post(tts_url, headers=headers, json=payload, timeout=REQUEST_TIMEOUT_SECONDS)
    except requests.RequestException as error:
        print(f"Error encountered for prompt '{prompt}': {error}")
        continue

    if response.status_code != 200:
        print(f"Error encountered for prompt '{prompt}', status: {response.status_code}, content: {response.text}")
        continue

    json_string = response.content.decode("utf-8")
    response_dict = json.loads(json_string)

    # Decode the base64-encoded audio data
    audio_bytes = base64.b64decode(response_dict["audio_base64"])

    # Save the input text to a text file using 'item' as the filename.
    # UTF-8 is set explicitly so prompts with non-ASCII characters are
    # written identically on every platform.
    output_text_path = f"{item}.txt"
    with open(output_text_path, "w", encoding="utf-8") as outfile:
        outfile.write(prompt)
    print(f"Text file saved as {output_text_path}")

    # Save the audio file using 'item' as the filename
    output_audio_path = f"{item}.mp3"
    with open(output_audio_path, 'wb') as f:
        f.write(audio_bytes)
    print(f"Audio file saved as {output_audio_path}")

    # Optionally save character-based alignment data to a JSON file using 'item' as the filename
    if SAVE_ALIGNMENT_JSON:
        output_json_path = f"{item}.json"
        with open(output_json_path, "w", encoding="utf-8") as outfile:
            outfile.write(json.dumps(response_dict['alignment'], indent=4))
        print(f"Alignment data saved as {output_json_path}")

    # Convert the character alignment to a TextGrid. This file is replaced by
    # the combined TextGrid once the forced-alignment tiers are available; if
    # forced alignment fails, the character-only TextGrid is what remains.
    output_textgrid_path_1 = f"{item}.TextGrid"
    json_to_textgrid(response_dict['alignment'], output_textgrid_path_1)

    # Run WebMAUSBasic and runPho2Syl on the mp3 and txt pair
    try:
        _run_forced_alignment(item, output_audio_path, output_text_path, output_textgrid_path_1)
    except requests.RequestException as error:
        print(f"Forced alignment failed for '{item}': {error}")

################################################################################
# End of script
################################################################################