The Text to Speech (TTS) API of the Speech service converts your text into natural-sounding speech. In this tutorial, we will generate speech by sending HTTP POST requests to the Speech service. There, the text is synthesized into speech and returned as an audio file. A variety of voices and languages are supported.
The Microsoft Text-to-Speech service offers more than 75 voices in more than 45 languages and locales. For the details of the voices supported, see Supported languages. If you want a unique voice for your application, you can create custom voice fonts from your own speech samples.
Prerequisites
To use the Microsoft Text-to-Speech service you need to create a Cognitive Services API account with access to the Speech APIs. If you don’t have an Azure subscription, you can create a trial account. You need the access key provided when you activate your free trial, or you may use a paid subscription key from your Azure dashboard.
How to convert Text to Speech
- Create a new C# project in Visual Studio
- Create a new class file named TextToSpeechClient.cs
- Add the following code
using System;
using System.Collections.Generic;
using System.IO;
using System.Net;
using System.Net.Http;
using System.Text;
using System.Threading;
using System.Threading.Tasks;
using System.Xml.Linq;

namespace TextToSpeech
{
    /// <summary>
    /// Fetches an access token from the token-issuing endpoint using a subscription key
    /// and renews it on a timer. Dispose the instance to stop the renewal timer.
    /// </summary>
    public class Authentication : IDisposable
    {
        // Access token expires every 10 minutes. Renew it every 9 minutes.
        private const int RefreshTokenDuration = 9;

        private readonly string AccessUri;
        private readonly string apiKey;
        private readonly Timer accessTokenRenewer;

        // Written by the timer callback and read by callers on other threads;
        // volatile makes the latest token visible to readers.
        private volatile string accessToken;

        /// <summary>
        /// Requests an initial token synchronously and schedules the first renewal.
        /// </summary>
        /// <param name="issueTokenUri">The URI of the token-issuing endpoint.</param>
        /// <param name="apiKey">The subscription key sent in the Ocp-Apim-Subscription-Key header.</param>
        /// <exception cref="WebException">The initial token request failed.</exception>
        public Authentication(string issueTokenUri, string apiKey)
        {
            this.AccessUri = issueTokenUri;
            this.apiKey = apiKey;
            this.accessToken = HttpPost(issueTokenUri, this.apiKey);

            // One-shot timer (period -1 ms); the callback re-arms it after each renewal attempt.
            accessTokenRenewer = new Timer(
                new TimerCallback(OnTokenExpiredCallback),
                this,
                TimeSpan.FromMinutes(RefreshTokenDuration),
                TimeSpan.FromMilliseconds(-1));
        }

        /// <summary>
        /// Returns the most recently obtained access token.
        /// </summary>
        public string GetAccessToken()
        {
            return this.accessToken;
        }

        /// <summary>
        /// Stops the renewal timer.
        /// </summary>
        public void Dispose()
        {
            accessTokenRenewer.Dispose();
        }

        /// <summary>
        /// Requests a fresh token and replaces the current one.
        /// </summary>
        private void RenewAccessToken()
        {
            this.accessToken = HttpPost(AccessUri, this.apiKey);

            // Deliberately does not print the subscription key or the token: both are credentials.
            Console.WriteLine("Renewed access token.");
        }

        /// <summary>
        /// Timer callback: renews the token, then re-arms the timer whether or not renewal succeeded.
        /// </summary>
        private void OnTokenExpiredCallback(object stateInfo)
        {
            try
            {
                RenewAccessToken();
            }
            catch (Exception ex)
            {
                Console.WriteLine(string.Format("Failed renewing access token. Details: {0}", ex.Message));
            }
            finally
            {
                try
                {
                    accessTokenRenewer.Change(TimeSpan.FromMinutes(RefreshTokenDuration), TimeSpan.FromMilliseconds(-1));
                }
                catch (Exception ex)
                {
                    // Also reached when the timer has already been disposed.
                    Console.WriteLine(string.Format("Failed to reschedule the timer to renew access token. Details: {0}", ex.Message));
                }
            }
        }

        /// <summary>
        /// Sends an empty POST carrying the subscription key and returns the response body as UTF-8 text.
        /// </summary>
        private static string HttpPost(string accessUri, string apiKey)
        {
            WebRequest webRequest = WebRequest.Create(accessUri);
            webRequest.Method = "POST";
            webRequest.ContentLength = 0;
            webRequest.Headers["Ocp-Apim-Subscription-Key"] = apiKey;

            using (WebResponse webResponse = webRequest.GetResponse())
            using (Stream stream = webResponse.GetResponseStream())
            using (StreamReader reader = new StreamReader(stream, Encoding.UTF8))
            {
                return reader.ReadToEnd();
            }
        }
    }

    /// <summary>
    /// Generic event args
    /// </summary>
    /// <typeparam name="T">Any type T</typeparam>
    public class GenericEventArgs<T> : EventArgs
    {
        /// <summary>
        /// Initializes a new instance of the <see cref="GenericEventArgs{T}" /> class.
        /// </summary>
        /// <param name="eventData">The event data.</param>
        public GenericEventArgs(T eventData)
        {
            this.EventData = eventData;
        }

        /// <summary>
        /// Gets the event data.
        /// </summary>
        public T EventData { get; private set; }
    }

    /// <summary>
    /// Gender of the voice.
    /// </summary>
    public enum Gender
    {
        Female,
        Male
    }

    /// <summary>
    /// Voice output formats.
    /// </summary>
    public enum AudioOutputFormat
    {
        /// <summary>
        /// raw-8khz-8bit-mono-mulaw request output audio format type.
        /// </summary>
        Raw8Khz8BitMonoMULaw,

        /// <summary>
        /// raw-16khz-16bit-mono-pcm request output audio format type.
        /// </summary>
        Raw16Khz16BitMonoPcm,

        /// <summary>
        /// riff-8khz-8bit-mono-mulaw request output audio format type.
        /// </summary>
        Riff8Khz8BitMonoMULaw,

        /// <summary>
        /// riff-16khz-16bit-mono-pcm request output audio format type.
        /// </summary>
        Riff16Khz16BitMonoPcm,

        /// <summary>
        /// ssml-16khz-16bit-mono-silk request output audio format type.
        /// It is a SSML with audio segment, with audio compressed by SILK codec
        /// </summary>
        Ssml16Khz16BitMonoSilk,

        /// <summary>
        /// raw-16khz-16bit-mono-truesilk request output audio format type.
        /// Audio compressed by SILK codec
        /// </summary>
        Raw16Khz16BitMonoTrueSilk,

        /// <summary>
        /// ssml-16khz-16bit-mono-tts request output audio format type.
        /// It is a SSML with audio segment, and it needs tts engine to play out
        /// </summary>
        Ssml16Khz16BitMonoTts,

        /// <summary>
        /// audio-16khz-128kbitrate-mono-mp3 request output audio format type.
        /// </summary>
        Audio16Khz128KBitRateMonoMp3,

        /// <summary>
        /// audio-16khz-64kbitrate-mono-mp3 request output audio format type.
        /// </summary>
        Audio16Khz64KBitRateMonoMp3,

        /// <summary>
        /// audio-16khz-32kbitrate-mono-mp3 request output audio format type.
        /// </summary>
        Audio16Khz32KBitRateMonoMp3,

        /// <summary>
        /// audio-16khz-16kbps-mono-siren request output audio format type.
        /// </summary>
        Audio16Khz16KbpsMonoSiren,

        /// <summary>
        /// riff-16khz-16kbps-mono-siren request output audio format type.
        /// </summary>
        Riff16Khz16KbpsMonoSiren,

        /// <summary>
        /// raw-24khz-16bit-mono-truesilk request output audio format type.
        /// </summary>
        Raw24Khz16BitMonoTrueSilk,

        /// <summary>
        /// raw-24khz-16bit-mono-pcm request output audio format type.
        /// </summary>
        Raw24Khz16BitMonoPcm,

        /// <summary>
        /// riff-24khz-16bit-mono-pcm request output audio format type.
        /// </summary>
        Riff24Khz16BitMonoPcm,

        /// <summary>
        /// audio-24khz-48kbitrate-mono-mp3 request output audio format type.
        /// </summary>
        Audio24Khz48KBitRateMonoMp3,

        /// <summary>
        /// audio-24khz-96kbitrate-mono-mp3 request output audio format type.
        /// </summary>
        Audio24Khz96KBitRateMonoMp3,

        /// <summary>
        /// audio-24khz-160kbitrate-mono-mp3 request output audio format type.
        /// </summary>
        Audio24Khz160KBitRateMonoMp3
    }

    /// <summary>
    /// Sample synthesize request: posts SSML to the TTS endpoint and raises events with the result.
    /// Dispose the instance to release the underlying HTTP client.
    /// </summary>
    public class Synthesize : IDisposable
    {
        private const string ContentTypeHeader = "Content-Type";
        private const string SsmlMediaType = "application/ssml+xml";

        private HttpClient client;
        private HttpClientHandler handler;

        /// <summary>
        /// Initializes a new instance of the <see cref="Synthesize"/> class.
        /// </summary>
        public Synthesize()
        {
            handler = new HttpClientHandler() { CookieContainer = new CookieContainer(), UseProxy = false };
            client = new HttpClient(handler);
        }

        /// <summary>
        /// Called when a TTS request has been completed and audio is available.
        /// The handler takes ownership of the stream and is responsible for disposing it.
        /// </summary>
        public event EventHandler<GenericEventArgs<Stream>> OnAudioAvailable;

        /// <summary>
        /// Called when an error has occurred, e.g. this could be an HTTP error.
        /// </summary>
        public event EventHandler<GenericEventArgs<Exception>> OnError;

        /// <summary>
        /// Releases the HTTP client and its handler.
        /// </summary>
        public void Dispose()
        {
            client.Dispose();
            handler.Dispose();
        }

        /// <summary>
        /// Sends the specified text to the TTS service. On success raises <see cref="OnAudioAvailable"/>
        /// with the response audio stream; on failure raises <see cref="OnError"/>.
        /// The returned task completes only after the event handlers have run.
        /// </summary>
        /// <param name="cancellationToken">The cancellation token.</param>
        /// <param name="inputOptions">The request URI, voice, text and headers for this request.</param>
        /// <returns>A task that completes when the request has been processed.</returns>
        /// <exception cref="ArgumentNullException"><paramref name="inputOptions"/> is null.</exception>
        /// <exception cref="OperationCanceledException">The caller cancelled via <paramref name="cancellationToken"/>.</exception>
        public async Task Speak(CancellationToken cancellationToken, InputOptions inputOptions)
        {
            if (inputOptions == null)
            {
                throw new ArgumentNullException("inputOptions");
            }

            string genderValue = inputOptions.VoiceType == Gender.Male ? "Male" : "Female";
            string ssml = GenerateSsml(inputOptions.Locale, genderValue, inputOptions.VoiceName, inputOptions.Text);

            try
            {
                using (var request = new HttpRequestMessage(HttpMethod.Post, inputOptions.RequestUri))
                {
                    request.Content = new StringContent(ssml, Encoding.UTF8, SsmlMediaType);

                    // Headers go on the request rather than on client.DefaultRequestHeaders so that
                    // concurrent calls do not overwrite each other's headers.
                    foreach (var header in inputOptions.Headers)
                    {
                        if (string.Equals(header.Key, ContentTypeHeader, StringComparison.OrdinalIgnoreCase))
                        {
                            // Content-Type is a content header; it is rejected by the request header collection.
                            request.Content.Headers.Remove(ContentTypeHeader);
                            request.Content.Headers.TryAddWithoutValidation(ContentTypeHeader, header.Value);
                        }
                        else
                        {
                            request.Headers.TryAddWithoutValidation(header.Key, header.Value);
                        }
                    }

                    HttpResponseMessage response = await client
                        .SendAsync(request, HttpCompletionOption.ResponseHeadersRead, cancellationToken)
                        .ConfigureAwait(false);

                    // Once the stream is handed to subscribers they own it; until then we must clean up.
                    bool handedOff = false;
                    try
                    {
                        Console.WriteLine("Response status code: [{0}]", response.StatusCode);

                        if (!response.IsSuccessStatusCode)
                        {
                            this.Error(new GenericEventArgs<Exception>(
                                new HttpRequestException(String.Format("Service returned {0}", response.StatusCode))));
                            return;
                        }

                        Stream httpStream = await response.Content.ReadAsStreamAsync().ConfigureAwait(false);
                        handedOff = this.OnAudioAvailable != null;
                        this.AudioAvailable(new GenericEventArgs<Stream>(httpStream));
                    }
                    finally
                    {
                        if (!handedOff)
                        {
                            response.Dispose();
                        }
                    }
                }
            }
            catch (Exception e)
            {
                // Caller-requested cancellation propagates; everything else (including HttpClient
                // timeouts, which also surface as OperationCanceledException) is reported via OnError.
                if (e is OperationCanceledException && cancellationToken.IsCancellationRequested)
                {
                    throw;
                }

                this.Error(new GenericEventArgs<Exception>(e.GetBaseException()));
            }
        }

        /// <summary>
        /// Generates SSML.
        /// </summary>
        /// <param name="locale">The locale.</param>
        /// <param name="gender">The gender.</param>
        /// <param name="name">The voice name.</param>
        /// <param name="text">The text input.</param>
        /// <returns>The SSML document as a string; <paramref name="text"/> is XML-escaped by the serializer.</returns>
        private string GenerateSsml(string locale, string gender, string name, string text)
        {
            var ssmlDoc = new XDocument(
                new XElement("speak",
                    new XAttribute("version", "1.0"),
                    new XAttribute(XNamespace.Xml + "lang", "en-US"),
                    new XElement("voice",
                        new XAttribute(XNamespace.Xml + "lang", locale),
                        new XAttribute(XNamespace.Xml + "gender", gender),
                        new XAttribute("name", name),
                        text)));
            return ssmlDoc.ToString();
        }

        /// <summary>
        /// Raises <see cref="OnAudioAvailable"/> when a TTS request has completed successfully.
        /// </summary>
        private void AudioAvailable(GenericEventArgs<Stream> e)
        {
            EventHandler<GenericEventArgs<Stream>> handler = this.OnAudioAvailable;
            if (handler != null)
            {
                handler(this, e);
            }
        }

        /// <summary>
        /// Raises <see cref="OnError"/>.
        /// </summary>
        /// <param name="e">The exception</param>
        private void Error(GenericEventArgs<Exception> e)
        {
            EventHandler<GenericEventArgs<Exception>> handler = this.OnError;
            if (handler != null)
            {
                handler(this, e);
            }
        }

        /// <summary>
        /// Inputs Options for the TTS Service.
        /// </summary>
        public class InputOptions
        {
            // Set only through the Headers setter; null means "build the standard headers".
            private IEnumerable<KeyValuePair<string, string>> customHeaders;

            /// <summary>
            /// Initializes a new instance of the <see cref="InputOptions"/> class.
            /// </summary>
            public InputOptions()
            {
                this.Locale = "en-us";
                this.VoiceName = "Microsoft Server Speech Text to Speech Voice (en-US, ZiraRUS)";
                // Default to Riff16Khz16BitMonoPcm output format.
                this.OutputFormat = AudioOutputFormat.Riff16Khz16BitMonoPcm;
            }

            /// <summary>
            /// Gets or sets the request URI.
            /// </summary>
            public Uri RequestUri { get; set; }

            /// <summary>
            /// Gets or sets the audio output format.
            /// </summary>
            public AudioOutputFormat OutputFormat { get; set; }

            /// <summary>
            /// Gets or sets the headers. Unless a custom collection has been assigned, the getter
            /// builds the standard header set from <see cref="OutputFormat"/> and
            /// <see cref="AuthorizationToken"/> on every call.
            /// </summary>
            public IEnumerable<KeyValuePair<string, string>> Headers
            {
                get
                {
                    if (this.customHeaders != null)
                    {
                        return this.customHeaders;
                    }

                    List<KeyValuePair<string, string>> toReturn = new List<KeyValuePair<string, string>>();
                    toReturn.Add(new KeyValuePair<string, string>("Content-Type", "application/ssml+xml"));
                    toReturn.Add(new KeyValuePair<string, string>("X-Microsoft-OutputFormat", GetOutputFormatName(this.OutputFormat)));
                    // authorization Header
                    toReturn.Add(new KeyValuePair<string, string>("Authorization", this.AuthorizationToken));
                    // Refer to the doc
                    toReturn.Add(new KeyValuePair<string, string>("X-Search-AppId", "07D3234E49CE426DAA29772419F436CA"));
                    // Refer to the doc
                    toReturn.Add(new KeyValuePair<string, string>("X-Search-ClientID", "1ECFAE91408841A480F00935DC390960"));
                    // The software originating the request
                    toReturn.Add(new KeyValuePair<string, string>("User-Agent", "TTSClient"));
                    return toReturn;
                }

                set
                {
                    // Stored in a backing field; assigning to the property itself would recurse forever.
                    this.customHeaders = value;
                }
            }

            /// <summary>
            /// Gets or sets the locale.
            /// </summary>
            public String Locale { get; set; }

            /// <summary>
            /// Gets or sets the type of the voice; male/female.
            /// </summary>
            public Gender VoiceType { get; set; }

            /// <summary>
            /// Gets or sets the name of the voice.
            /// </summary>
            public string VoiceName { get; set; }

            /// <summary>
            /// Authorization Token.
            /// </summary>
            public string AuthorizationToken { get; set; }

            /// <summary>
            /// Gets or sets the text.
            /// </summary>
            public string Text { get; set; }

            /// <summary>
            /// Maps an output format to the value sent in the X-Microsoft-OutputFormat header.
            /// Unrecognized values fall back to riff-16khz-16bit-mono-pcm.
            /// </summary>
            private static string GetOutputFormatName(AudioOutputFormat format)
            {
                switch (format)
                {
                    case AudioOutputFormat.Raw16Khz16BitMonoPcm:
                        return "raw-16khz-16bit-mono-pcm";
                    case AudioOutputFormat.Raw8Khz8BitMonoMULaw:
                        return "raw-8khz-8bit-mono-mulaw";
                    case AudioOutputFormat.Riff16Khz16BitMonoPcm:
                        return "riff-16khz-16bit-mono-pcm";
                    case AudioOutputFormat.Riff8Khz8BitMonoMULaw:
                        return "riff-8khz-8bit-mono-mulaw";
                    case AudioOutputFormat.Ssml16Khz16BitMonoSilk:
                        return "ssml-16khz-16bit-mono-silk";
                    case AudioOutputFormat.Raw16Khz16BitMonoTrueSilk:
                        return "raw-16khz-16bit-mono-truesilk";
                    case AudioOutputFormat.Ssml16Khz16BitMonoTts:
                        return "ssml-16khz-16bit-mono-tts";
                    case AudioOutputFormat.Audio16Khz128KBitRateMonoMp3:
                        return "audio-16khz-128kbitrate-mono-mp3";
                    case AudioOutputFormat.Audio16Khz64KBitRateMonoMp3:
                        return "audio-16khz-64kbitrate-mono-mp3";
                    case AudioOutputFormat.Audio16Khz32KBitRateMonoMp3:
                        return "audio-16khz-32kbitrate-mono-mp3";
                    case AudioOutputFormat.Audio16Khz16KbpsMonoSiren:
                        return "audio-16khz-16kbps-mono-siren";
                    case AudioOutputFormat.Riff16Khz16KbpsMonoSiren:
                        return "riff-16khz-16kbps-mono-siren";
                    case AudioOutputFormat.Raw24Khz16BitMonoTrueSilk:
                        return "raw-24khz-16bit-mono-truesilk";
                    case AudioOutputFormat.Raw24Khz16BitMonoPcm:
                        return "raw-24khz-16bit-mono-pcm";
                    case AudioOutputFormat.Riff24Khz16BitMonoPcm:
                        return "riff-24khz-16bit-mono-pcm";
                    case AudioOutputFormat.Audio24Khz48KBitRateMonoMp3:
                        return "audio-24khz-48kbitrate-mono-mp3";
                    case AudioOutputFormat.Audio24Khz96KBitRateMonoMp3:
                        return "audio-24khz-96kbitrate-mono-mp3";
                    case AudioOutputFormat.Audio24Khz160KBitRateMonoMp3:
                        return "audio-24khz-160kbitrate-mono-mp3";
                    default:
                        return "riff-16khz-16bit-mono-pcm";
                }
            }
        }
    }
}
- Then replace the contents of Program.cs with the following code
using System;
using System.IO;
using System.Media;
using System.Threading;

namespace TextToSpeech
{
    /// <summary>
    /// Console sample: authenticates, sends one text-to-speech request and plays the returned audio.
    /// </summary>
    internal class Program
    {
        /// <summary>
        /// Plays the audio stream delivered by the synthesizer, then disposes it.
        /// </summary>
        private static void PlayAudio(object sender, GenericEventArgs<Stream> args)
        {
            Console.WriteLine(args.EventData);

            // For SoundPlayer to be able to play the wav file, it has to be encoded in PCM.
            // Use output audio format AudioOutputFormat.Riff16Khz16BitMonoPcm to do that.
            using (Stream audio = args.EventData)
            using (SoundPlayer player = new SoundPlayer(audio))
            {
                player.PlaySync();
            }
        }

        /// <summary>
        /// Prints the exception reported by the synthesizer.
        /// </summary>
        private static void ErrorHandler(object sender, GenericEventArgs<Exception> e)
        {
            // Print the carried exception; e.ToString() would only print the event-args type name.
            Console.WriteLine("Unable to complete the TTS request: [{0}]", e.EventData);
        }

        private static void Main(string[] args)
        {
            Console.WriteLine("Starting Authentication");
            string accessToken;
            Authentication auth;

            try
            {
                // The constructor performs the token request, so it must be inside the try
                // for an authentication failure to be reported here.
                // Replace the key with your Speech key.
                auth = new Authentication("https://westus.api.cognitive.microsoft.com/sts/v1.0/issueToken", "input your key here");
                accessToken = auth.GetAccessToken();

                // The token is a credential, so it is not echoed to the console.
                Console.WriteLine("Access token acquired.\n");
            }
            catch (Exception ex)
            {
                Console.WriteLine("Failed authentication.");
                Console.WriteLine(ex.ToString());
                Console.WriteLine(ex.Message);
                return;
            }

            Console.WriteLine("Starting TTSSample request code execution.");

            string requestUri = "https://westus.tts.speech.microsoft.com/cognitiveservices/v1";
            var cortana = new Synthesize();

            cortana.OnAudioAvailable += PlayAudio;
            cortana.OnError += ErrorHandler;

            cortana.Speak(CancellationToken.None, new Synthesize.InputOptions()
            {
                RequestUri = new Uri(requestUri),
                // Text to be spoken.
                Text = "I want this text converted to Speech",
                VoiceType = Gender.Female,
                // Refer to the documentation for complete list of supported locales.
                Locale = "en-US",
                // You can also customize the output voice. Refer to the documentation to view the different
                // voices that the TTS service can output.
                // VoiceName = "Microsoft Server Speech Text to Speech Voice (en-US, Jessa24KRUS)",
                VoiceName = "Microsoft Server Speech Text to Speech Voice (en-US, Guy24KRUS)",
                // VoiceName = "Microsoft Server Speech Text to Speech Voice (en-US, ZiraRUS)",
                // Service can return audio in different output format.
                OutputFormat = AudioOutputFormat.Riff24Khz16BitMonoPcm,
                AuthorizationToken = "Bearer " + accessToken,
            }).Wait();

            // Keep the authentication object (and its renewal timer) reachable until the request is done.
            GC.KeepAlive(auth);
        }
    }
}
- Replace the API key with your API key from Azure
- In the InputOptions add the options you like:
Text: The text you want to convert to speech
VoiceType: Here input the desired gender Female or Male
Locale: Input the locale of the text. For a list of available locales check the list here
VoiceName: Choose the Voice you want. For a list of available voices check the list here
OutputFormat: Use the following AudioOutputFormat.Riff24Khz16BitMonoPcm
AuthorizationToken: Add the string “Bearer ” followed by the accessToken you have received
- Run the program to convert your text to speech!