Skip to content

Commit 087e50f

Browse files
HavenDVclaude
andcommitted
feat(audio): AudioResult.WriteWavTo/WriteWavFile + README TTS section
- Promotes the RIFF/WAVE writer that was duplicated between samples/LiveAudioPlayback and samples/AudioRoundTrip into AudioResult.WriteWavTo(stream) / WriteWavFile(path). Sample rate defaults to the value parsed from MimeType. Both samples now call the SDK helper instead of carrying their own copies. - README gains a Text-to-Speech and Speech-to-Text section showing SpeakAsync + GeminiAudioTags + WriteWavFile and the new ISpeechToTextClient (MEAI) round-trip. The features bullet now mentions ISpeechToTextClient and TTS support. - Adds 3 new unit tests covering WAV header, explicit-rate override, and the empty-AudioData guard (15 total Audio tests now pass). Co-Authored-By: Claude Opus 4.7 (1M context) <noreply@anthropic.com>
1 parent 6aa0895 commit 087e50f

5 files changed

Lines changed: 161 additions & 63 deletions

File tree

README.md

Lines changed: 32 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -11,7 +11,8 @@
1111
- Updated and supported automatically if there are no breaking changes
1212
- All modern .NET features - nullability, trimming, NativeAOT, etc.
1313
- Supports .NET Framework/.NET Standard 2.0
14-
- Microsoft.Extensions.AI `IChatClient` and `IEmbeddingGenerator` support
14+
- Microsoft.Extensions.AI `IChatClient`, `IEmbeddingGenerator` and `ISpeechToTextClient` support
15+
- First-class TTS (`SpeakAsync` with Gemini 3.1 Flash TTS), audio-tag controllability, and built-in WAV output
1516

1617
### Usage
1718
```csharp
@@ -26,6 +27,7 @@ The SDK implements [`IChatClient`](https://learn.microsoft.com/en-us/dotnet/api/
2627
```csharp
2728
using Google.Gemini;
2829
using Microsoft.Extensions.AI;
30+
#pragma warning disable MEAI001 // ISpeechToTextClient is an evaluation-only API
2931

3032
// IChatClient
3133
IChatClient chatClient = new GeminiClient(apiKey);
@@ -38,8 +40,37 @@ IEmbeddingGenerator<string, Embedding<float>> generator = new GeminiClient(apiKe
3840
var embeddings = await generator.GenerateAsync(
3941
["Hello, world!"],
4042
new EmbeddingGenerationOptions { ModelId = "gemini-embedding-001" });
43+
44+
// ISpeechToTextClient
45+
ISpeechToTextClient stt = new GeminiClient(apiKey);
46+
using var wav = File.OpenRead("speech.wav");
47+
var transcription = await stt.GetTextAsync(wav);
48+
```
49+
50+
### Text-to-Speech and Speech-to-Text
51+
52+
`SpeakAsync` synthesizes speech with `gemini-3.1-flash-tts-preview` (default) and returns
53+
raw PCM that you can write to disk with the built-in WAV helper:
54+
55+
```csharp
56+
using Google.Gemini;
57+
58+
using var client = new GeminiClient(apiKey);
59+
60+
var result = await client.SpeakAsync(
61+
text: $"{GeminiAudioTags.Cheerful} Hello! {GeminiAudioTags.Excited} This is Gemini.",
62+
voiceName: GeminiVoices.Puck);
63+
64+
Console.WriteLine($"{result.AudioData!.Length} bytes @ {result.SampleRateHz} Hz");
65+
result.WriteWavFile("speech.wav");
4166
```
4267

68+
`GeminiAudioTags` exposes constants for the commonly supported inline audio tags
69+
(emotion / style / delivery / pacing). `GeminiVoices` lists all 30 prebuilt voice
70+
names, and `client.ListTtsModelsAsync()` discovers TTS-capable models at runtime.
71+
See [`docs/guides/tts-and-stt.md`](docs/guides/tts-and-stt.md) and the
72+
[`samples/AudioRoundTrip`](samples/AudioRoundTrip) console for a complete walk-through.
73+
4374
### Live API (Real-time Voice/Video)
4475

4576
The SDK supports the [Gemini Live API](https://ai.google.dev/gemini-api/docs/live-api) for real-time bidirectional voice and video interactions over WebSocket:

samples/AudioRoundTrip/Program.cs

Lines changed: 2 additions & 29 deletions
Original file line numberDiff line numberDiff line change
@@ -39,13 +39,12 @@
3939
return 1;
4040
}
4141

42-
var pcmData = tts.AudioData!;
4342
var sampleRate = tts.SampleRateHz ?? 24000;
44-
Console.WriteLine($" {pcmData.Length:N0} bytes PCM @ {sampleRate} Hz ({pcmData.Length / (double)(sampleRate * 2):F1}s)");
43+
Console.WriteLine($" {tts.AudioData!.Length:N0} bytes PCM @ {sampleRate} Hz ({tts.AudioData.Length / (double)(sampleRate * 2):F1}s)");
4544

4645
// 2) Save as WAV next to the executable so the user can play it.
4746
var wavPath = Path.Combine(Directory.GetCurrentDirectory(), "audio_round_trip.wav");
48-
WriteWavFile(wavPath, pcmData, sampleRate: sampleRate, bitsPerSample: 16, channels: 1);
47+
tts.WriteWavFile(wavPath);
4948
Console.WriteLine($" Saved: {wavPath}");
5049

5150
// 3) Round-trip the audio through the MEAI ISpeechToTextClient interface
@@ -61,29 +60,3 @@
6160
Console.WriteLine($" Text: {response.Text}");
6261

6362
return 0;
64-
65-
static void WriteWavFile(string path, byte[] pcmData, int sampleRate, int bitsPerSample, int channels)
66-
{
67-
var byteRate = sampleRate * channels * bitsPerSample / 8;
68-
var blockAlign = channels * bitsPerSample / 8;
69-
70-
using var fs = System.IO.File.Create(path);
71-
using var writer = new BinaryWriter(fs);
72-
73-
writer.Write("RIFF"u8);
74-
writer.Write(36 + pcmData.Length);
75-
writer.Write("WAVE"u8);
76-
77-
writer.Write("fmt "u8);
78-
writer.Write(16);
79-
writer.Write((short)1);
80-
writer.Write((short)channels);
81-
writer.Write(sampleRate);
82-
writer.Write(byteRate);
83-
writer.Write((short)blockAlign);
84-
writer.Write((short)bitsPerSample);
85-
86-
writer.Write("data"u8);
87-
writer.Write(pcmData.Length);
88-
writer.Write(pcmData);
89-
}

samples/LiveAudioPlayback/Program.cs

Lines changed: 1 addition & 33 deletions
Original file line numberDiff line numberDiff line change
@@ -99,7 +99,7 @@
9999
{
100100
var pcmData = audioStream.ToArray();
101101
var wavPath = Path.Combine(Directory.GetCurrentDirectory(), $"response_{turnNumber:D3}.wav");
102-
WriteWavFile(wavPath, pcmData, sampleRate: 24000, bitsPerSample: 16, channels: 1);
102+
new AudioResult { AudioData = pcmData }.WriteWavFile(wavPath, sampleRate: 24000);
103103
Console.WriteLine($" Audio saved: {wavPath} ({pcmData.Length:N0} bytes PCM, {pcmData.Length / 48000.0:F1}s)\n");
104104
}
105105
else
@@ -109,35 +109,3 @@
109109
}
110110

111111
Console.WriteLine("\nSession ended.");
112-
113-
/// <summary>
114-
/// Writes raw PCM audio data as a WAV file with the specified format.
115-
/// </summary>
116-
static void WriteWavFile(string path, byte[] pcmData, int sampleRate, int bitsPerSample, int channels)
117-
{
118-
var byteRate = sampleRate * channels * bitsPerSample / 8;
119-
var blockAlign = channels * bitsPerSample / 8;
120-
121-
using var fs = System.IO.File.Create(path);
122-
using var writer = new BinaryWriter(fs);
123-
124-
// RIFF header
125-
writer.Write("RIFF"u8);
126-
writer.Write(36 + pcmData.Length); // file size - 8
127-
writer.Write("WAVE"u8);
128-
129-
// fmt sub-chunk
130-
writer.Write("fmt "u8);
131-
writer.Write(16); // sub-chunk size (PCM)
132-
writer.Write((short)1); // audio format (PCM = 1)
133-
writer.Write((short)channels);
134-
writer.Write(sampleRate);
135-
writer.Write(byteRate);
136-
writer.Write((short)blockAlign);
137-
writer.Write((short)bitsPerSample);
138-
139-
// data sub-chunk
140-
writer.Write("data"u8);
141-
writer.Write(pcmData.Length);
142-
writer.Write(pcmData);
143-
}

src/libs/Google.Gemini/Extensions/GeminiClientExtensions.Audio.cs

Lines changed: 72 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -242,4 +242,76 @@ public record AudioResult
242242

243243
return int.TryParse(rateValue, out var rate) ? rate : null;
244244
}
245+
246+
/// <summary>
/// Writes <see cref="AudioData"/> to <paramref name="destination"/> as an uncompressed
/// little-endian PCM WAV (RIFF/WAVE) stream — 16-bit mono unless overridden.
/// Useful for saving Gemini TTS output (which arrives as raw PCM in
/// <c>audio/L16;…;rate=NNN</c>) to a playable file or HTTP response.
/// </summary>
/// <param name="destination">Target stream. Must be writable. Not closed by this method.</param>
/// <param name="sampleRate">Sample rate in Hz; must be positive when supplied. Defaults to <see cref="SampleRateHz"/> or 24000.</param>
/// <param name="channels">Channel count, 1–32767. Defaults to 1 (mono — Gemini TTS is single-channel).</param>
/// <param name="bitsPerSample">Bit depth; a positive multiple of 8. Defaults to 16 (matches Gemini's L16 output).</param>
/// <exception cref="ArgumentNullException"><paramref name="destination"/> is <see langword="null"/>.</exception>
/// <exception cref="ArgumentOutOfRangeException">
/// <paramref name="sampleRate"/>, <paramref name="channels"/> or <paramref name="bitsPerSample"/>
/// cannot be represented in a WAV <c>fmt </c> chunk.
/// </exception>
/// <exception cref="InvalidOperationException"><see cref="AudioData"/> is <see langword="null"/> or empty.</exception>
public void WriteWavTo(
    Stream destination,
    int? sampleRate = null,
    int channels = 1,
    int bitsPerSample = 16)
{
    ArgumentNullException.ThrowIfNull(destination);

    // The header writer narrows channels, block align and bit depth to 16-bit fields.
    // Reject values that would be silently truncated instead of emitting a corrupt header.
    if (sampleRate is <= 0)
    {
        throw new ArgumentOutOfRangeException(nameof(sampleRate), sampleRate, "Sample rate must be positive.");
    }

    if (channels is < 1 or > short.MaxValue)
    {
        throw new ArgumentOutOfRangeException(nameof(channels), channels, "Channel count must be between 1 and 32767.");
    }

    if (bitsPerSample is < 8 or > short.MaxValue || bitsPerSample % 8 != 0)
    {
        throw new ArgumentOutOfRangeException(nameof(bitsPerSample), bitsPerSample, "Bits per sample must be a positive multiple of 8.");
    }

    // Block align (bytes per sample frame) is also a 16-bit header field.
    if ((long)channels * (bitsPerSample / 8) > short.MaxValue)
    {
        throw new ArgumentOutOfRangeException(nameof(channels), channels, "Channel count times bytes per sample must not exceed 32767.");
    }

    if (AudioData is not { Length: > 0 } pcm)
    {
        throw new InvalidOperationException("AudioResult contains no audio data.");
    }

    // Precedence: explicit argument, then the rate parsed from the MIME type, then 24 kHz.
    var effectiveRate = sampleRate ?? SampleRateHz ?? 24000;
    WriteWavHeaderAndBody(destination, pcm, effectiveRate, channels, bitsPerSample);
}
270+
271+
/// <summary>
/// Writes <see cref="AudioData"/> as a WAV file at <paramref name="path"/>.
/// Overwrites the file if it already exists.
/// </summary>
/// <param name="path">Destination file path. Must not be null or empty.</param>
/// <param name="sampleRate">Sample rate in Hz, forwarded to <c>WriteWavTo</c>; <see langword="null"/> selects its default.</param>
/// <param name="channels">Channel count, forwarded to <c>WriteWavTo</c>. Defaults to 1.</param>
/// <param name="bitsPerSample">Bit depth, forwarded to <c>WriteWavTo</c>. Defaults to 16.</param>
/// <exception cref="ArgumentException"><paramref name="path"/> is null or empty.</exception>
/// <exception cref="InvalidOperationException">
/// <see cref="AudioData"/> is <see langword="null"/> or empty. No file is created or truncated in that case.
/// </exception>
public void WriteWavFile(
    string path,
    int? sampleRate = null,
    int channels = 1,
    int bitsPerSample = 16)
{
    ArgumentException.ThrowIfNullOrEmpty(path);

    // File.Create truncates an existing file as soon as it is called, so check for audio
    // first; otherwise a failed call would destroy the previous file or leave a 0-byte one.
    if (AudioData is not { Length: > 0 })
    {
        throw new InvalidOperationException("AudioResult contains no audio data.");
    }

    using var fs = System.IO.File.Create(path);
    WriteWavTo(fs, sampleRate, channels, bitsPerSample);
}
286+
287+
/// <summary>
/// Writes a canonical 44-byte RIFF/WAVE header (format tag 1 = PCM) followed by
/// <paramref name="pcmData"/> to <paramref name="destination"/>. Multi-byte header
/// fields are emitted little-endian by <see cref="BinaryWriter"/>.
/// </summary>
/// <param name="destination">Writable target stream; left open when this method returns.</param>
/// <param name="pcmData">Raw PCM sample bytes, written verbatim as the <c>data</c> chunk.</param>
/// <param name="sampleRate">Sample rate in Hz.</param>
/// <param name="channels">Channel count (stored in a 16-bit field).</param>
/// <param name="bitsPerSample">Bit depth (stored in a 16-bit field).</param>
private static void WriteWavHeaderAndBody(
    Stream destination,
    byte[] pcmData,
    int sampleRate,
    int channels,
    int bitsPerSample)
{
    var byteRate = sampleRate * channels * bitsPerSample / 8;
    var blockAlign = channels * bitsPerSample / 8;

    // RIFF sizes are unsigned 32-bit values. Computing them as uint keeps
    // "36 + length" from wrapping negative for arrays close to the maximum size.
    var dataSize = (uint)pcmData.Length;

    // RIFF chunks are word-aligned: odd-length chunk data is followed by one zero pad
    // byte that counts towards the RIFF size but not towards the chunk's own size field.
    var padSize = dataSize & 1u;

    using var writer = new BinaryWriter(destination, System.Text.Encoding.ASCII, leaveOpen: true);

    // RIFF container header: size is everything after this 8-byte preamble.
    writer.Write("RIFF"u8);
    writer.Write(36u + dataSize + padSize);
    writer.Write("WAVE"u8);

    // "fmt " sub-chunk: 16-byte PCM format description.
    writer.Write("fmt "u8);
    writer.Write(16);
    writer.Write((short)1);
    writer.Write((short)channels);
    writer.Write(sampleRate);
    writer.Write(byteRate);
    writer.Write((short)blockAlign);
    writer.Write((short)bitsPerSample);

    // "data" sub-chunk: the PCM payload itself.
    writer.Write("data"u8);
    writer.Write(dataSize);
    writer.Write(pcmData);
    if (padSize != 0)
    {
        writer.Write((byte)0);
    }

    writer.Flush();
}
245317
}

src/tests/IntegrationTests/Tests.Audio.cs

Lines changed: 54 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -39,4 +39,58 @@ public void AudioResult_SampleRateHz_MirrorsParseResult()
3939
new AudioResult { MimeType = null }.SampleRateHz.Should().BeNull();
4040
new AudioResult { MimeType = "audio/wav" }.SampleRateHz.Should().BeNull();
4141
}
42+
43+
[TestMethod]
public void AudioResult_WriteWavTo_EmitsRiffWaveHeader()
{
    // WAV fields are little-endian by definition, so read them explicitly instead of
    // relying on the host byte order that BitConverter uses.
    static int ReadInt32(byte[] buffer, int offset) =>
        System.Buffers.Binary.BinaryPrimitives.ReadInt32LittleEndian(buffer.AsSpan(offset, 4));
    static int ReadInt16(byte[] buffer, int offset) =>
        System.Buffers.Binary.BinaryPrimitives.ReadInt16LittleEndian(buffer.AsSpan(offset, 2));

    var pcm = new byte[] { 0x01, 0x00, 0x02, 0x00, 0x03, 0x00, 0x04, 0x00 };
    var result = new AudioResult
    {
        AudioData = pcm,
        MimeType = "audio/L16;codec=pcm;rate=16000",
    };

    using var ms = new MemoryStream();
    result.WriteWavTo(ms);

    var bytes = ms.ToArray();
    bytes.Length.Should().Be(44 + pcm.Length, "WAV header is 44 bytes for PCM");

    // RIFF / WAVE / fmt / data chunk IDs in the standard offsets.
    System.Text.Encoding.ASCII.GetString(bytes, 0, 4).Should().Be("RIFF");
    System.Text.Encoding.ASCII.GetString(bytes, 8, 4).Should().Be("WAVE");
    System.Text.Encoding.ASCII.GetString(bytes, 12, 4).Should().Be("fmt ");
    System.Text.Encoding.ASCII.GetString(bytes, 36, 4).Should().Be("data");

    // Size fields: RIFF size excludes the 8-byte "RIFF<size>" preamble; data size is the payload.
    ReadInt32(bytes, 4).Should().Be(36 + pcm.Length);
    ReadInt32(bytes, 40).Should().Be(pcm.Length);

    // fmt chunk: 16-byte PCM descriptor using the defaults (mono, 16-bit) and the parsed rate.
    ReadInt32(bytes, 16).Should().Be(16, "PCM fmt chunk is 16 bytes");
    ReadInt16(bytes, 20).Should().Be(1, "format tag 1 is PCM");
    ReadInt16(bytes, 22).Should().Be(1, "default is mono");
    ReadInt32(bytes, 24).Should().Be(16000, "sample rate comes from the MIME type");
    ReadInt32(bytes, 28).Should().Be(32000, "byte rate = rate * channels * bytes per sample");
    ReadInt16(bytes, 32).Should().Be(2, "block align = channels * bytes per sample");
    ReadInt16(bytes, 34).Should().Be(16, "default bit depth");

    // Body should be the PCM payload verbatim, in order.
    bytes.AsSpan(44).ToArray().Should().Equal(pcm);
}
71+
72+
[TestMethod]
public void AudioResult_WriteWavTo_UsesExplicitSampleRateOverParse()
{
    var result = new AudioResult
    {
        AudioData = new byte[] { 0, 0 },
        MimeType = "audio/L16;codec=pcm;rate=24000",
    };

    using var ms = new MemoryStream();
    result.WriteWavTo(ms, sampleRate: 48000);

    var header = ms.ToArray();

    // Read the little-endian header fields explicitly rather than via host-endian BitConverter.
    System.Buffers.Binary.BinaryPrimitives.ReadInt32LittleEndian(header.AsSpan(24, 4))
        .Should().Be(48000, "an explicit sample rate overrides the one parsed from the MIME type");
    System.Buffers.Binary.BinaryPrimitives.ReadInt32LittleEndian(header.AsSpan(28, 4))
        .Should().Be(96000, "byte rate must be derived from the overridden rate (mono, 16-bit)");
}
86+
87+
[TestMethod]
public void AudioResult_WriteWavTo_ThrowsWhenNoAudio()
{
    // The guard rejects both a missing buffer and an empty one, so cover both.
    foreach (var audio in new[] { null, Array.Empty<byte>() })
    {
        var result = new AudioResult { AudioData = audio };
        using var ms = new MemoryStream();

        Action act = () => result.WriteWavTo(ms);

        act.Should().Throw<InvalidOperationException>();
        ms.Length.Should().Be(0, "no header bytes may be written when there is no audio");
    }
}
4296
}

0 commit comments

Comments
 (0)