From 3555794a78aa643bc1d2476a297a3a34c90b6f44 Mon Sep 17 00:00:00 2001 From: derspotter Date: Fri, 20 Feb 2026 20:54:53 +0100 Subject: [PATCH] fix: support Opus/OGG output format - Move output_format from JSON body to query parameter (per ElevenLabs API spec) - Add .ogg/.opus extension inference -> opus_48000_64 - Change Accept header to */* to support non-MP3 formats - Fixes both StreamTTS and ConvertTTS endpoints --- cmd/speak.go | 2 ++ internal/elevenlabs/client.go | 18 ++++++++++++++---- 2 files changed, 16 insertions(+), 4 deletions(-) diff --git a/cmd/speak.go b/cmd/speak.go index e043407..00c8682 100644 --- a/cmd/speak.go +++ b/cmd/speak.go @@ -535,6 +535,8 @@ func inferFormatFromExt(path string) string { return "mp3_44100_128" case ".wav", ".wave": return "pcm_44100" + case ".ogg", ".opus": + return "opus_48000_64" default: return "" } diff --git a/internal/elevenlabs/client.go b/internal/elevenlabs/client.go index b8524d5..6143749 100644 --- a/internal/elevenlabs/client.go +++ b/internal/elevenlabs/client.go @@ -213,11 +213,15 @@ func (c *Client) StreamTTS(ctx context.Context, voiceID string, payload TTSReque return nil, err } u.Path = path.Join(u.Path, "/v1/text-to-speech", voiceID, "stream") + q := u.Query() if latency > 0 { - q := u.Query() q.Set("optimize_streaming_latency", fmt.Sprint(latency)) - u.RawQuery = q.Encode() } + if payload.OutputFormat != "" { + q.Set("output_format", payload.OutputFormat) + payload.OutputFormat = "" // don't send in body + } + u.RawQuery = q.Encode() bodyBytes, err := json.Marshal(payload) if err != nil { @@ -229,7 +233,7 @@ func (c *Client) StreamTTS(ctx context.Context, voiceID string, payload TTSReque return nil, err } req.Header.Set("Content-Type", "application/json") - req.Header.Set("Accept", "audio/mpeg") + req.Header.Set("Accept", "*/*") req.Header.Set("xi-api-key", c.apiKey) resp, err := c.httpClient.Do(req) @@ -253,6 +257,12 @@ func (c *Client) ConvertTTS(ctx context.Context, voiceID string, payload TTSRequ return nil, err } u.Path = path.Join(u.Path, "/v1/text-to-speech", voiceID) + q := u.Query() + if payload.OutputFormat != "" { + q.Set("output_format", payload.OutputFormat) + payload.OutputFormat = "" + } + u.RawQuery = q.Encode() bodyBytes, err := json.Marshal(payload) if err != nil { @@ -264,7 +274,7 @@ func (c *Client) ConvertTTS(ctx context.Context, voiceID string, payload TTSRequ return nil, err } req.Header.Set("Content-Type", "application/json") - req.Header.Set("Accept", "audio/mpeg") + req.Header.Set("Accept", "*/*") req.Header.Set("xi-api-key", c.apiKey) resp, err := c.httpClient.Do(req)