Skip to content

Commit f272605

Browse files
committed
more robust approach
Signed-off-by: Ettore Di Giacinto <mudler@localai.io>
1 parent 9a09820 commit f272605

File tree

1 file changed

+40
-22
lines changed

1 file changed

+40
-22
lines changed

core/http/endpoints/openai/realtime.go

+40-22
Original file line numberDiff line numberDiff line change
@@ -1,14 +1,18 @@
11
package openai
22

33
import (
4+
"bytes"
45
"context"
56
"encoding/base64"
67
"encoding/json"
78
"fmt"
9+
"os"
810
"strings"
911
"sync"
1012
"time"
1113

14+
"github.com/go-audio/wav"
15+
1216
"github.com/go-audio/audio"
1317
"github.com/gofiber/fiber/v2"
1418
"github.com/gofiber/websocket/v2"
@@ -488,21 +492,8 @@ func updateSession(session *Session, update *Session, cl *config.BackendConfigLo
488492
}
489493

490494
const (
491-
minMicVolume = 450
492-
sendToVADDelay = time.Second
493-
)
494-
495-
type VADState int
496-
497-
const (
498-
StateSilence VADState = iota
499-
StateSpeaking
500-
)
501-
502-
const (
503-
// tune these thresholds to taste
504-
SpeechFramesThreshold = 3 // must see X consecutive speech results to confirm "start"
505-
SilenceFramesThreshold = 5 // must see X consecutive silence results to confirm "end"
495+
sendToVADDelay = 2 * time.Second
496+
silenceThreshold = 2 * time.Second
506497
)
507498

508499
// handleVAD is a goroutine that listens for audio data from the client,
@@ -534,14 +525,18 @@ func handleVAD(cfg *config.BackendConfig, evaluator *templates.Evaluator, sessio
534525
copy(allAudio, session.InputAudioBuffer)
535526
session.AudioBufferLock.Unlock()
536527

537-
// 2) If there's no audio at all, just continue
538-
if len(allAudio) == 0 {
528+
// 2) If there's no audio at all, or too little has been buffered so far, just continue
529+
if len(allAudio) == 0 || len(allAudio) < 32000 {
539530
continue
540531
}
541532

542533
// 3) Run VAD on the entire audio so far
543534
segments, err := runVAD(vadContext, session, allAudio)
544535
if err != nil {
536+
if err.Error() == "unexpected speech end" {
537+
log.Debug().Msg("VAD cancelled")
538+
continue
539+
}
545540
log.Error().Msgf("failed to process audio: %s", err.Error())
546541
sendError(c, "processing_error", "Failed to process audio: "+err.Error(), "", "")
547542
// handle or log error, continue
@@ -550,7 +545,7 @@ func handleVAD(cfg *config.BackendConfig, evaluator *templates.Evaluator, sessio
550545

551546
segCount := len(segments)
552547

553-
if len(segments) == 0 && !speaking && time.Since(timeOfLastNewSeg) > 1*time.Second {
548+
if len(segments) == 0 && !speaking && time.Since(timeOfLastNewSeg) > silenceThreshold {
554549
// no speech detected, and we haven't seen a new segment for longer than silenceThreshold
555550
// clean up input
556551
session.AudioBufferLock.Lock()
@@ -569,8 +564,11 @@ func handleVAD(cfg *config.BackendConfig, evaluator *templates.Evaluator, sessio
569564
}
570565

571566
// 5) If speaking, but we haven't seen a new segment for longer than sendToVADDelay => finalize
572-
if speaking && time.Since(timeOfLastNewSeg) > 1*time.Second {
567+
if speaking && time.Since(timeOfLastNewSeg) > sendToVADDelay {
573568
log.Debug().Msgf("Detected end of speech segment")
569+
session.AudioBufferLock.Lock()
570+
session.InputAudioBuffer = nil
571+
session.AudioBufferLock.Unlock()
574572
// user has presumably stopped talking
575573
commitUtterance(allAudio, cfg, evaluator, session, conv, c)
576574
// reset state
@@ -608,18 +606,38 @@ func commitUtterance(utt []byte, cfg *config.BackendConfig, evaluator *templates
608606
Item: item,
609607
})
610608

611-
// Optionally trigger the response generation
609+
// save chunk to disk
610+
f, err := os.CreateTemp("", "audio-*.wav")
611+
if err != nil {
612+
log.Error().Msgf("failed to create temp file: %s", err.Error())
613+
return
614+
}
615+
defer f.Close()
616+
//defer os.Remove(f.Name())
617+
log.Debug().Msgf("Writing to %s\n", f.Name())
618+
619+
f.Write(utt)
620+
f.Sync()
621+
622+
// trigger the response generation
612623
generateResponse(cfg, evaluator, session, conv, ResponseCreate{}, c, websocket.TextMessage)
613624
}
614625

615-
// runVAD is a helper that calls your model's VAD method, returning
626+
// runVAD is a helper that calls the model's VAD method, returning
616627
// the speech segments it detects in the given audio chunk (or an error)
617628
func runVAD(ctx context.Context, session *Session, chunk []byte) ([]*proto.VADSegment, error) {
618629

619630
adata := sound.BytesToInt16sLE(chunk)
620631

621632
// Resample from 24kHz to 16kHz
622-
// adata = sound.ResampleInt16(adata, 24000, 16000)
633+
adata = sound.ResampleInt16(adata, 24000, 16000)
634+
635+
dec := wav.NewDecoder(bytes.NewReader(chunk))
636+
dur, err := dec.Duration()
637+
if err != nil {
638+
fmt.Printf("failed to get duration: %s\n", err)
639+
}
640+
fmt.Printf("duration: %s\n", dur)
623641

624642
soundIntBuffer := &audio.IntBuffer{
625643
Format: &audio.Format{SampleRate: 16000, NumChannels: 1},

0 commit comments

Comments
 (0)