1
1
package openai
2
2
3
3
import (
4
+ "bytes"
4
5
"context"
5
6
"encoding/base64"
6
7
"encoding/json"
7
8
"fmt"
9
+ "os"
8
10
"strings"
9
11
"sync"
10
12
"time"
11
13
14
+ "github.com/go-audio/wav"
15
+
12
16
"github.com/go-audio/audio"
13
17
"github.com/gofiber/fiber/v2"
14
18
"github.com/gofiber/websocket/v2"
@@ -488,21 +492,8 @@ func updateSession(session *Session, update *Session, cl *config.BackendConfigLo
488
492
}
489
493
490
494
const (
491
- minMicVolume = 450
492
- sendToVADDelay = time .Second
493
- )
494
-
495
- type VADState int
496
-
497
- const (
498
- StateSilence VADState = iota
499
- StateSpeaking
500
- )
501
-
502
- const (
503
- // tune these thresholds to taste
504
- SpeechFramesThreshold = 3 // must see X consecutive speech results to confirm "start"
505
- SilenceFramesThreshold = 5 // must see X consecutive silence results to confirm "end"
495
+ sendToVADDelay = 2 * time .Second
496
+ silenceThreshold = 2 * time .Second
506
497
)
507
498
508
499
// handleVAD is a goroutine that listens for audio data from the client,
@@ -534,14 +525,18 @@ func handleVAD(cfg *config.BackendConfig, evaluator *templates.Evaluator, sessio
534
525
copy (allAudio , session .InputAudioBuffer )
535
526
session .AudioBufferLock .Unlock ()
536
527
537
- // 2) If there's no audio at all, just continue
538
- if len (allAudio ) == 0 {
528
+ // 2) If there's no audio at all, or too little audio buffered so far, just continue
529
+ if len (allAudio ) == 0 || len ( allAudio ) < 32000 {
539
530
continue
540
531
}
541
532
542
533
// 3) Run VAD on the entire audio so far
543
534
segments , err := runVAD (vadContext , session , allAudio )
544
535
if err != nil {
536
+ if err .Error () == "unexpected speech end" {
537
+ log .Debug ().Msg ("VAD cancelled" )
538
+ continue
539
+ }
545
540
log .Error ().Msgf ("failed to process audio: %s" , err .Error ())
546
541
sendError (c , "processing_error" , "Failed to process audio: " + err .Error (), "" , "" )
547
542
// handle or log error, continue
@@ -550,7 +545,7 @@ func handleVAD(cfg *config.BackendConfig, evaluator *templates.Evaluator, sessio
550
545
551
546
segCount := len (segments )
552
547
553
- if len (segments ) == 0 && ! speaking && time .Since (timeOfLastNewSeg ) > 1 * time . Second {
548
+ if len (segments ) == 0 && ! speaking && time .Since (timeOfLastNewSeg ) > silenceThreshold {
554
549
// no speech detected, and we haven't seen a new segment for longer than silenceThreshold
555
550
// clean up input
556
551
session .AudioBufferLock .Lock ()
@@ -569,8 +564,11 @@ func handleVAD(cfg *config.BackendConfig, evaluator *templates.Evaluator, sessio
569
564
}
570
565
571
566
// 5) If speaking, but we haven't seen a new segment for longer than sendToVADDelay => finalize
572
- if speaking && time .Since (timeOfLastNewSeg ) > 1 * time . Second {
567
+ if speaking && time .Since (timeOfLastNewSeg ) > sendToVADDelay {
573
568
log .Debug ().Msgf ("Detected end of speech segment" )
569
+ session .AudioBufferLock .Lock ()
570
+ session .InputAudioBuffer = nil
571
+ session .AudioBufferLock .Unlock ()
574
572
// user has presumably stopped talking
575
573
commitUtterance (allAudio , cfg , evaluator , session , conv , c )
576
574
// reset state
@@ -608,18 +606,38 @@ func commitUtterance(utt []byte, cfg *config.BackendConfig, evaluator *templates
608
606
Item : item ,
609
607
})
610
608
611
- // Optionally trigger the response generation
609
+ // save chunk to disk
610
+ f , err := os .CreateTemp ("" , "audio-*.wav" )
611
+ if err != nil {
612
+ log .Error ().Msgf ("failed to create temp file: %s" , err .Error ())
613
+ return
614
+ }
615
+ defer f .Close ()
616
+ //defer os.Remove(f.Name())
617
+ log .Debug ().Msgf ("Writing to %s\n " , f .Name ())
618
+
619
+ f .Write (utt )
620
+ f .Sync ()
621
+
622
+ // trigger the response generation
612
623
generateResponse (cfg , evaluator , session , conv , ResponseCreate {}, c , websocket .TextMessage )
613
624
}
614
625
615
- // runVAD is a helper that calls your model's VAD method, returning
626
+ // runVAD is a helper that calls the model's VAD method, returning
616
627
// the VAD segments it reports for the given audio chunk, or an error
617
628
func runVAD (ctx context.Context , session * Session , chunk []byte ) ([]* proto.VADSegment , error ) {
618
629
619
630
adata := sound .BytesToInt16sLE (chunk )
620
631
621
632
// Resample from 24kHz to 16kHz
622
- // adata = sound.ResampleInt16(adata, 24000, 16000)
633
+ adata = sound .ResampleInt16 (adata , 24000 , 16000 )
634
+
635
+ dec := wav .NewDecoder (bytes .NewReader (chunk ))
636
+ dur , err := dec .Duration ()
637
+ if err != nil {
638
+ fmt .Printf ("failed to get duration: %s\n " , err )
639
+ }
640
+ fmt .Printf ("duration: %s\n " , dur )
623
641
624
642
soundIntBuffer := & audio.IntBuffer {
625
643
Format : & audio.Format {SampleRate : 16000 , NumChannels : 1 },
0 commit comments