src/server/internal/orchestrator/visual_input_test.go
4,823 bytes · 175 lines · capsule://quake0day/[email protected]
raw on github
package orchestrator
import (
"encoding/base64"
"errors"
"testing"
"time"
"github.com/cyberverse/server/internal/character"
"github.com/cyberverse/server/internal/config"
"github.com/cyberverse/server/internal/ws"
)
// newVisualInputTestOrchestratorWithConfig wires up an Orchestrator plus a
// single session with ID "session-visual" for visual-input tests, using
// visualCfg as the VisualInput section of the pipeline configuration.
//
// If provider is non-empty, a character store rooted at t.TempDir() is created
// and a character with that VoiceProvider is added to it; the session is then
// created with that character's ID. If provider is empty, no store is created
// (a nil *character.Store is handed to New) and the session is created with an
// empty character ID.
//
// Any error during setup aborts the calling test via t.Fatal.
func newVisualInputTestOrchestratorWithConfig(t *testing.T, mode PipelineMode, provider string, visualCfg config.VisualInputConfig) (*Orchestrator, *Session) {
	t.Helper()

	sessions := NewSessionManager(4)

	// Both remain at their zero values unless a voice provider was requested.
	var (
		store       *character.Store
		characterID string
	)
	if provider != "" {
		created, err := character.NewStore(t.TempDir())
		if err != nil {
			t.Fatal(err)
		}
		store = created

		voiced, err := store.Create(&character.Character{
			Name:          "Visual",
			Mode:          "omni",
			VoiceProvider: provider,
			VoiceType:     "Tina",
		})
		if err != nil {
			t.Fatal(err)
		}
		characterID = voiced.ID
	}

	sess, err := sessions.Create("session-visual", mode, characterID)
	if err != nil {
		t.Fatal(err)
	}

	pipelineCfg := config.PipelineConfig{VisualInput: visualCfg}
	orchestrator := New(&idleVideoInferenceStub{}, ws.NewHub(), sessions, nil, store, pipelineCfg)
	return orchestrator, sess
}
// newVisualInputTestOrchestrator returns an Orchestrator and session for the
// given pipeline mode with visual input enabled. It passes an empty provider
// to newVisualInputTestOrchestratorWithConfig, so no character is created.
func newVisualInputTestOrchestrator(t *testing.T, mode PipelineMode) (*Orchestrator, *Session) {
	t.Helper()

	// Enabled is a pointer field, so it needs an addressable value.
	visualOn := true
	cfg := config.VisualInputConfig{
		Enabled:         &visualOn,
		FrameIntervalMS: 1000,
		MaxWidth:        1280,
		MaxHeight:       720,
		MaxFrameBytes:   1024,
		MaxRecentFrames: 2,
		FrameTTLMS:      10000,
	}
	return newVisualInputTestOrchestratorWithConfig(t, mode, "", cfg)
}
// newVisualInputTestVoiceOrchestrator returns an Orchestrator and a ModeOmni
// session with visual input enabled, forwarding provider as the voice provider
// to newVisualInputTestOrchestratorWithConfig.
func newVisualInputTestVoiceOrchestrator(t *testing.T, provider string) (*Orchestrator, *Session) {
	t.Helper()

	// Enabled is a pointer field, so it needs an addressable value.
	visualOn := true
	cfg := config.VisualInputConfig{
		Enabled:         &visualOn,
		FrameIntervalMS: 1000,
		MaxWidth:        1280,
		MaxHeight:       720,
		MaxFrameBytes:   1024,
		MaxRecentFrames: 2,
		FrameTTLMS:      10000,
	}
	return newVisualInputTestOrchestratorWithConfig(t, ModeOmni, provider, cfg)
}
// TestHandleVisualFrameStoresLatestForStandardSession starts "screen" visual
// input on a ModeStandard session, submits one frame labelled image/jpeg, and
// verifies that LatestVisualFrames returns exactly that frame with its source,
// MIME type and sequence number intact.
func TestHandleVisualFrameStoresLatestForStandardSession(t *testing.T) {
	orch, session := newVisualInputTestOrchestrator(t, ModeStandard)

	startErr := orch.HandleVisualInputStart(session.ID, "screen")
	if startErr != nil {
		t.Fatal(startErr)
	}

	// Frame data travels base64-encoded inside the websocket message.
	payload := base64.StdEncoding.EncodeToString([]byte{0xff, 0xd8, 0xff, 0x00})
	msg := ws.WSMessage{
		Source:      "screen",
		Mime:        "image/jpeg",
		Data:        payload,
		Width:       640,
		Height:      360,
		TimestampMS: 123,
		FrameSeq:    1,
	}
	if err := orch.HandleVisualFrame(session.ID, msg); err != nil {
		t.Fatal(err)
	}

	stored := session.LatestVisualFrames(time.Now(), time.Second)
	if got := len(stored); got != 1 {
		t.Fatalf("expected 1 frame, got %d", got)
	}

	matches := stored[0].Source == "screen" &&
		stored[0].MimeType == "image/jpeg" &&
		stored[0].FrameSeq == 1
	if !matches {
		t.Fatalf("unexpected frame: %+v", stored[0])
	}
}
// TestHandleVisualFrameStoresLatestForQwenOmniSession starts "camera" visual
// input on an omni session whose character uses the "qwen_omni" voice
// provider, submits one frame labelled image/jpeg, and verifies that
// LatestVisualFrames returns exactly that frame with its source, MIME type and
// sequence number intact.
func TestHandleVisualFrameStoresLatestForQwenOmniSession(t *testing.T) {
	orch, session := newVisualInputTestVoiceOrchestrator(t, "qwen_omni")

	startErr := orch.HandleVisualInputStart(session.ID, "camera")
	if startErr != nil {
		t.Fatal(startErr)
	}

	// Frame data travels base64-encoded inside the websocket message.
	payload := base64.StdEncoding.EncodeToString([]byte{0xff, 0xd8, 0xff, 0x00})
	msg := ws.WSMessage{
		Source:      "camera",
		Mime:        "image/jpeg",
		Data:        payload,
		Width:       640,
		Height:      360,
		TimestampMS: 123,
		FrameSeq:    1,
	}
	if err := orch.HandleVisualFrame(session.ID, msg); err != nil {
		t.Fatal(err)
	}

	stored := session.LatestVisualFrames(time.Now(), time.Second)
	if got := len(stored); got != 1 {
		t.Fatalf("expected 1 frame, got %d", got)
	}

	matches := stored[0].Source == "camera" &&
		stored[0].MimeType == "image/jpeg" &&
		stored[0].FrameSeq == 1
	if !matches {
		t.Fatalf("unexpected frame: %+v", stored[0])
	}
}
// TestHandleVisualFrameRejectsDoubaoOmniSession submits a camera frame to an
// omni session whose character uses the "doubao" voice provider and expects
// HandleVisualFrame to fail with an error matching ErrVisualInputUnsupported.
func TestHandleVisualFrameRejectsDoubaoOmniSession(t *testing.T) {
	orch, session := newVisualInputTestVoiceOrchestrator(t, "doubao")

	frame := ws.WSMessage{
		Source: "camera",
		Mime:   "image/jpeg",
		Data:   base64.StdEncoding.EncodeToString([]byte{0xff, 0xd8, 0xff, 0x00}),
		Width:  640,
		Height: 360,
	}
	got := orch.HandleVisualFrame(session.ID, frame)

	// errors.Is is used so a wrapped sentinel still counts as a match.
	if errors.Is(got, ErrVisualInputUnsupported) {
		return
	}
	t.Fatalf("expected ErrVisualInputUnsupported, got %v", got)
}
func TestQwenOmniVisualInputConfigIsClamped(t *testing.T) {
enabled := true
orch, session := newVisualInputTestOrchestratorWithConfig(t, ModeOmni, "qwen_omni", config.VisualInputConfig{
Enabled: &enabled,
FrameIntervalMS: 250,
MaxWidth: 1280,
MaxHeight: 720,
MaxFrameBytes: 900 * 1024,
MaxRecentFrames: 2,
FrameTTLMS: 10000,
})
cfg, ok := orch.VisualInputConfigForSession(session)
if !ok {
t.Fatal("expected qwen_omni visual input support")
}
if cfg.FrameIntervalMS != 1000 {
t.Fatalf("expected frame interval clamp to 1000ms, got %d", cfg.FrameIntervalMS)
}
if cfg.MaxFrameBytes != qwenOmniMaxVisualFrameBytes {
t.Fatalf("expected max frame bytes clamp to %d, got %d", qwenOmniMaxVisualFrameBytes, cfg.MaxFrameBytes)
}
}