capsule AI-native Unix-like composition layer

src/server/internal/orchestrator/visual_input_test.go

4,823 bytes · 175 lines · capsule://quake0day/[email protected] raw on github

package orchestrator

import (
	"encoding/base64"
	"errors"
	"testing"
	"time"

	"github.com/cyberverse/server/internal/character"
	"github.com/cyberverse/server/internal/config"
	"github.com/cyberverse/server/internal/ws"
)

// newVisualInputTestOrchestratorWithConfig wires up an Orchestrator together
// with one session ("session-visual") in the given pipeline mode, using
// visualCfg as the pipeline's VisualInput settings.
//
// When provider is non-empty, a character store rooted in a per-test temp
// directory is created and seeded with a single "omni" character whose
// VoiceProvider is provider, and the session is created with that character's
// ID. When provider is empty, the store stays nil and the session is created
// with an empty character ID. Any setup error aborts the test via t.Fatal.
func newVisualInputTestOrchestratorWithConfig(t *testing.T, mode PipelineMode, provider string, visualCfg config.VisualInputConfig) (*Orchestrator, *Session) {
	t.Helper()

	sessions := NewSessionManager(4)

	// Both remain at their zero values unless a voice provider was requested.
	var (
		store       *character.Store
		characterID string
	)
	if provider != "" {
		var err error
		if store, err = character.NewStore(t.TempDir()); err != nil {
			t.Fatal(err)
		}
		persona, err := store.Create(&character.Character{
			Name:          "Visual",
			Mode:          "omni",
			VoiceProvider: provider,
			VoiceType:     "Tina",
		})
		if err != nil {
			t.Fatal(err)
		}
		characterID = persona.ID
	}

	session, err := sessions.Create("session-visual", mode, characterID)
	if err != nil {
		t.Fatal(err)
	}

	pipelineCfg := config.PipelineConfig{VisualInput: visualCfg}
	orch := New(&idleVideoInferenceStub{}, ws.NewHub(), sessions, nil, store, pipelineCfg)
	return orch, session
}

func newVisualInputTestOrchestrator(t *testing.T, mode PipelineMode) (*Orchestrator, *Session) {
	t.Helper()
	enabled := true
	return newVisualInputTestOrchestratorWithConfig(t, mode, "", config.VisualInputConfig{
		Enabled:         &enabled,
		FrameIntervalMS: 1000,
		MaxWidth:        1280,
		MaxHeight:       720,
		MaxFrameBytes:   1024,
		MaxRecentFrames: 2,
		FrameTTLMS:      10000,
	})
}

func newVisualInputTestVoiceOrchestrator(t *testing.T, provider string) (*Orchestrator, *Session) {
	t.Helper()
	enabled := true
	return newVisualInputTestOrchestratorWithConfig(t, ModeOmni, provider, config.VisualInputConfig{
		Enabled:         &enabled,
		FrameIntervalMS: 1000,
		MaxWidth:        1280,
		MaxHeight:       720,
		MaxFrameBytes:   1024,
		MaxRecentFrames: 2,
		FrameTTLMS:      10000,
	})
}

// TestHandleVisualFrameStoresLatestForStandardSession checks that, once visual
// input has been started for the "screen" source on a ModeStandard session, a
// frame with MIME type "image/jpeg" handed to HandleVisualFrame is returned by
// the session's LatestVisualFrames with its source, MIME type and sequence
// number intact.
func TestHandleVisualFrameStoresLatestForStandardSession(t *testing.T) {
	const source = "screen"

	orch, session := newVisualInputTestOrchestrator(t, ModeStandard)

	startErr := orch.HandleVisualInputStart(session.ID, source)
	if startErr != nil {
		t.Fatal(startErr)
	}

	payload := base64.StdEncoding.EncodeToString([]byte{0xff, 0xd8, 0xff, 0x00})
	msg := ws.WSMessage{
		Source:      source,
		Mime:        "image/jpeg",
		Data:        payload,
		Width:       640,
		Height:      360,
		TimestampMS: 123,
		FrameSeq:    1,
	}
	if err := orch.HandleVisualFrame(session.ID, msg); err != nil {
		t.Fatal(err)
	}

	stored := session.LatestVisualFrames(time.Now(), time.Second)
	if n := len(stored); n != 1 {
		t.Fatalf("expected 1 frame, got %d", n)
	}

	got := stored[0]
	matches := got.Source == source && got.MimeType == "image/jpeg" && got.FrameSeq == 1
	if !matches {
		t.Fatalf("unexpected frame: %+v", got)
	}
}

// TestHandleVisualFrameStoresLatestForQwenOmniSession checks that an omni
// session whose character uses the "qwen_omni" voice provider accepts a
// "camera" frame: after HandleVisualInputStart and HandleVisualFrame succeed,
// LatestVisualFrames must return exactly that one frame.
func TestHandleVisualFrameStoresLatestForQwenOmniSession(t *testing.T) {
	orch, session := newVisualInputTestVoiceOrchestrator(t, "qwen_omni")
	sessionID := session.ID

	if startErr := orch.HandleVisualInputStart(sessionID, "camera"); startErr != nil {
		t.Fatal(startErr)
	}

	raw := []byte{0xff, 0xd8, 0xff, 0x00}
	incoming := ws.WSMessage{
		Source:      "camera",
		Mime:        "image/jpeg",
		Data:        base64.StdEncoding.EncodeToString(raw),
		Width:       640,
		Height:      360,
		TimestampMS: 123,
		FrameSeq:    1,
	}
	frameErr := orch.HandleVisualFrame(sessionID, incoming)
	if frameErr != nil {
		t.Fatal(frameErr)
	}

	latest := session.LatestVisualFrames(time.Now(), time.Second)
	if len(latest) != 1 {
		t.Fatalf("expected 1 frame, got %d", len(latest))
	}

	// Any single mismatching field is reported with the whole frame dumped.
	frame := latest[0]
	if !(frame.Source == "camera" && frame.MimeType == "image/jpeg" && frame.FrameSeq == 1) {
		t.Fatalf("unexpected frame: %+v", frame)
	}
}

// TestHandleVisualFrameRejectsDoubaoOmniSession checks that an omni session
// whose character uses the "doubao" voice provider refuses visual frames:
// HandleVisualFrame must return an error matching ErrVisualInputUnsupported.
func TestHandleVisualFrameRejectsDoubaoOmniSession(t *testing.T) {
	orch, session := newVisualInputTestVoiceOrchestrator(t, "doubao")

	payload := base64.StdEncoding.EncodeToString([]byte{0xff, 0xd8, 0xff, 0x00})
	msg := ws.WSMessage{
		Source: "camera",
		Mime:   "image/jpeg",
		Data:   payload,
		Width:  640,
		Height: 360,
	}

	err := orch.HandleVisualFrame(session.ID, msg)
	if errors.Is(err, ErrVisualInputUnsupported) {
		return
	}
	t.Fatalf("expected ErrVisualInputUnsupported, got %v", err)
}

func TestQwenOmniVisualInputConfigIsClamped(t *testing.T) {
	enabled := true
	orch, session := newVisualInputTestOrchestratorWithConfig(t, ModeOmni, "qwen_omni", config.VisualInputConfig{
		Enabled:         &enabled,
		FrameIntervalMS: 250,
		MaxWidth:        1280,
		MaxHeight:       720,
		MaxFrameBytes:   900 * 1024,
		MaxRecentFrames: 2,
		FrameTTLMS:      10000,
	})

	cfg, ok := orch.VisualInputConfigForSession(session)
	if !ok {
		t.Fatal("expected qwen_omni visual input support")
	}
	if cfg.FrameIntervalMS != 1000 {
		t.Fatalf("expected frame interval clamp to 1000ms, got %d", cfg.FrameIntervalMS)
	}
	if cfg.MaxFrameBytes != qwenOmniMaxVisualFrameBytes {
		t.Fatalf("expected max frame bytes clamp to %d, got %d", qwenOmniMaxVisualFrameBytes, cfg.MaxFrameBytes)
	}
}