capsule AI-native Unix-like composition layer

src/server/internal/inference/interfaces.go

2,794 bytes · 103 lines · capsule://quake0day/[email protected] raw on github

package inference

import (
	"context"

	pb "github.com/cyberverse/server/internal/pb"
)

// ImageFrame carries one image payload together with descriptive metadata.
// It is attached to chat messages (ChatMessage.Images) and to VoiceLLM input
// events (VoiceLLMInputEvent.Image).
//
// Field notes:
//   - Data holds the image bytes; MimeType names their format.
//   - Width and Height are the image dimensions (presumably in pixels —
//     TODO confirm).
//   - Source is a free-form label for where the frame came from; the set of
//     accepted values is not defined in this file.
//   - TimestampMS is presumably a capture time in milliseconds, going by its
//     name; the epoch/reference point is not stated here — verify with the
//     producer.
//   - FrameSeq is presumably a sequence number for ordering frames — confirm
//     against the producer.
type ImageFrame struct {
	Data        []byte
	MimeType    string
	Width       int32
	Height      int32
	Source      string
	TimestampMS int64
	FrameSeq    int64
}

// ChatMessage represents a single message in a conversation.
//
// Role identifies the speaker of the message and Content is its text. The
// accepted Role values are not defined in this file (presumably the usual
// "system"/"user"/"assistant" set — TODO confirm). Images optionally attaches
// image frames to the message; a nil or empty slice means no images.
type ChatMessage struct {
	Role    string
	Content string
	Images  []ImageFrame
}

// LLMConfig holds parameters for LLM generation. It is passed by value to
// InferenceService.GenerateLLMStream.
//
// Model and Provider select what performs the generation; Temperature and
// MaxTokens are sampling/length parameters. How zero values are interpreted
// (e.g. whether an empty Provider or MaxTokens == 0 selects a backend default)
// is not specified in this file — verify against the implementation.
type LLMConfig struct {
	Model       string
	Temperature float32
	MaxTokens   int32
	Provider    string
}

// TTSConfig holds the parameters for text-to-speech synthesis. It is passed
// by value to InferenceService.SynthesizeSpeechStream.
//
// Provider, Voice, SpeakingStyle and Language describe how the speech should
// be produced; SessionID associates the request with a session. The accepted
// values for each field, and the meaning of empty strings, are not defined in
// this file — confirm against the implementation.
type TTSConfig struct {
	Provider      string
	Voice         string
	SpeakingStyle string
	Language      string
	SessionID     string
}

// ASRConfig holds the parameters for speech recognition. It is passed by
// value to InferenceService.TranscribeStream.
//
// Provider and Language select the recognizer and the expected language;
// SessionID associates the request with a session. The accepted values and
// the meaning of empty strings are not defined in this file — confirm against
// the implementation.
type ASRConfig struct {
	Provider  string
	Language  string
	SessionID string
}

// VoiceLLMDialogContextItem is one entry of the dialog context supplied to a
// VoiceLLM session through VoiceLLMSessionConfig.DialogContext.
//
// Role identifies who produced the entry and Text is what was said. Timestamp
// records when; its unit and reference point are not stated in this file —
// TODO confirm with the code that populates it.
type VoiceLLMDialogContextItem struct {
	Role      string
	Text      string
	Timestamp int64
}

// VoiceLLMSessionConfig holds the per-session character configuration for
// VoiceLLM. It is passed by value to InferenceService.CheckVoice and
// InferenceService.ConverseStream.
//
// SessionID and Provider identify the session and the backend. CharacterID,
// CharacterDir, SystemPrompt, BotName, SpeakingStyle and WelcomeMessage
// describe the character; Voice selects the voice to use. DialogContext
// carries dialog entries to hand to the session (presumably prior history to
// resume from — confirm against the caller). Which fields are required is not
// specified in this file.
type VoiceLLMSessionConfig struct {
	SessionID      string
	Provider       string
	CharacterID    string
	CharacterDir   string
	SystemPrompt   string
	Voice          string // maps to voice_type / speaker
	BotName        string
	SpeakingStyle  string
	WelcomeMessage string
	DialogContext  []VoiceLLMDialogContextItem
}

// VoiceLLMInputEvent is one input item for a VoiceLLM conversation stream.
// Exactly one of Audio, Text, or Image should be set.
//
// The type itself does not enforce that rule: it is a plain struct, so
// producers must leave the other two fields at their zero values (nil Audio,
// empty Text, nil Image). Events are delivered to the service over the input
// channel of InferenceService.ConverseStream.
type VoiceLLMInputEvent struct {
	Audio []byte
	Text  string
	Image *ImageFrame
}

// InferenceService defines the interface for communicating with the Python
// inference layer. Using an interface allows tests to inject mocks.
//
// Every method except Close takes a context.Context as its first argument.
// The streaming methods return a receive-only result channel paired with a
// receive-only error channel.
//
// NOTE(review): this file does not state when those channels are closed, how
// many errors may be delivered, or whether cancelling ctx stops the stream —
// confirm against the concrete implementation before relying on any of it.
type InferenceService interface {
	// HealthCheck returns an error describing the outcome of a health probe;
	// presumably nil means the inference layer is reachable — confirm against
	// the implementation.
	HealthCheck(ctx context.Context) error
	// AvatarInfo returns the avatar description reported by the inference
	// layer as a *pb.AvatarInfo.
	AvatarInfo(ctx context.Context) (*pb.AvatarInfo, error)

	// Avatar

	// SetAvatar supplies avatar image bytes for the given session. format
	// names the encoding of imageData; the accepted values are not defined in
	// this file.
	SetAvatar(ctx context.Context, sessionID string, imageData []byte, format string) error
	// GenerateAvatarStream consumes audio chunks from audioCh and yields video
	// chunks on the returned channel.
	GenerateAvatarStream(ctx context.Context, audioCh <-chan *pb.AudioChunk) (<-chan *pb.VideoChunk, <-chan error)
	// GenerateAvatar is the variant of GenerateAvatarStream that takes its
	// audio up front as a slice instead of a channel; the video output is
	// still delivered over a channel.
	GenerateAvatar(ctx context.Context, audioChunks []*pb.AudioChunk) (<-chan *pb.VideoChunk, <-chan error)

	// LLM

	// GenerateLLMStream runs LLM generation over messages for the given
	// session using config, yielding output chunks on the returned channel.
	GenerateLLMStream(ctx context.Context, sessionID string, messages []ChatMessage, config LLMConfig) (<-chan *pb.LLMChunk, <-chan error)

	// TTS

	// SynthesizeSpeechStream consumes text pieces from textCh and yields
	// synthesized audio chunks, using the voice settings in config.
	SynthesizeSpeechStream(ctx context.Context, textCh <-chan string, config TTSConfig) (<-chan *pb.AudioChunk, <-chan error)

	// ASR

	// TranscribeStream consumes raw audio buffers from audioCh and yields
	// transcript events, using the recognizer settings in config. The expected
	// audio encoding is not stated in this file.
	TranscribeStream(ctx context.Context, audioCh <-chan []byte, config ASRConfig) (<-chan *pb.TranscriptEvent, <-chan error)

	// VoiceLLM

	// CheckVoice checks the voice-related settings in config and returns a
	// string result. NOTE(review): the meaning of the returned string is not
	// documented here (possibly a resolved voice name or a status message) —
	// confirm against the implementation.
	CheckVoice(ctx context.Context, config VoiceLLMSessionConfig) (string, error)
	// ConverseStream runs a VoiceLLM conversation: it consumes input events
	// from inputCh and yields outputs on the returned channel, configured by
	// config.
	ConverseStream(ctx context.Context, inputCh <-chan VoiceLLMInputEvent, config VoiceLLMSessionConfig) (<-chan *pb.VoiceLLMOutput, <-chan error)
	// Interrupt requests interruption of the given session; presumably it
	// cuts off in-progress output — confirm the exact scope against the
	// implementation.
	Interrupt(ctx context.Context, sessionID string) error

	// Close releases the resources held by the service. It takes no context.
	Close() error
}