src/server/internal/inference/interfaces.go
2,794 bytes · 103 lines · capsule://quake0day/[email protected]
raw on github
package inference
import (
"context"
pb "github.com/cyberverse/server/internal/pb"
)
// ImageFrame carries one image together with descriptive metadata.
// It is referenced by ChatMessage.Images and VoiceLLMInputEvent.Image.
type ImageFrame struct {
	Data          []byte // image payload bytes; encoding presumably given by MimeType
	MimeType      string // presumably a MIME type such as "image/jpeg" — confirm with producers
	Width, Height int32  // frame dimensions, presumably in pixels
	Source        string // label for where the frame came from (values are defined by callers)
	TimestampMS   int64  // time of the frame in milliseconds per the name; epoch not specified here
	FrameSeq      int64  // sequence number of the frame (numbering scheme is defined by producers)
}
// ChatMessage is one entry of the conversation history handed to
// GenerateLLMStream.
type ChatMessage struct {
	Role    string       // speaker role for this message (accepted values are defined by the backend)
	Content string       // text body of the message
	Images  []ImageFrame // image frames attached to the message, if any
}
// LLMConfig bundles the generation parameters passed to GenerateLLMStream.
type LLMConfig struct {
	Model       string  // model identifier
	Temperature float32 // sampling temperature
	MaxTokens   int32   // token limit for the generation (enforced by the backend, not here)
	Provider    string  // selects which LLM backend/provider handles the request
}
// TTSConfig bundles the parameters passed to SynthesizeSpeechStream.
type TTSConfig struct {
	Provider      string // selects which TTS backend/provider handles the request
	Voice         string // voice identifier understood by the provider
	SpeakingStyle string // speaking-style hint (accepted values are provider-specific)
	Language      string // language selector (format is provider-specific)
	SessionID     string // session this synthesis request belongs to
}
// ASRConfig bundles the parameters passed to TranscribeStream.
type ASRConfig struct {
	Provider  string // selects which ASR backend/provider handles the request
	Language  string // language selector (format is provider-specific)
	SessionID string // session this transcription request belongs to
}
// VoiceLLMDialogContextItem is one prior dialog entry supplied to a VoiceLLM
// session through VoiceLLMSessionConfig.DialogContext.
type VoiceLLMDialogContextItem struct {
	Role      string // who produced the entry (accepted values are defined by the backend)
	Text      string // text of the entry
	Timestamp int64  // time of the entry; unit/epoch not specified here — confirm with producers
}

// VoiceLLMSessionConfig holds per-session character config for VoiceLLM.
// It is the config argument of CheckVoice and ConverseStream.
type VoiceLLMSessionConfig struct {
	SessionID      string // session this configuration applies to
	Provider       string // selects which VoiceLLM backend/provider handles the session
	CharacterID    string // identifier of the character
	CharacterDir   string // directory associated with the character (contents not specified here)
	SystemPrompt   string // system prompt for the character
	Voice          string // maps to voice_type / speaker
	BotName        string // name the bot goes by
	SpeakingStyle  string // speaking-style hint (accepted values are provider-specific)
	WelcomeMessage string // welcome message for the session

	// DialogContext is earlier dialog supplied as context for the session.
	DialogContext []VoiceLLMDialogContextItem
}
// VoiceLLMInputEvent is a single item sent on the input channel of
// ConverseStream. Callers should populate exactly one of Audio, Text, or
// Image per event; the type itself does not enforce this.
type VoiceLLMInputEvent struct {
	Audio []byte      // audio payload (format is defined by the backend)
	Text  string      // text input
	Image *ImageFrame // image input; nil when the event carries no image
}
// InferenceService is the contract for talking to the Python inference
// layer. Because it is an interface, tests can substitute mock
// implementations.
//
// The streaming methods return two receive-only channels: one for results
// and one for errors. Their close/ordering semantics are defined by the
// implementation, not by this declaration.
type InferenceService interface {
	// HealthCheck probes the inference layer and returns an error on failure.
	HealthCheck(ctx context.Context) error
	// AvatarInfo fetches the avatar description as a pb.AvatarInfo.
	AvatarInfo(ctx context.Context) (*pb.AvatarInfo, error)

	// Avatar

	// SetAvatar supplies image data, tagged with its format, for sessionID.
	SetAvatar(ctx context.Context, sessionID string, imageData []byte, format string) error
	// GenerateAvatarStream consumes audio chunks from audioCh and yields
	// video chunks.
	GenerateAvatarStream(ctx context.Context, audioCh <-chan *pb.AudioChunk) (<-chan *pb.VideoChunk, <-chan error)
	// GenerateAvatar takes the audio chunks up front as a slice and yields
	// video chunks.
	GenerateAvatar(ctx context.Context, audioChunks []*pb.AudioChunk) (<-chan *pb.VideoChunk, <-chan error)

	// LLM

	// GenerateLLMStream yields LLM chunks for the given messages and config.
	GenerateLLMStream(ctx context.Context, sessionID string, messages []ChatMessage, config LLMConfig) (<-chan *pb.LLMChunk, <-chan error)

	// TTS

	// SynthesizeSpeechStream consumes text from textCh and yields audio chunks.
	SynthesizeSpeechStream(ctx context.Context, textCh <-chan string, config TTSConfig) (<-chan *pb.AudioChunk, <-chan error)

	// ASR

	// TranscribeStream consumes audio bytes from audioCh and yields
	// transcript events.
	TranscribeStream(ctx context.Context, audioCh <-chan []byte, config ASRConfig) (<-chan *pb.TranscriptEvent, <-chan error)

	// VoiceLLM

	// CheckVoice takes a session config and returns a string result; the
	// meaning of that string is defined by the implementation.
	CheckVoice(ctx context.Context, config VoiceLLMSessionConfig) (string, error)
	// ConverseStream consumes input events from inputCh and yields VoiceLLM
	// outputs for the session described by config.
	ConverseStream(ctx context.Context, inputCh <-chan VoiceLLMInputEvent, config VoiceLLMSessionConfig) (<-chan *pb.VoiceLLMOutput, <-chan error)
	// Interrupt requests an interruption for sessionID.
	Interrupt(ctx context.Context, sessionID string) error
	// Close shuts the service down; what is released is up to the
	// implementation.
	Close() error
}