# src/infra/cyberverse_config.example.yaml
# 8,270 bytes · 226 lines · capsule://quake0day/[email protected]
# raw on github
# CyberVerse example configuration.
# Copy this file to cyberverse_config.yaml and edit the local copy:
# cp infra/cyberverse_config.example.yaml cyberverse_config.yaml
#
# Keep secrets in .env. Values such as ${DASHSCOPE_API_KEY} are expanded from
# environment variables after .env is loaded.
# Server bind address, HTTP/gRPC ports, and CORS origins.
server:
  host: "0.0.0.0"
  http_port: 8080
  grpc_port: 50051
  # NOTE(review): "*" presumably allows requests from any origin; narrow this
  # list for non-local deployments.
  cors_origins: ["*"]
# LiveKit connection settings. Every value is a ${VAR} placeholder, so the
# real credentials stay out of this file.
livekit:
  url: "${LIVEKIT_URL}"
  api_key: "${LIVEKIT_API_KEY}"
  api_secret: "${LIVEKIT_API_SECRET}"
# Warm up inference components to reduce first-frame latency.
# NOTE(review): `timeout_s` is assumed to belong to `distributed`; confirm
# against the config loader.
warmup:
  enabled: true
  distributed:
    enabled: true  # Enable warmup in multi-GPU mode.
    timeout_s: 30  # Timeout in seconds to prevent hangs.
# Inference plugin configuration. Each category (avatar, omni, llm, ...)
# names a `default` provider and holds one sub-mapping per provider.
inference:
  avatar:
    # Set to false for pure voice sessions. Cached idle videos are still
    # served, but realtime speaking video and new idle video generation stop.
    enabled: true
    # Select the avatar backend initialized by the inference process.
    default: "flash_head"
    runtime:
      cuda_visible_devices: 0  # Use "0,1" for two GPUs.
      world_size: 1  # Match the number of visible GPUs.
    flash_head:
      plugin_class: "inference.plugins.avatar.flash_head_plugin.FlashHeadAvatarPlugin"
      models_dir: "models"
      checkpoint_dir: "./checkpoints/SoulX-FlashHead-1_3B"
      wav2vec_dir: "./checkpoints/wav2vec2-base-960h"
      model_type: "pro"
      device: "cuda:0"
      seed: 9999
      compile_model: true
      compile_vae: true
      dist_worker_main_thread: true
      infer_params:
        frame_num: 33
        motion_frames_latent_num: 2
        tgt_fps: 20
        sample_rate: 16000
        sample_shift: 5
        color_correction_strength: 1.0
        cached_audio_duration: 8
        num_heads: 12
        height: 512
        width: 512
    live_act:
      plugin_class: "inference.plugins.avatar.live_act_plugin.LiveActAvatarPlugin"
      models_dir: "models/SoulX-LiveAct"
      ckpt_dir: "./checkpoints/LiveAct"
      wav2vec_dir: "./checkpoints/chinese-wav2vec2-base"
      seed: 42
      t5_cpu: false
      fp8_kv_cache: false
      offload_cache: false
      block_offload: false
      mean_memory: false
      compile_wan_model: false
      compile_vae_decode: false
      dist_worker_main_thread: true
      default_prompt: "一个人在说话"
      infer_params:
        size: "320*480"
        fps: 20
        audio_cfg: 1.0

  omni:
    # Real realtime omni model providers only.
    default: "qwen_omni"
    doubao:
      plugin_class: "inference.plugins.voice_llm.doubao_realtime.DoubaoRealtimePlugin"
      access_token: "${DOUBAO_ACCESS_TOKEN}"
      app_id: "${DOUBAO_APP_ID}"
      voice_type: "zh_female_default"
      # Avoid DialogAudioIdleTimeoutError before FlashHead's first chunk arrives.
      end_smooth_window_ms: 6000
    qwen_omni:
      plugin_class: "inference.plugins.voice_llm.qwen_omni_realtime.QwenOmniRealtimePlugin"
      api_key: "${DASHSCOPE_API_KEY}"
      model: "qwen3.5-omni-flash-realtime"
      voice: "Tina"
      input_sample_rate: 16000
      output_sample_rate: 24000
      vad_type: "semantic_vad"
      vad_threshold: 0.5
      vad_silence_duration_ms: 800

  persona:
    # PersonaAgent is the orchestration layer. It wraps a concrete omni
    # model provider and coordinates background tasks.
    persona:
      plugin_class: "inference.plugins.voice_llm.persona_agent.PersonaAgentPlugin"
      # Use a provider whose adapter exposes native hidden tool calls.
      # The qwen_omni adapter supports PersonaAgent task tools in this MVP.
      model_provider: "qwen_omni"
      # Defaults to data/tasks/langgraph_checkpoints.db when empty.
      checkpoint_db_path: ""
      llm:
        # PersonaAgent owns the local Supervisor LangGraph and sub-agent
        # runtime directly. Defaults to inference.llm.default; set this only
        # when background tasks should use a different text LLM.
        provider: "qwen"
      tools:
        zhihu:
          access_secret: "${ZHIHU_ACCESS_SECRET}"
          api_base: "https://developer.zhihu.com"
          timeout_seconds: 30
          zhida_model: "zhida-fast-1p5"
      # NOTE(review): assumed to be a PersonaAgent-level setting rather than a
      # child of `tools`; confirm against the plugin's config schema.
      max_agent_iterations: 100

  llm:
    # Used by standard mode and by PersonaAgent's local sub-agent runtime.
    # The global system prompt is owned by the Go orchestrator; do not
    # duplicate persona prompts here.
    default: "qwen"
    qwen:
      plugin_class: "inference.plugins.llm.qwen_plugin.QwenLLMPlugin"
      api_key: "${DASHSCOPE_API_KEY}"
      model: "qwen3.6-plus"
      temperature: 0.7
      extra_body:
        enable_thinking: false
    openai:
      plugin_class: "inference.plugins.llm.openai_plugin.OpenAILLMPlugin"
      api_key: "${OPENAI_API_KEY}"
      model: "gpt-4o"
      temperature: 0.7

  embedding:
    # Used by local character RAG indexes.
    default: "qwen"
    qwen:
      api_key: "${DASHSCOPE_API_KEY}"
      model: "text-embedding-v4"
    openai:
      api_key: "${OPENAI_API_KEY}"
      model: "text-embedding-3-small"

  tts:
    # Used by standard mode after the LLM response is generated.
    default: "qwen"
    qwen:
      plugin_class: "inference.plugins.tts.qwen_tts_plugin.QwenTTSPlugin"
      api_key: "${DASHSCOPE_API_KEY}"
      model: "qwen3-tts-flash-realtime"
      voice: "Momo"
      sample_rate: 24000
      target_sample_rate: 16000
    openai:
      plugin_class: "inference.plugins.tts.openai_tts_plugin.OpenAITTSPlugin"
      api_key: "${OPENAI_API_KEY}"
      model: "tts-1"
      voice: "nova"

  asr:
    # Used by standard mode to transcribe microphone input.
    default: "qwen"
    qwen:
      plugin_class: "inference.plugins.asr.qwen_asr_plugin.QwenASRPlugin"
      api_key: "${DASHSCOPE_API_KEY}"
      model: "qwen3-asr-flash-realtime"
      language: "auto"
      sample_rate: 16000
      vad_threshold: 0.8
      vad_silence_duration_ms: 600
    whisper:
      plugin_class: "inference.plugins.asr.whisper_plugin.WhisperASRPlugin"
      model_size: "base"
      language: "auto"
      device: "cpu"
# Session limits. The `_s` suffix presumably denotes seconds (300 s = 5 min,
# 3600 s = 1 h) — confirm against the loader.
session:
  max_concurrent: 4
  idle_timeout_s: 300
  max_duration_s: 3600
# Pipeline mode, streaming transport, RAG, visual input, and TURN settings.
# NOTE(review): `rag`, `visual_input`, the `turn_*` keys, and `ice_public_ip`
# are assumed to be children of `pipeline`; confirm against the config loader.
pipeline:
  default_mode: "omni"
  streaming_mode: "direct"  # "direct" = P2P WebRTC, "livekit" = LiveKit SFU.
  rag:
    enabled: true
    top_k: 5
    min_score: 0.25
    max_context_chars: 4500
    chunk_chars: 900
    chunk_overlap_chars: 120
  visual_input:
    enabled: true
    frame_interval_ms: 1000
    max_width: 1280
    max_height: 720
    jpeg_quality: 0.78
    max_frame_bytes: 524288  # 512 KiB
    ws_max_message_bytes: 1048576  # 1 MiB
    max_recent_frames: 2
    frame_ttl_ms: 10000
  turn_enabled: true
  # TCP port for embedded TURN server, useful through SSH tunnels.
  turn_port: 8443
  turn_realm: "cyberverse"
  turn_username: "cyberverse"
  turn_password: "${TURN_PASSWORD}"
  # REQUIRED for remote Direct WebRTC: public hostname or IP (not 127.0.0.1).
  # Used in turn:HOST:PORT sent to browsers.
  ice_public_ip: ""
# Per-turn recording output.
recording:
  # Enables per-turn MP4, raw WAV, and transcript files.
  enabled: true
  # Used when a session has no character-specific recording directory.
  # Character sessions are stored under data/characters/.../sessions/...
  output_dir: "./recordings"
  # x264 quality for recorded MP4 files. Lower means higher quality/larger files.
  crf: 23
# Address of the inference gRPC endpoint. The port in this example matches
# server.grpc_port (50051).
inference_grpc:
  addr: "localhost:50051"