src/inference/services/llm_service.py

2,158 bytes · 60 lines · capsule://quake0day/[email protected] raw on github

import grpc

from inference.core.registry import PluginRegistry
from inference.generated import llm_pb2, llm_pb2_grpc
from inference.plugins.llm.base import LLMPlugin


class LLMGRPCService(llm_pb2_grpc.LLMServiceServicer):
    """gRPC servicer that streams LLM output produced by a registered plugin.

    Plugins are looked up in the ``PluginRegistry`` given to the constructor,
    either by explicit provider name (registry key ``llm.<provider>``) or,
    when no provider is named, through the registry's ``"llm"`` category.
    """

    def __init__(self, registry: PluginRegistry) -> None:
        """Keep a reference to the registry used for plugin lookups."""
        self.registry = registry

    def _get_plugin(self, provider: str = "") -> LLMPlugin:
        """Return the plugin for *provider*, or the ``"llm"`` category plugin.

        Args:
            provider: Provider name; surrounding whitespace is ignored. An
                empty (or whitespace-only) value selects
                ``registry.get_by_category("llm")`` instead.

        Returns:
            The plugin object handed back by the registry.

        Raises:
            RuntimeError: If no provider is named and the registry returns
                ``None`` for the ``"llm"`` category.

        Exceptions raised by ``registry.get`` for a named provider propagate
        unchanged; ``GenerateStream`` handles ``KeyError`` from this call.
        """
        provider = provider.strip()
        if provider:
            return self.registry.get(f"llm.{provider}")
        plugin = self.registry.get_by_category("llm")
        if plugin is None:
            raise RuntimeError("No LLM plugin initialized")
        return plugin

    @staticmethod
    def _message_to_dict(msg) -> dict:
        """Convert one request message into the dict passed to the plugin.

        The ``"images"`` key is only present when the message carries at
        least one image.
        """
        item = {"role": msg.role, "content": msg.content}
        images = [
            {
                "data": image.data,
                "mime_type": image.mime_type,
                "width": image.width,
                "height": image.height,
                "source": image.source,
                "timestamp_ms": image.timestamp_ms,
                "frame_seq": image.frame_seq,
            }
            for image in msg.images
        ]
        if images:
            item["images"] = images
        return item

    async def GenerateStream(self, request, context):
        """Stream ``LLMChunk`` messages generated from ``request.messages``.

        The RPC is aborted with ``INVALID_ARGUMENT`` when the plugin cannot
        be resolved, or when the request carries images but the selected
        plugin does not expose a truthy ``supports_images`` attribute.

        Yields:
            One ``llm_pb2.LLMChunk`` per chunk produced by
            ``plugin.generate_stream``.
        """
        provider = request.config.provider if request.config else ""
        try:
            plugin = self._get_plugin(provider)
        except (KeyError, RuntimeError) as exc:
            await context.abort(grpc.StatusCode.INVALID_ARGUMENT, str(exc))
            # abort() is expected to raise; the explicit return guarantees
            # ``plugin`` is never used unbound if it does not (e.g. a stubbed
            # context).
            return

        messages = [self._message_to_dict(msg) for msg in request.messages]
        has_images = any("images" in item for item in messages)

        if has_images and not getattr(plugin, "supports_images", False):
            # Report through the RPC status, like the plugin-lookup failure
            # above, instead of letting a bare exception escape the handler.
            await context.abort(
                grpc.StatusCode.INVALID_ARGUMENT,
                "Configured LLM plugin does not support image input",
            )
            return

        async for chunk in plugin.generate_stream(messages):
            yield llm_pb2.LLMChunk(
                token=chunk.token,
                accumulated_text=chunk.accumulated_text,
                is_sentence_end=chunk.is_sentence_end,
                is_final=chunk.is_final,
            )