feat: inference process & end of utterance plugin #1133

Open: wants to merge 30 commits into base: main from theo/inference-proc

Commits (30):
- 0e792d7 add endpoint detector (jeradf, Oct 16, 2024)
- bc0b286 sort imports (jeradf, Oct 17, 2024)
- 6ae0937 pass convo history to model prediction (jeradf, Oct 17, 2024)
- 0212d6a formatz (jeradf, Oct 17, 2024)
- ba9afbc no need to deepcopy convo (jeradf, Oct 17, 2024)
- 830f4fc wip (theomonnom, Nov 25, 2024)
- b26526f Merge branch 'turn_detector' into theo/inference-proc (theomonnom, Nov 25, 2024)
- 940eefe wip (theomonnom, Nov 25, 2024)
- 047f79d Delete endpoint_detector.py (theomonnom, Nov 25, 2024)
- 644f958 Discard changes to examples/voice-pipeline-agent/minimal_assistant.py (theomonnom, Nov 25, 2024)
- f7bcfb7 wip (theomonnom, Nov 25, 2024)
- bd43f51 Merge branch 'theo/inference-proc' of https://github.com/livekit/agen… (theomonnom, Nov 25, 2024)
- bc0bca8 wip (theomonnom, Nov 26, 2024)
- d30ef1e Update CHANGELOG.md (theomonnom, Nov 26, 2024)
- a4ca77c wip (theomonnom, Nov 26, 2024)
- 53e06ed Merge branch 'theo/inference-proc' of https://github.com/livekit/agen… (theomonnom, Nov 26, 2024)
- 6c79095 Merge branch 'main' into theo/inference-proc (theomonnom, Nov 26, 2024)
- 0d188a8 working version (theomonnom, Nov 26, 2024)
- ac2ff51 Update pipeline_agent.py (theomonnom, Nov 26, 2024)
- ae24725 Update inference_executor.py (theomonnom, Nov 26, 2024)
- 2bea42b wip (theomonnom, Nov 26, 2024)
- 44cab40 share _ProcClient & remove duplicated code (theomonnom, Nov 27, 2024)
- 68304f9 Merge branch 'main' into theo/inference-proc (theomonnom, Nov 27, 2024)
- 0f4e630 Create wild-walls-occur.md (theomonnom, Nov 27, 2024)
- e8da65b Delete inference_runner.py (theomonnom, Nov 27, 2024)
- f6345e5 Merge branch 'theo/inference-proc' of https://github.com/livekit/agen… (theomonnom, Nov 27, 2024)
- 3153096 wip (theomonnom, Nov 27, 2024)
- be961e6 Update job_proc_executor.py (theomonnom, Nov 27, 2024)
- 3fa6ad6 remove duplicate (theomonnom, Nov 27, 2024)
- 01d02b4 wip (theomonnom, Nov 27, 2024)
6 changes: 6 additions & 0 deletions .changeset/wild-walls-occur.md
@@ -0,0 +1,6 @@
---
"livekit-agents": patch
"livekit-plugins-eou": minor
---

feat: inference process & end of utterance plugin
3 changes: 2 additions & 1 deletion examples/voice-pipeline-agent/minimal_assistant.py
@@ -13,7 +13,7 @@
     metrics,
 )
 from livekit.agents.pipeline import VoicePipelineAgent
-from livekit.plugins import deepgram, openai, silero
+from livekit.plugins import deepgram, eou, openai, silero

 load_dotenv()
 logger = logging.getLogger("voice-assistant")
@@ -49,6 +49,7 @@ async def entrypoint(ctx: JobContext):
         stt=deepgram.STT(model=dg_model),
         llm=openai.LLM(),
         tts=openai.TTS(),
+        eou=eou.EOU(),
         chat_ctx=initial_ctx,
     )
39 changes: 39 additions & 0 deletions livekit-agents/livekit/agents/inference_runner.py
@@ -0,0 +1,39 @@
from __future__ import annotations

import threading
from abc import ABC, abstractmethod
from typing import ClassVar, Protocol, Type


class _RunnerMeta(Protocol):
    INFERENCE_METHOD: ClassVar[str]


_RunnersDict = dict[str, Type["_InferenceRunner"]]


# kept private until we stabilize the API (only used for EOU today)
class _InferenceRunner(ABC, _RunnerMeta):
    registered_runners: _RunnersDict = {}

    @classmethod
    def register_runner(cls, runner_class: Type["_InferenceRunner"]) -> None:
        if threading.current_thread() != threading.main_thread():
            raise RuntimeError("InferenceRunner must be registered on the main thread")

        if runner_class.INFERENCE_METHOD in cls.registered_runners:
            raise ValueError(
                f"InferenceRunner {runner_class.INFERENCE_METHOD} already registered"
            )

        cls.registered_runners[runner_class.INFERENCE_METHOD] = runner_class

    @abstractmethod
    def initialize(self) -> None:
        """Initialize the runner. This is used to load models, etc."""
        ...

    @abstractmethod
    def run(self, data: bytes) -> bytes | None:
        """Run inference on the given data."""
        ...
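To make the contract concrete, a registration could look like the sketch below. The _EchoRunner class and its "lk_echo" method name are hypothetical (and _InferenceRunner is private API), so treat this as illustration only:

from livekit.agents.inference_runner import _InferenceRunner


class _EchoRunner(_InferenceRunner):
    # dispatch key that InferenceRequest.method will carry (hypothetical value)
    INFERENCE_METHOD = "lk_echo"

    def initialize(self) -> None:
        # a real runner loads model weights here, inside the inference process
        pass

    def run(self, data: bytes) -> bytes | None:
        # trivial inference: echo the payload back unchanged
        return data


# must run on the main thread, before the inference process is started
_InferenceRunner.register_runner(_EchoRunner)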
10 changes: 6 additions & 4 deletions livekit-agents/livekit/agents/ipc/__init__.py
@@ -1,17 +1,19 @@
 from . import (
     channel,
+    inference_proc_executor,
     job_executor,
-    proc_job_executor,
+    job_proc_executor,
+    job_thread_executor,
     proc_pool,
     proto,
-    thread_job_executor,
 )

 __all__ = [
     "proto",
     "channel",
     "proc_pool",
-    "proc_job_executor",
-    "thread_job_executor",
+    "job_proc_executor",
+    "job_thread_executor",
+    "inference_proc_executor",
     "job_executor",
 ]
7 changes: 7 additions & 0 deletions livekit-agents/livekit/agents/ipc/inference_executor.py
@@ -0,0 +1,7 @@
from __future__ import annotations

from typing import Protocol


class InferenceExecutor(Protocol):
    async def do_inference(self, method: str, data: bytes) -> bytes | None: ...
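Any component holding an InferenceExecutor can await inference without knowing the call crosses a process boundary. A minimal sketch of a caller, assuming a JSON wire format and a method name ("lk_end_of_utterance") that are not defined in this file:

import json

from livekit.agents.ipc.inference_executor import InferenceExecutor


async def predict_eou(executor: InferenceExecutor, chat_history: list[dict]) -> float:
    # the payload encoding is runner-defined; JSON is only an assumption here
    payload = json.dumps({"chat_ctx": chat_history}).encode()
    result = await executor.do_inference("lk_end_of_utterance", payload)
    if result is None:
        return 0.0
    return float(json.loads(result)["eou_probability"])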
98 changes: 98 additions & 0 deletions livekit-agents/livekit/agents/ipc/inference_proc_executor.py
@@ -0,0 +1,98 @@
from __future__ import annotations

import asyncio
import contextlib
import multiprocessing as mp
import socket
from multiprocessing.context import BaseContext

from ..inference_runner import _RunnersDict
from ..log import logger
from ..utils import aio, log_exceptions, shortuuid
from . import channel, proto
from .inference_proc_lazy_main import ProcStartArgs, proc_main
from .supervised_proc import SupervisedProc


class InferenceProcExecutor(SupervisedProc):
    def __init__(
        self,
        *,
        runners: _RunnersDict,
        initialize_timeout: float,
        close_timeout: float,
        memory_warn_mb: float,
        memory_limit_mb: float,
        ping_interval: float,
        ping_timeout: float,
        high_ping_threshold: float,
        mp_ctx: BaseContext,
        loop: asyncio.AbstractEventLoop,
    ) -> None:
        super().__init__(
            initialize_timeout=initialize_timeout,
            close_timeout=close_timeout,
            memory_warn_mb=memory_warn_mb,
            memory_limit_mb=memory_limit_mb,
            ping_interval=ping_interval,
            ping_timeout=ping_timeout,
            high_ping_threshold=high_ping_threshold,
            mp_ctx=mp_ctx,
            loop=loop,
        )

        self._runners = runners
        self._active_requests: dict[str, asyncio.Future[proto.InferenceResponse]] = {}

    def _create_process(self, cch: socket.socket, log_cch: socket.socket) -> mp.Process:
        proc_args = ProcStartArgs(
            log_cch=log_cch,
            mp_cch=cch,
            runners=self._runners,
        )

        return self._opts.mp_ctx.Process(  # type: ignore
            target=proc_main,
            args=(proc_args,),
            name="inference_proc",
        )

    @log_exceptions(logger=logger)
    async def _main_task(self, ipc_ch: aio.ChanReceiver[channel.Message]) -> None:
        async for msg in ipc_ch:
            if isinstance(msg, proto.InferenceResponse):
                fut = self._active_requests.pop(msg.request_id, None)
                if fut is None:
                    logger.warning(
                        "received unexpected inference response",
                        extra={"request_id": msg.request_id},
                    )
                    continue  # a stray response must not kill the IPC loop

                with contextlib.suppress(asyncio.InvalidStateError):
                    fut.set_result(msg)

    async def do_inference(self, method: str, data: bytes) -> bytes | None:
        if not self.started:
            raise RuntimeError("process not started")

        request_id = shortuuid("inference_req_")
        fut = asyncio.Future[proto.InferenceResponse]()

        # register the future before sending, so a fast response can't be dropped
        self._active_requests[request_id] = fut

        await channel.asend_message(
            self._pch,
            proto.InferenceRequest(request_id=request_id, method=method, data=data),
        )

        inf_resp = await fut
        if inf_resp.error:
            raise RuntimeError(f"inference of {method} failed: {inf_resp.error}")

        return inf_resp.data

    def logging_extra(self):
        extra = super().logging_extra()
        extra["inference"] = True
        return extra
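The executor correlates requests and responses with per-request futures: do_inference registers a future under a fresh request_id, the child echoes that id back, and _main_task resolves the matching future. Stripped of the IPC details, the pattern is roughly this (a standalone sketch, not this module's API):

import asyncio
from typing import Awaitable, Callable

_pending: dict[str, asyncio.Future[bytes]] = {}


async def request(request_id: str, send: Callable[[str], Awaitable[None]]) -> bytes:
    fut: asyncio.Future[bytes] = asyncio.get_running_loop().create_future()
    _pending[request_id] = fut  # register before sending so a fast reply isn't lost
    await send(request_id)
    return await fut


def on_response(request_id: str, data: bytes) -> None:
    # called from the read loop; wakes up the caller awaiting the future
    fut = _pending.pop(request_id, None)
    if fut is not None and not fut.done():
        fut.set_result(data)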
108 changes: 108 additions & 0 deletions livekit-agents/livekit/agents/ipc/inference_proc_lazy_main.py
@@ -0,0 +1,108 @@
from multiprocessing import current_process

if current_process().name == "inference_proc":
    import signal
    import sys

    # ignore signals in the inference process (the parent process will handle them)
    signal.signal(signal.SIGINT, signal.SIG_IGN)
    signal.signal(signal.SIGTERM, signal.SIG_IGN)

    def _no_traceback_excepthook(exc_type, exc_val, traceback):
        if isinstance(exc_val, KeyboardInterrupt):
            return
        sys.__excepthook__(exc_type, exc_val, traceback)

    sys.excepthook = _no_traceback_excepthook


import asyncio
import socket
from dataclasses import dataclass

from ..inference_runner import _RunnersDict
from ..log import logger
from ..utils import aio, log_exceptions
from . import proto
from .channel import Message
from .proc_client import _ProcClient


@dataclass
class ProcStartArgs:
    log_cch: socket.socket
    mp_cch: socket.socket
    runners: _RunnersDict


def proc_main(args: ProcStartArgs) -> None:
    from .proc_client import _ProcClient

    inf_proc = _InferenceProc(args.runners)

    client = _ProcClient(
        args.mp_cch,
        args.log_cch,
        inf_proc.initialize,
        inf_proc.entrypoint,
    )

    client.initialize_logger()

    pid = current_process().pid
    logger.info("initializing inference process", extra={"pid": pid})
    client.initialize()
    logger.info("inference process initialized", extra={"pid": pid})

    client.run()


class _InferenceProc:
    def __init__(self, runners: _RunnersDict) -> None:
        # create an instance of each runner (the ctor must not require any arguments)
        self._runners = {name: runner() for name, runner in runners.items()}

    def initialize(
        self, init_req: proto.InitializeRequest, client: _ProcClient
    ) -> None:
        self._client = client

        for runner in self._runners.values():
            logger.debug(
                "initializing inference runner",
                extra={"runner": runner.__class__.INFERENCE_METHOD},
            )
            runner.initialize()

    @log_exceptions(logger=logger)
    async def entrypoint(self, cch: aio.ChanReceiver[Message]) -> None:
        async for msg in cch:
            if isinstance(msg, proto.InferenceRequest):
                await self._handle_inference_request(msg)

            if isinstance(msg, proto.ShutdownRequest):
                await self._client.send(proto.Exiting(reason=msg.reason))
                break

    async def _handle_inference_request(self, msg: proto.InferenceRequest) -> None:
        loop = asyncio.get_running_loop()

        if msg.method not in self._runners:
            logger.warning("unknown inference method", extra={"method": msg.method})
            # answer with an error instead of falling through to a KeyError below
            await self._client.send(
                proto.InferenceResponse(
                    request_id=msg.request_id,
                    error=f"unknown inference method: {msg.method}",
                )
            )
            return

        try:
            data = await loop.run_in_executor(
                None, self._runners[msg.method].run, msg.data
            )
            await self._client.send(
                proto.InferenceResponse(
                    request_id=msg.request_id,
                    data=data,
                )
            )

        except Exception as e:
            logger.exception("error running inference")
            await self._client.send(
                proto.InferenceResponse(request_id=msg.request_id, error=str(e))
            )
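The module-level guard at the top is load-bearing: with a spawn context the child re-imports this module to find proc_main, so the current_process().name check runs before anything else in the worker and detaches it from SIGINT/SIGTERM, leaving shutdown to the supervising parent. The same pattern in isolation (hypothetical names, standard library only):

from multiprocessing import current_process

# runs at import time in the child too, because spawn re-imports this module
if current_process().name == "my_worker":
    import signal

    signal.signal(signal.SIGINT, signal.SIG_IGN)  # the parent owns Ctrl-C handling
    signal.signal(signal.SIGTERM, signal.SIG_IGN)  # the parent owns termination


def worker_main() -> None:
    print("worker running as", current_process().name)


if __name__ == "__main__":
    from multiprocessing import get_context

    proc = get_context("spawn").Process(target=worker_main, name="my_worker")
    proc.start()
    proc.join()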