perf(llm): reduzir latência com warmup, cache e execução não bloqueante

- inicializa Vertex AI uma única vez por processo
- adiciona cache de modelos GenerativeModel por nome
- adiciona cache da conversão de tools para formato Vertex
- executa send_message em asyncio.to_thread para não bloquear o loop async
- adiciona método warmup no LLMService (best effort)
- executa warmup no startup da API FastAPI
- executa warmup no startup do Telegram Satellite

🎯 Resultado esperado:
- menor latência no primeiro request (cold start)
- menor overhead por requisição subsequente
- melhor throughput em cenários concorrentes
main
parent e79be95264
commit 3bc23e63d1

@ -10,6 +10,7 @@ from fastapi import HTTPException
from app.core.settings import settings
from app.db.database import SessionLocal
from app.db.mock_database import SessionMockLocal
from app.services.llm_service import LLMService
from app.services.orquestrador_service import OrquestradorService
from app.services.user_service import UserService
@ -65,6 +66,7 @@ class TelegramSatelliteService:
async def run(self) -> None:
"""Inicia loop de long polling para consumir atualizacoes do bot."""
logger.info("Telegram satellite iniciado com long polling.")
await self._warmup_llm()
offset = None
timeout = aiohttp.ClientTimeout(total=self.request_timeout)
@ -82,6 +84,14 @@ class TelegramSatelliteService:
offset = update_id + 1
await self._handle_update(session=session, update=update)
async def _warmup_llm(self) -> None:
    """Pre-warm the LLM at satellite startup so the first user sees lower latency."""
    try:
        await LLMService().warmup()
    except Exception:
        # Warmup is best effort: log the failure and keep the satellite running.
        logger.exception("Falha no warmup de LLM do Telegram satellite.")
    else:
        logger.info("Warmup de LLM concluido no Telegram satellite.")
async def _initialize_offset(self, session: aiohttp.ClientSession) -> int | None:
"""
Descarta backlog pendente no startup para evitar respostas repetidas apos restart.

@ -6,6 +6,7 @@ from app.db.database import Base, engine
from app.db.mock_database import MockBase, mock_engine
from app.db.models import Tool
from app.db.mock_models import Customer, Order, ReviewSchedule, Vehicle
from app.services.llm_service import LLMService
app = FastAPI(title="AI Orquestrador")
@ -39,4 +40,10 @@ async def startup_event():
except Exception as e:
print(f"[Auto-Seed] Aviso: falha ao inicializar MySQL (mock): {e}")
try:
await LLMService().warmup()
print("[Startup] LLM warmup concluido.")
except Exception as e:
print(f"[Startup] Aviso: falha no warmup do LLM: {e}")
print("[Auto-Seed] Startup finalizado.")

@ -1,3 +1,5 @@
import asyncio
import json
from typing import Dict, Any, List, Optional
import vertexai
@ -9,13 +11,18 @@ from app.models.tool_model import ToolDefinition
class LLMService:
_vertex_initialized = False
_models: dict[str, GenerativeModel] = {}
_vertex_tools_cache: dict[str, Optional[List[Tool]]] = {}
def __init__(self):
"""Inicializa o cliente Vertex AI e define modelos de fallback."""
vertexai.init(
project=settings.google_project_id,
location=settings.google_location,
)
if not LLMService._vertex_initialized:
vertexai.init(
project=settings.google_project_id,
location=settings.google_location,
)
LLMService._vertex_initialized = True
configured = settings.vertex_model_name.strip()
fallback_models = ["gemini-2.5-flash", "gemini-2.0-flash-001", "gemini-1.5-pro"]
@ -27,6 +34,23 @@ class LLMService:
if not tools:
return None
cache_key = json.dumps(
[
{
"name": tool.name,
"description": tool.description,
"parameters": tool.parameters,
}
for tool in tools
],
sort_keys=True,
ensure_ascii=True,
separators=(",", ":"),
)
cached = LLMService._vertex_tools_cache.get(cache_key)
if cached is not None:
return cached
function_declarations = [
FunctionDeclaration(
name=tool.name,
@ -36,7 +60,16 @@ class LLMService:
for tool in tools
]
return [Tool(function_declarations=function_declarations)]
vertex_tools = [Tool(function_declarations=function_declarations)]
LLMService._vertex_tools_cache[cache_key] = vertex_tools
return vertex_tools
def _get_model(self, model_name: str) -> GenerativeModel:
    """Return the cached GenerativeModel for *model_name*, creating it on first use."""
    try:
        # Fast path: the model was already constructed by an earlier request.
        return LLMService._models[model_name]
    except KeyError:
        instance = GenerativeModel(model_name)
        LLMService._models[model_name] = instance
        return instance
async def generate_response(
self,
@ -52,13 +85,14 @@ class LLMService:
for model_name in self.model_names:
try:
model = GenerativeModel(model_name)
model = self._get_model(model_name)
chat = model.start_chat(history=history or [])
send_kwargs = {"tools": vertex_tools} if vertex_tools else {}
response = chat.send_message(message, **send_kwargs)
response = await asyncio.to_thread(chat.send_message, message, **send_kwargs)
break
except NotFound as err:
last_error = err
LLMService._models.pop(model_name, None)
continue
if response is None:
@ -83,3 +117,14 @@ class LLMService:
"response": response.text,
"tool_call": None,
}
async def warmup(self) -> None:
    """Pre-warm the connection/model so the first real request has lower latency."""
    try:
        await self.generate_response(message="Responda apenas: ok", tools=[])
    except Exception:
        # Warmup is best effort; a failure here must never block initialization.
        pass

Loading…
Cancel
Save