⚡ perf(llm): reduzir latência com warmup, cache e execução não bloqueante

- inicializa Vertex AI uma única vez por processo
- adiciona cache de modelos GenerativeModel por nome
- adiciona cache da conversão de tools para formato Vertex
- executa send_message em asyncio.to_thread para não bloquear o loop async
- adiciona método warmup no LLMService (best effort)
- executa warmup no startup da API FastAPI
- executa warmup no startup do Telegram Satellite

🎯 Resultado esperado:
- menor latência no primeiro request (cold start)
- menor overhead por requisição subsequente
- melhor throughput em cenários concorrentes
3 months ago · 3bc23e63d1
parent e79be95264
commit 3bc23e63d1
3 changed files with 69 additions and 7 deletions
--- a/app/integrations/telegram_satellite_service.py
+++ b/app/integrations/telegram_satellite_service.py
@ -10,6 +10,7 @@ from fastapi import HTTPException
 from app.core.settings import settings
 from app.db.database import SessionLocal
 from app.db.mock_database import SessionMockLocal
 from app.services.llm_service import LLMService
 from app.services.orquestrador_service import OrquestradorService
 from app.services.user_service import UserService
@ -65,6 +66,7 @@ class TelegramSatelliteService:
    async def run(self) -> None:
        """Inicia loop de long polling para consumir atualizacoes do bot."""
        logger.info("Telegram satellite iniciado com long polling.")
        await self._warmup_llm()
        offset = None
        timeout = aiohttp.ClientTimeout(total=self.request_timeout)
@ -82,6 +84,14 @@ class TelegramSatelliteService:
                    offset = update_id + 1
                    await self._handle_update(session=session, update=update)
    async def _warmup_llm(self) -> None:
        """Warm up the LLM when the satellite starts.

        Builds an ``LLMService`` and awaits its ``warmup()`` coroutine, then
        logs that the warmup finished. Any ``Exception`` raised along the way
        is reported through ``logger.exception`` and suppressed, so the caller
        keeps going regardless of the outcome.
        """
        try:
            llm_service = LLMService()
            await llm_service.warmup()
            logger.info("Warmup de LLM concluido no Telegram satellite.")
        except Exception:
            logger.exception("Falha no warmup de LLM do Telegram satellite.")
    async def _initialize_offset(self, session: aiohttp.ClientSession) -> int | None:
        """
        Descarta backlog pendente no startup para evitar respostas repetidas apos restart.
--- a/app/main.py
+++ b/app/main.py
@ -6,6 +6,7 @@ from app.db.database import Base, engine
 from app.db.mock_database import MockBase, mock_engine
 from app.db.models import Tool
 from app.db.mock_models import Customer, Order, ReviewSchedule, Vehicle
 from app.services.llm_service import LLMService
 app = FastAPI(title="AI Orquestrador")
@ -39,4 +40,10 @@ async def startup_event():
    except Exception as e:
        print(f"[Auto-Seed] Aviso: falha ao inicializar MySQL (mock): {e}")
    try:
        await LLMService().warmup()
        print("[Startup] LLM warmup concluido.")
    except Exception as e:
        print(f"[Startup] Aviso: falha no warmup do LLM: {e}")
    print("[Auto-Seed] Startup finalizado.")
--- a/app/services/llm_service.py
+++ b/app/services/llm_service.py
@ -1,3 +1,5 @@
 import asyncio
 import json
 from typing import Dict, Any, List, Optional
 import vertexai
@ -9,13 +11,18 @@ from app.models.tool_model import ToolDefinition
 class LLMService:
    _vertex_initialized = False
    _models: dict[str, GenerativeModel] = {}
    _vertex_tools_cache: dict[str, Optional[List[Tool]]] = {}
    def __init__(self):
        """Inicializa o cliente Vertex AI e define modelos de fallback."""
-        vertexai.init(
+        if not LLMService._vertex_initialized:
-            project=settings.google_project_id,
+            vertexai.init(
-            location=settings.google_location,
+                project=settings.google_project_id,
-        )
+                location=settings.google_location,
            )
            LLMService._vertex_initialized = True
        configured = settings.vertex_model_name.strip()
        fallback_models = ["gemini-2.5-flash", "gemini-2.0-flash-001", "gemini-1.5-pro"]
@ -27,6 +34,23 @@ class LLMService:
        if not tools:
            return None
        cache_key = json.dumps(
            [
                {
                    "name": tool.name,
                    "description": tool.description,
                    "parameters": tool.parameters,
                }
                for tool in tools
            ],
            sort_keys=True,
            ensure_ascii=True,
            separators=(",", ":"),
        )
        cached = LLMService._vertex_tools_cache.get(cache_key)
        if cached is not None:
            return cached
        function_declarations = [
            FunctionDeclaration(
                name=tool.name,
@ -36,7 +60,16 @@ class LLMService:
            for tool in tools
        ]
-        return [Tool(function_declarations=function_declarations)]
+        vertex_tools = [Tool(function_declarations=function_declarations)]
        LLMService._vertex_tools_cache[cache_key] = vertex_tools
        return vertex_tools
    def _get_model(self, model_name: str) -> GenerativeModel:
        """Return the ``GenerativeModel`` for ``model_name``, creating it on first use.

        Instances are stored in the class-level ``LLMService._models`` dict,
        keyed by model name, so every ``LLMService`` object shares them.
        """
        registry = LLMService._models
        cached = registry.get(model_name)
        if cached is not None:
            return cached
        # Cache miss: build the model once and remember it for later calls.
        fresh = GenerativeModel(model_name)
        registry[model_name] = fresh
        return fresh
    async def generate_response(
        self,
@ -52,13 +85,14 @@ class LLMService:
        for model_name in self.model_names:
            try:
-                model = GenerativeModel(model_name)
+                model = self._get_model(model_name)
                chat = model.start_chat(history=history or [])
                send_kwargs = {"tools": vertex_tools} if vertex_tools else {}
-                response = chat.send_message(message, **send_kwargs)
+                response = await asyncio.to_thread(chat.send_message, message, **send_kwargs)
                break
            except NotFound as err:
                last_error = err
                LLMService._models.pop(model_name, None)
                continue
        if response is None:
@ -83,3 +117,14 @@ class LLMService:
            "response": response.text,
            "tool_call": None,
        }
    async def warmup(self) -> None:
        """Warm up the connection/model to cut latency of the first real request.

        Sends one minimal prompt through ``generate_response`` with an empty
        tool list. Warmup is best effort: any ``Exception`` is logged (with
        traceback) and suppressed, so a failure never blocks initialization
        and never propagates to the caller.
        """
        try:
            await self.generate_response(
                message="Responda apenas: ok",
                tools=[],
            )
        except Exception:
            # Still best effort, but no longer silent: without this record a
            # failed warmup is indistinguishable from a successful one.
            # Imported locally so the method does not rely on a module-level
            # logger being defined in this file.
            import logging

            logging.getLogger(__name__).warning(
                "Falha no warmup do LLM; inicializacao segue sem preaquecimento.",
                exc_info=True,
            )