perf(llm): reduzir latência com warmup, cache e execução não bloqueante

- inicializa Vertex AI uma única vez por processo
- adiciona cache de modelos GenerativeModel por nome
- adiciona cache da conversão de tools para formato Vertex
- executa send_message em asyncio.to_thread para não bloquear o loop async
- adiciona método warmup no LLMService (best effort)
- executa warmup no startup da API FastAPI
- executa warmup no startup do Telegram Satellite

🎯 Resultado esperado:
- menor latência no primeiro request (cold start)
- menor overhead por requisição subsequente
- melhor throughput em cenários concorrentes
main
parent e79be95264
commit 3bc23e63d1

@ -10,6 +10,7 @@ from fastapi import HTTPException
from app.core.settings import settings
from app.db.database import SessionLocal
from app.db.mock_database import SessionMockLocal
from app.services.llm_service import LLMService
from app.services.orquestrador_service import OrquestradorService
from app.services.user_service import UserService
@ -65,6 +66,7 @@ class TelegramSatelliteService:
async def run(self) -> None:
"""Inicia loop de long polling para consumir atualizacoes do bot."""
logger.info("Telegram satellite iniciado com long polling.")
await self._warmup_llm()
offset = None
timeout = aiohttp.ClientTimeout(total=self.request_timeout)
@ -82,6 +84,14 @@ class TelegramSatelliteService:
offset = update_id + 1
await self._handle_update(session=session, update=update)
async def _warmup_llm(self) -> None:
    """Pre-warm the LLM at satellite startup so the first user sees lower latency."""
    try:
        await LLMService().warmup()
    except Exception:
        # Warmup is best effort: log the failure and keep the satellite running.
        logger.exception("Falha no warmup de LLM do Telegram satellite.")
    else:
        logger.info("Warmup de LLM concluido no Telegram satellite.")
async def _initialize_offset(self, session: aiohttp.ClientSession) -> int | None:
"""
Descarta backlog pendente no startup para evitar respostas repetidas apos restart.

@ -6,6 +6,7 @@ from app.db.database import Base, engine
from app.db.mock_database import MockBase, mock_engine
from app.db.models import Tool
from app.db.mock_models import Customer, Order, ReviewSchedule, Vehicle
from app.services.llm_service import LLMService
app = FastAPI(title="AI Orquestrador")
@ -39,4 +40,10 @@ async def startup_event():
except Exception as e:
print(f"[Auto-Seed] Aviso: falha ao inicializar MySQL (mock): {e}")
try:
await LLMService().warmup()
print("[Startup] LLM warmup concluido.")
except Exception as e:
print(f"[Startup] Aviso: falha no warmup do LLM: {e}")
print("[Auto-Seed] Startup finalizado.")

@ -1,3 +1,5 @@
import asyncio
import json
from typing import Dict, Any, List, Optional
import vertexai
@ -9,13 +11,18 @@ from app.models.tool_model import ToolDefinition
class LLMService:
_vertex_initialized = False
_models: dict[str, GenerativeModel] = {}
_vertex_tools_cache: dict[str, Optional[List[Tool]]] = {}
def __init__(self):
"""Inicializa o cliente Vertex AI e define modelos de fallback."""
vertexai.init(
project=settings.google_project_id,
location=settings.google_location,
)
if not LLMService._vertex_initialized:
vertexai.init(
project=settings.google_project_id,
location=settings.google_location,
)
LLMService._vertex_initialized = True
configured = settings.vertex_model_name.strip()
fallback_models = ["gemini-2.5-flash", "gemini-2.0-flash-001", "gemini-1.5-pro"]
@ -27,6 +34,23 @@ class LLMService:
if not tools:
return None
cache_key = json.dumps(
[
{
"name": tool.name,
"description": tool.description,
"parameters": tool.parameters,
}
for tool in tools
],
sort_keys=True,
ensure_ascii=True,
separators=(",", ":"),
)
cached = LLMService._vertex_tools_cache.get(cache_key)
if cached is not None:
return cached
function_declarations = [
FunctionDeclaration(
name=tool.name,
@ -36,7 +60,16 @@ class LLMService:
for tool in tools
]
return [Tool(function_declarations=function_declarations)]
vertex_tools = [Tool(function_declarations=function_declarations)]
LLMService._vertex_tools_cache[cache_key] = vertex_tools
return vertex_tools
def _get_model(self, model_name: str) -> GenerativeModel:
    """Return the cached GenerativeModel for *model_name*, creating it on first use."""
    try:
        # Fast path: the model was already constructed by an earlier request.
        return LLMService._models[model_name]
    except KeyError:
        instance = GenerativeModel(model_name)
        LLMService._models[model_name] = instance
        return instance
async def generate_response(
self,
@ -52,13 +85,14 @@ class LLMService:
for model_name in self.model_names:
try:
model = GenerativeModel(model_name)
model = self._get_model(model_name)
chat = model.start_chat(history=history or [])
send_kwargs = {"tools": vertex_tools} if vertex_tools else {}
response = chat.send_message(message, **send_kwargs)
response = await asyncio.to_thread(chat.send_message, message, **send_kwargs)
break
except NotFound as err:
last_error = err
LLMService._models.pop(model_name, None)
continue
if response is None:
@ -83,3 +117,14 @@ class LLMService:
"response": response.text,
"tool_call": None,
}
async def warmup(self) -> None:
    """Pre-warm the connection/model so the first real request has lower latency."""
    try:
        await self.generate_response(message="Responda apenas: ok", tools=[])
    except Exception:
        # Warmup is best effort; a failure here must never block initialization.
        pass

Loading…
Cancel
Save