perf(llm): reduzir latência com warmup, cache e execução não bloqueante

- inicializa Vertex AI uma única vez por processo
- adiciona cache de modelos GenerativeModel por nome
- adiciona cache da conversão de tools para formato Vertex
- executa send_message em asyncio.to_thread para não bloquear o loop async
- adiciona método warmup no LLMService (best effort)
- executa warmup no startup da API FastAPI
- executa warmup no startup do Telegram Satellite

🎯 Resultado esperado:
- menor latência no primeiro request (cold start)
- menor overhead por requisição subsequente
- melhor throughput em cenários concorrentes
main
parent e79be95264
commit 3bc23e63d1

@ -10,6 +10,7 @@ from fastapi import HTTPException
from app.core.settings import settings from app.core.settings import settings
from app.db.database import SessionLocal from app.db.database import SessionLocal
from app.db.mock_database import SessionMockLocal from app.db.mock_database import SessionMockLocal
from app.services.llm_service import LLMService
from app.services.orquestrador_service import OrquestradorService from app.services.orquestrador_service import OrquestradorService
from app.services.user_service import UserService from app.services.user_service import UserService
@ -65,6 +66,7 @@ class TelegramSatelliteService:
async def run(self) -> None: async def run(self) -> None:
"""Inicia loop de long polling para consumir atualizacoes do bot.""" """Inicia loop de long polling para consumir atualizacoes do bot."""
logger.info("Telegram satellite iniciado com long polling.") logger.info("Telegram satellite iniciado com long polling.")
await self._warmup_llm()
offset = None offset = None
timeout = aiohttp.ClientTimeout(total=self.request_timeout) timeout = aiohttp.ClientTimeout(total=self.request_timeout)
@ -82,6 +84,14 @@ class TelegramSatelliteService:
offset = update_id + 1 offset = update_id + 1
await self._handle_update(session=session, update=update) await self._handle_update(session=session, update=update)
async def _warmup_llm(self) -> None:
    """Warm up the LLM at satellite startup to cut latency for the first user.

    Builds an ``LLMService`` and awaits its ``warmup()``. Any ``Exception``
    raised along the way is reported through ``logger.exception`` and is
    not re-raised, so startup continues either way.
    """
    try:
        llm_service = LLMService()
        await llm_service.warmup()
        logger.info("Warmup de LLM concluido no Telegram satellite.")
    except Exception:
        # Startup must continue even when the warmup cannot run.
        logger.exception("Falha no warmup de LLM do Telegram satellite.")
async def _initialize_offset(self, session: aiohttp.ClientSession) -> int | None: async def _initialize_offset(self, session: aiohttp.ClientSession) -> int | None:
""" """
Descarta backlog pendente no startup para evitar respostas repetidas apos restart. Descarta backlog pendente no startup para evitar respostas repetidas apos restart.

@ -6,6 +6,7 @@ from app.db.database import Base, engine
from app.db.mock_database import MockBase, mock_engine from app.db.mock_database import MockBase, mock_engine
from app.db.models import Tool from app.db.models import Tool
from app.db.mock_models import Customer, Order, ReviewSchedule, Vehicle from app.db.mock_models import Customer, Order, ReviewSchedule, Vehicle
from app.services.llm_service import LLMService
app = FastAPI(title="AI Orquestrador") app = FastAPI(title="AI Orquestrador")
@ -39,4 +40,10 @@ async def startup_event():
except Exception as e: except Exception as e:
print(f"[Auto-Seed] Aviso: falha ao inicializar MySQL (mock): {e}") print(f"[Auto-Seed] Aviso: falha ao inicializar MySQL (mock): {e}")
try:
await LLMService().warmup()
print("[Startup] LLM warmup concluido.")
except Exception as e:
print(f"[Startup] Aviso: falha no warmup do LLM: {e}")
print("[Auto-Seed] Startup finalizado.") print("[Auto-Seed] Startup finalizado.")

@ -1,3 +1,5 @@
import asyncio
import json
from typing import Dict, Any, List, Optional from typing import Dict, Any, List, Optional
import vertexai import vertexai
@ -9,13 +11,18 @@ from app.models.tool_model import ToolDefinition
class LLMService: class LLMService:
_vertex_initialized = False
_models: dict[str, GenerativeModel] = {}
_vertex_tools_cache: dict[str, Optional[List[Tool]]] = {}
def __init__(self): def __init__(self):
"""Inicializa o cliente Vertex AI e define modelos de fallback.""" """Inicializa o cliente Vertex AI e define modelos de fallback."""
vertexai.init( if not LLMService._vertex_initialized:
project=settings.google_project_id, vertexai.init(
location=settings.google_location, project=settings.google_project_id,
) location=settings.google_location,
)
LLMService._vertex_initialized = True
configured = settings.vertex_model_name.strip() configured = settings.vertex_model_name.strip()
fallback_models = ["gemini-2.5-flash", "gemini-2.0-flash-001", "gemini-1.5-pro"] fallback_models = ["gemini-2.5-flash", "gemini-2.0-flash-001", "gemini-1.5-pro"]
@ -27,6 +34,23 @@ class LLMService:
if not tools: if not tools:
return None return None
cache_key = json.dumps(
[
{
"name": tool.name,
"description": tool.description,
"parameters": tool.parameters,
}
for tool in tools
],
sort_keys=True,
ensure_ascii=True,
separators=(",", ":"),
)
cached = LLMService._vertex_tools_cache.get(cache_key)
if cached is not None:
return cached
function_declarations = [ function_declarations = [
FunctionDeclaration( FunctionDeclaration(
name=tool.name, name=tool.name,
@ -36,7 +60,16 @@ class LLMService:
for tool in tools for tool in tools
] ]
return [Tool(function_declarations=function_declarations)] vertex_tools = [Tool(function_declarations=function_declarations)]
LLMService._vertex_tools_cache[cache_key] = vertex_tools
return vertex_tools
def _get_model(self, model_name: str) -> GenerativeModel:
    """Return the ``GenerativeModel`` for ``model_name``, creating it on first use.

    Instances are stored in the class-level ``LLMService._models`` dict, so
    every ``LLMService`` object in the process shares the same cache.
    """
    cached = LLMService._models.get(model_name)
    if cached is not None:
        return cached

    # Cache miss: build the model once and remember it for later calls.
    created = GenerativeModel(model_name)
    LLMService._models[model_name] = created
    return created
async def generate_response( async def generate_response(
self, self,
@ -52,13 +85,14 @@ class LLMService:
for model_name in self.model_names: for model_name in self.model_names:
try: try:
model = GenerativeModel(model_name) model = self._get_model(model_name)
chat = model.start_chat(history=history or []) chat = model.start_chat(history=history or [])
send_kwargs = {"tools": vertex_tools} if vertex_tools else {} send_kwargs = {"tools": vertex_tools} if vertex_tools else {}
response = chat.send_message(message, **send_kwargs) response = await asyncio.to_thread(chat.send_message, message, **send_kwargs)
break break
except NotFound as err: except NotFound as err:
last_error = err last_error = err
LLMService._models.pop(model_name, None)
continue continue
if response is None: if response is None:
@ -83,3 +117,14 @@ class LLMService:
"response": response.text, "response": response.text,
"tool_call": None, "tool_call": None,
} }
async def warmup(self) -> None:
    """Send one tiny request so the connection and model are warm before real traffic.

    Best effort: any ``Exception`` raised by ``generate_response`` is logged
    as a warning (with traceback) and suppressed, so a failed warmup never
    blocks initialization. The method always returns ``None``; callers
    cannot detect a failure from the return value.
    """
    # Function-scope import: does not rely on a module-level logger existing.
    import logging

    try:
        await self.generate_response(
            message="Responda apenas: ok",
            tools=[],
        )
    except Exception:
        # Keep the best-effort contract (never raise), but leave a trace:
        # a silent failure would let callers report the warmup as done
        # while the first real request still pays the cold-start cost.
        logging.getLogger(__name__).warning(
            "Falha no warmup do LLM; a primeira requisicao pode ter latencia maior.",
            exc_info=True,
        )

Loading…
Cancel
Save