diff --git a/app/integrations/telegram_satellite_service.py b/app/integrations/telegram_satellite_service.py
index 54a2e00..2688e03 100644
--- a/app/integrations/telegram_satellite_service.py
+++ b/app/integrations/telegram_satellite_service.py
@@ -10,6 +10,7 @@ from fastapi import HTTPException
 from app.core.settings import settings
 from app.db.database import SessionLocal
 from app.db.mock_database import SessionMockLocal
+from app.services.llm_service import LLMService
 from app.services.orquestrador_service import OrquestradorService
 from app.services.user_service import UserService
 
@@ -65,6 +66,7 @@ class TelegramSatelliteService:
     async def run(self) -> None:
         """Inicia loop de long polling para consumir atualizacoes do bot."""
         logger.info("Telegram satellite iniciado com long polling.")
+        await self._warmup_llm()
         offset = None
         timeout = aiohttp.ClientTimeout(total=self.request_timeout)
 
@@ -82,6 +84,14 @@ class TelegramSatelliteService:
                     offset = update_id + 1
                     await self._handle_update(session=session, update=update)
 
+    async def _warmup_llm(self) -> None:
+        """Preaquece o LLM no startup do satelite para reduzir latencia do primeiro usuario."""
+        try:
+            await LLMService().warmup()
+            logger.info("Warmup de LLM concluido no Telegram satellite.")
+        except Exception:
+            logger.exception("Falha no warmup de LLM do Telegram satellite.")
+
     async def _initialize_offset(self, session: aiohttp.ClientSession) -> int | None:
         """
         Descarta backlog pendente no startup para evitar respostas repetidas apos restart.
diff --git a/app/main.py b/app/main.py
index 0803f9f..2fafe6f 100644
--- a/app/main.py
+++ b/app/main.py
@@ -6,6 +6,7 @@ from app.db.database import Base, engine
 from app.db.mock_database import MockBase, mock_engine
 from app.db.models import Tool
 from app.db.mock_models import Customer, Order, ReviewSchedule, Vehicle
+from app.services.llm_service import LLMService
 
 app = FastAPI(title="AI Orquestrador")
 
@@ -39,4 +40,10 @@ async def startup_event():
     except Exception as e:
         print(f"[Auto-Seed] Aviso: falha ao inicializar MySQL (mock): {e}")
 
+    try:
+        await LLMService().warmup()
+        print("[Startup] LLM warmup concluido.")
+    except Exception as e:
+        print(f"[Startup] Aviso: falha no warmup do LLM: {e}")
+
     print("[Auto-Seed] Startup finalizado.")
diff --git a/app/services/llm_service.py b/app/services/llm_service.py
index 0d358e7..89b04ab 100644
--- a/app/services/llm_service.py
+++ b/app/services/llm_service.py
@@ -1,3 +1,5 @@
+import asyncio
+import json
 from typing import Dict, Any, List, Optional
 
 import vertexai
@@ -9,13 +11,18 @@ from app.models.tool_model import ToolDefinition
 
 
 class LLMService:
+    _vertex_initialized = False
+    _models: dict[str, GenerativeModel] = {}
+    _vertex_tools_cache: dict[str, Optional[List[Tool]]] = {}
 
     def __init__(self):
         """Inicializa o cliente Vertex AI e define modelos de fallback."""
-        vertexai.init(
-            project=settings.google_project_id,
-            location=settings.google_location,
-        )
+        if not LLMService._vertex_initialized:
+            vertexai.init(
+                project=settings.google_project_id,
+                location=settings.google_location,
+            )
+            LLMService._vertex_initialized = True
 
         configured = settings.vertex_model_name.strip()
         fallback_models = ["gemini-2.5-flash", "gemini-2.0-flash-001", "gemini-1.5-pro"]
@@ -27,6 +34,23 @@
         if not tools:
             return None
 
+        cache_key = json.dumps(
+            [
+                {
+                    "name": tool.name,
+                    "description": tool.description,
+                    "parameters": tool.parameters,
+                }
+                for tool in tools
+            ],
+            sort_keys=True,
+            ensure_ascii=True,
+            separators=(",", ":"),
+        )
+        cached = LLMService._vertex_tools_cache.get(cache_key)
+        if cached is not None:
+            return cached
+
         function_declarations = [
             FunctionDeclaration(
                 name=tool.name,
@@ -36,7 +60,16 @@
             for tool in tools
         ]
 
-        return [Tool(function_declarations=function_declarations)]
+        vertex_tools = [Tool(function_declarations=function_declarations)]
+        LLMService._vertex_tools_cache[cache_key] = vertex_tools
+        return vertex_tools
+
+    def _get_model(self, model_name: str) -> GenerativeModel:
+        model = LLMService._models.get(model_name)
+        if model is None:
+            model = GenerativeModel(model_name)
+            LLMService._models[model_name] = model
+        return model
 
     async def generate_response(
         self,
@@ -52,13 +85,14 @@
 
         for model_name in self.model_names:
             try:
-                model = GenerativeModel(model_name)
+                model = self._get_model(model_name)
                 chat = model.start_chat(history=history or [])
                 send_kwargs = {"tools": vertex_tools} if vertex_tools else {}
-                response = chat.send_message(message, **send_kwargs)
+                response = await asyncio.to_thread(chat.send_message, message, **send_kwargs)
                 break
             except NotFound as err:
                 last_error = err
+                LLMService._models.pop(model_name, None)
                 continue
 
         if response is None:
@@ -83,3 +117,14 @@
             "response": response.text,
             "tool_call": None,
         }
+
+    async def warmup(self) -> None:
+        """Preaquece conexao/modelo para reduzir latencia da primeira requisicao real."""
+        try:
+            await self.generate_response(
+                message="Responda apenas: ok",
+                tools=[],
+            )
+        except Exception:
+            # Warmup e melhor esforco; falhas nao devem bloquear inicializacao.
+            return