diff --git a/app/integrations/telegram_satellite_service.py b/app/integrations/telegram_satellite_service.py
index 54a2e00..2688e03 100644
--- a/app/integrations/telegram_satellite_service.py
+++ b/app/integrations/telegram_satellite_service.py
@@ -10,6 +10,7 @@ from fastapi import HTTPException
 from app.core.settings import settings
 from app.db.database import SessionLocal
 from app.db.mock_database import SessionMockLocal
+from app.services.llm_service import LLMService
 from app.services.orquestrador_service import OrquestradorService
 from app.services.user_service import UserService
 
@@ -65,6 +66,7 @@ class TelegramSatelliteService:
     async def run(self) -> None:
         """Inicia loop de long polling para consumir atualizacoes do bot."""
         logger.info("Telegram satellite iniciado com long polling.")
+        await self._warmup_llm()
         offset = None
         timeout = aiohttp.ClientTimeout(total=self.request_timeout)
 
@@ -82,6 +84,14 @@ class TelegramSatelliteService:
                     offset = update_id + 1
                     await self._handle_update(session=session, update=update)
 
+    async def _warmup_llm(self) -> None:
+        """Preaquece o LLM no startup do satelite para reduzir latencia do primeiro usuario."""
+        try:
+            await LLMService().warmup()
+            logger.info("Warmup de LLM concluido no Telegram satellite.")
+        except Exception:
+            logger.exception("Falha no warmup de LLM do Telegram satellite.")
+
     async def _initialize_offset(self, session: aiohttp.ClientSession) -> int | None:
         """
         Descarta backlog pendente no startup para evitar respostas repetidas apos restart.
diff --git a/app/main.py b/app/main.py
index 0803f9f..2fafe6f 100644
--- a/app/main.py
+++ b/app/main.py
@@ -6,6 +6,7 @@ from app.db.database import Base, engine
 from app.db.mock_database import MockBase, mock_engine
 from app.db.models import Tool
 from app.db.mock_models import Customer, Order, ReviewSchedule, Vehicle
+from app.services.llm_service import LLMService
 
 app = FastAPI(title="AI Orquestrador")
 
@@ -39,4 +40,10 @@ async def startup_event():
     except Exception as e:
         print(f"[Auto-Seed] Aviso: falha ao inicializar MySQL (mock): {e}")
 
+    try:
+        await LLMService().warmup()
+        print("[Startup] LLM warmup concluido.")
+    except Exception as e:
+        print(f"[Startup] Aviso: falha no warmup do LLM: {e}")
+
     print("[Auto-Seed] Startup finalizado.")
diff --git a/app/services/llm_service.py b/app/services/llm_service.py
index 0d358e7..89b04ab 100644
--- a/app/services/llm_service.py
+++ b/app/services/llm_service.py
@@ -1,3 +1,5 @@
+import asyncio
+import json
 from typing import Dict, Any, List, Optional
 
 import vertexai
@@ -9,13 +11,18 @@ from app.models.tool_model import ToolDefinition
 
 
 class LLMService:
+    _vertex_initialized = False
+    _models: dict[str, GenerativeModel] = {}
+    _vertex_tools_cache: dict[str, Optional[List[Tool]]] = {}
 
     def __init__(self):
         """Inicializa o cliente Vertex AI e define modelos de fallback."""
-        vertexai.init(
-            project=settings.google_project_id,
-            location=settings.google_location,
-        )
+        if not LLMService._vertex_initialized:
+            vertexai.init(
+                project=settings.google_project_id,
+                location=settings.google_location,
+            )
+            LLMService._vertex_initialized = True
 
         configured = settings.vertex_model_name.strip()
         fallback_models = ["gemini-2.5-flash", "gemini-2.0-flash-001", "gemini-1.5-pro"]
@@ -27,6 +34,23 @@
         if not tools:
             return None
 
+        cache_key = json.dumps(
+            [
+                {
+                    "name": tool.name,
+                    "description": tool.description,
+                    "parameters": tool.parameters,
+                }
+                for tool in tools
+            ],
+            sort_keys=True,
+            ensure_ascii=True,
+            separators=(",", ":"),
+        )
+        cached = LLMService._vertex_tools_cache.get(cache_key)
+        if cached is not None:
+            return cached
+
         function_declarations = [
             FunctionDeclaration(
                 name=tool.name,
@@ -36,7 +60,16 @@
             for tool in tools
         ]
 
-        return [Tool(function_declarations=function_declarations)]
+        vertex_tools = [Tool(function_declarations=function_declarations)]
+        LLMService._vertex_tools_cache[cache_key] = vertex_tools
+        return vertex_tools
+
+    def _get_model(self, model_name: str) -> GenerativeModel:
+        model = LLMService._models.get(model_name)
+        if model is None:
+            model = GenerativeModel(model_name)
+            LLMService._models[model_name] = model
+        return model
 
     async def generate_response(
         self,
@@ -52,13 +85,14 @@
 
         for model_name in self.model_names:
             try:
-                model = GenerativeModel(model_name)
+                model = self._get_model(model_name)
                 chat = model.start_chat(history=history or [])
                 send_kwargs = {"tools": vertex_tools} if vertex_tools else {}
-                response = chat.send_message(message, **send_kwargs)
+                response = await asyncio.to_thread(chat.send_message, message, **send_kwargs)
                 break
             except NotFound as err:
                 last_error = err
+                LLMService._models.pop(model_name, None)
                 continue
 
         if response is None:
@@ -83,3 +117,14 @@
             "response": response.text,
             "tool_call": None,
         }
+
+    async def warmup(self) -> None:
+        """Preaquece conexao/modelo para reduzir latencia da primeira requisicao real."""
+        try:
+            await self.generate_response(
+                message="Responda apenas: ok",
+                tools=[],
+            )
+        except Exception:
+            # Warmup e melhor esforco; falhas nao devem bloquear inicializacao.
+            return