# orquestrador/app/services/llm_service.py

import asyncio
import json
import logging
from collections.abc import Mapping, Sequence
from typing import Dict, Any, List, Optional

import vertexai
from google.api_core.exceptions import NotFound
from vertexai.generative_models import FunctionDeclaration, GenerativeModel, Tool

from app.core.settings import settings
from app.models.tool_model import ToolDefinition


class LLMService:
    """Wrapper around Vertex AI generative models with model-name fallback.

    Vertex initialisation state, model instances and converted tool lists are
    stored on the class, so they are shared by every ``LLMService`` instance
    in the process.
    """

    # Flipped to True after the first instance has called ``vertexai.init``.
    _vertex_initialized = False
    # GenerativeModel instances keyed by model name.
    _models: dict[str, GenerativeModel] = {}
    # Converted Vertex tool lists keyed by a canonical JSON dump of the
    # tool definitions they were built from.
    _vertex_tools_cache: dict[str, Optional[List[Tool]]] = {}
    # Model names tried, in this order, after the configured one.
    _FALLBACK_MODELS = ("gemini-2.5-flash", "gemini-2.0-flash-001", "gemini-1.5-pro")

    def __init__(self):
        """Initialise Vertex AI once and build the ordered list of model names.

        The configured model (``settings.vertex_model_name``) comes first,
        followed by the built-in fallbacks. A missing or blank configured name
        is skipped so it never ends up in the list of models to try.
        """
        if not LLMService._vertex_initialized:
            vertexai.init(
                project=settings.google_project_id,
                location=settings.google_location,
            )
            LLMService._vertex_initialized = True

        configured = (settings.vertex_model_name or "").strip()
        ordered = [configured, *LLMService._FALLBACK_MODELS]
        # dict.fromkeys removes duplicates while keeping first-seen order.
        self.model_names = list(dict.fromkeys(name for name in ordered if name))

    @staticmethod
    def _to_plain(value: Any) -> Any:
        """Recursively convert mapping/sequence containers to dicts and lists.

        Strings and bytes are returned unchanged, as is any other
        non-container value.
        """
        if isinstance(value, Mapping):
            return {key: LLMService._to_plain(item) for key, item in value.items()}
        if isinstance(value, Sequence) and not isinstance(value, (str, bytes, bytearray)):
            return [LLMService._to_plain(item) for item in value]
        return value

    def build_vertex_tools(self, tools: List[ToolDefinition]) -> Optional[List[Tool]]:
        """Convert internal tool definitions to the Vertex AI ``Tool`` format.

        All function declarations are grouped into a single ``Tool``. Results
        are cached on the class, keyed by the name, description and parameters
        of every tool.

        Args:
            tools: Tool definitions to convert.

        Returns:
            A one-element list holding the ``Tool``, or ``None`` when ``tools``
            is empty.
        """
        if not tools:
            return None

        # Canonical JSON (sorted keys, compact separators) so identical
        # definitions always produce the same cache key.
        cache_key = json.dumps(
            [
                {
                    "name": tool.name,
                    "description": tool.description,
                    "parameters": tool.parameters,
                }
                for tool in tools
            ],
            sort_keys=True,
            ensure_ascii=True,
            separators=(",", ":"),
        )
        cached = LLMService._vertex_tools_cache.get(cache_key)
        if cached is not None:
            return cached

        function_declarations = [
            FunctionDeclaration(
                name=tool.name,
                description=tool.description,
                parameters=tool.parameters,
            )
            for tool in tools
        ]

        vertex_tools = [Tool(function_declarations=function_declarations)]
        LLMService._vertex_tools_cache[cache_key] = vertex_tools
        return vertex_tools

    def _get_model(self, model_name: str) -> GenerativeModel:
        """Return the cached ``GenerativeModel`` for ``model_name``, creating it on first use."""
        model = LLMService._models.get(model_name)
        if model is None:
            model = GenerativeModel(model_name)
            LLMService._models[model_name] = model
        return model

    async def generate_response(
        self,
        message: str,
        tools: List[ToolDefinition],
        history: Optional[List[Dict[str, Any]]] = None,
    ) -> Dict[str, Any]:
        """Generate a text reply or a tool call for the user's message.

        Each name in ``self.model_names`` is tried in order. A model that
        raises ``NotFound`` is dropped from the model cache and the next name
        is tried; any other exception propagates to the caller.

        Args:
            message: User message sent to the chat session.
            tools: Tool definitions offered to the model; may be empty.
            history: Prior conversation passed to ``start_chat``; ``None`` is
                treated as an empty history.

        Returns:
            ``{"response": <text>, "tool_call": None}`` for a text answer, or
            ``{"response": None, "tool_call": {"name": ..., "arguments": ...}}``
            when any part of the first candidate carries a function call.
            ``arguments`` is built from plain dicts and lists.

        Raises:
            RuntimeError: If no model produced a response, or the response has
                no candidates or no content parts.
        """
        vertex_tools = self.build_vertex_tools(tools)
        send_kwargs = {"tools": vertex_tools} if vertex_tools else {}

        response = None
        last_error = None

        for model_name in self.model_names:
            try:
                chat = self._get_model(model_name).start_chat(history=history or [])
                # Run the synchronous send_message call in a worker thread so
                # the event loop is not blocked while waiting for the reply.
                response = await asyncio.to_thread(chat.send_message, message, **send_kwargs)
                break
            except NotFound as err:
                last_error = err
                LLMService._models.pop(model_name, None)

        if response is None:
            if last_error:
                raise RuntimeError(
                    f"Nenhum modelo Vertex disponivel. Verifique VERTEX_MODEL_NAME e acesso no projeto. Erro: {last_error}"
                ) from last_error
            raise RuntimeError("Falha ao gerar resposta no Vertex AI.")

        # Guard against responses without candidates or parts instead of
        # letting an IndexError escape.
        candidates = response.candidates
        parts = candidates[0].content.parts if candidates else []
        if not parts:
            raise RuntimeError(
                "Resposta do Vertex AI sem conteudo (nenhum candidato ou parte retornado)."
            )

        # A function call is not necessarily the first part (text may precede
        # it), so scan every part of the first candidate.
        call_part = next((part for part in parts if part.function_call), None)
        if call_part is not None:
            call = call_part.function_call
            return {
                "response": None,
                "tool_call": {
                    "name": call.name,
                    # Deep-convert so nested values become plain dicts/lists.
                    "arguments": self._to_plain(dict(call.args or {})),
                },
            }

        return {
            "response": response.text,
            "tool_call": None,
        }

    async def warmup(self) -> None:
        """Send a minimal request to reduce the latency of the first real request.

        Warm-up is best effort: any failure is logged and otherwise ignored so
        it cannot block start-up.
        """
        try:
            await self.generate_response(
                message="Responda apenas: ok",
                tools=[],
            )
        except Exception:
            logging.getLogger(__name__).warning(
                "Warmup do LLM falhou; inicializacao continua.",
                exc_info=True,
            )