@@ -1,3 +1,5 @@
import asyncio
import json
from typing import Dict , Any , List , Optional
import vertexai
@@ -9,13 +11,18 @@ from app.models.tool_model import ToolDefinition
class LLMService :
_vertex_initialized = False
_models : dict [ str , GenerativeModel ] = { }
_vertex_tools_cache : dict [ str , Optional [ List [ Tool ] ] ] = { }
def __init__ ( self ) :
""" Inicializa o cliente Vertex AI e define modelos de fallback. """
vertexai . init (
project = settings . google_project_id ,
location = settings . google_location ,
)
if not LLMService . _vertex_initialized :
vertexai . init (
project = settings . google_project_id ,
location = settings . google_location ,
)
LLMService . _vertex_initialized = True
configured = settings . vertex_model_name . strip ( )
fallback_models = [ " gemini-2.5-flash " , " gemini-2.0-flash-001 " , " gemini-1.5-pro " ]
@@ -27,6 +34,23 @@ class LLMService:
if not tools :
return None
cache_key = json . dumps (
[
{
" name " : tool . name ,
" description " : tool . description ,
" parameters " : tool . parameters ,
}
for tool in tools
] ,
sort_keys = True ,
ensure_ascii = True ,
separators = ( " , " , " : " ) ,
)
cached = LLMService . _vertex_tools_cache . get ( cache_key )
if cached is not None :
return cached
function_declarations = [
FunctionDeclaration (
name = tool . name ,
@@ -36,7 +60,16 @@ class LLMService:
for tool in tools
]
return [ Tool ( function_declarations = function_declarations ) ]
vertex_tools = [ Tool ( function_declarations = function_declarations ) ]
LLMService . _vertex_tools_cache [ cache_key ] = vertex_tools
return vertex_tools
def _get_model(self, model_name: str) -> GenerativeModel:
    """Return the shared GenerativeModel for *model_name*, building it on first use.

    Instances are memoized in the class-level ``_models`` dict so repeated
    requests reuse the same client object instead of reconstructing it.
    """
    cached = LLMService._models.get(model_name)
    if cached is not None:
        return cached
    # First request for this model name: construct and memoize.
    fresh = GenerativeModel(model_name)
    LLMService._models[model_name] = fresh
    return fresh
async def generate_response (
self ,
@@ -52,13 +85,14 @@
for model_name in self . model_names :
try :
model = GenerativeM odel( model_name )
model = self . _get_m odel( model_name )
chat = model . start_chat ( history = history or [ ] )
send_kwargs = { " tools " : vertex_tools } if vertex_tools else { }
response = chat . send_message ( message , * * send_kwargs )
response = await asyncio . to_thread ( chat . send_message , message , * * send_kwargs )
break
except NotFound as err :
last_error = err
LLMService . _models . pop ( model_name , None )
continue
if response is None :
@@ -83,3 +117,14 @@
" response " : response . text ,
" tool_call " : None ,
}
async def warmup(self) -> None:
    """Warm up the connection/model to cut latency on the first real request.

    Sends one tiny prompt through ``generate_response``. Warmup is strictly
    best-effort: any failure is swallowed so it can never block startup.
    """
    try:
        await self.generate_response(message="Responda apenas: ok", tools=[])
    except Exception:
        # Best effort; a failed warmup must not block initialization.
        pass