Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
24 changes: 24 additions & 0 deletions Dockerfile
Original file line number Diff line number Diff line change
@@ -0,0 +1,24 @@
# --- Final Runtime Image ---
# Using python:3.11 as the base image to support evaluations that require Python,
# while still running the pre-compiled Go binary for the toolbox server.
FROM python:3.11

# Make a RUN step fail when ANY command of a pipeline fails (not just the last
# one), so a failed `curl` in the version lookup below is not masked by grep/sed.
SHELL ["/bin/bash", "-o", "pipefail", "-c"]

# Install necessary runtime certificates, standard C libraries, and curl.
# The apt lists are removed in the same layer so they never reach the image.
RUN apt-get update \
    && apt-get install -y --no-install-recommends \
        ca-certificates \
        curl \
        libc6 \
    && rm -rf /var/lib/apt/lists/*

WORKDIR /app

# Toolbox release tag to install. Pass `--build-arg TOOLBOX_VERSION=<tag>` for a
# reproducible build; when left empty (the default) the latest GitHub release
# tag is looked up at build time.
ARG TOOLBOX_VERSION=""

# Resolve the release tag, then download the binary and mark it executable in a
# single layer. `curl -f` turns HTTP errors into a non-zero exit status, so an
# error page can never be saved as /app/toolbox, and an empty tag aborts the
# build instead of producing a malformed download URL.
RUN set -eu; \
    version="${TOOLBOX_VERSION}"; \
    if [ -z "${version}" ]; then \
        version="$(curl -fsSL https://api.github.com/repos/googleapis/mcp-toolbox/releases/latest \
            | grep '"tag_name":' \
            | sed -E 's/.*"([^"]+)".*/\1/')"; \
    fi; \
    if [ -z "${version}" ]; then \
        echo "Could not determine the toolbox release version" >&2; \
        exit 1; \
    fi; \
    curl -fSL "https://storage.googleapis.com/mcp-toolbox-for-databases/${version}/linux/amd64/toolbox" -o /app/toolbox; \
    chmod +x /app/toolbox

# Copy the extension's skills and configuration into the container
COPY skills/ ./skills/
COPY gemini-extension.json .

# Add required tools.yaml placeholder to satisfy binary startup checks
RUN touch tools.yaml

# NOTE(review): no USER is set, so the server runs as root. Consider switching to
# a dedicated non-root user once it is confirmed the toolbox needs no root-only
# paths at runtime.

# Documents the port the entrypoint below listens on (EXPOSE does not publish it).
EXPOSE 8080

# Expose HTTP API and UI endpoints to successfully pass Cloud Run health checks
ENTRYPOINT ["/app/toolbox", "--prebuilt", "cloud-sql-postgres", "--address=0.0.0.0", "--port=8080", "--enable-api", "--ui"]
98 changes: 98 additions & 0 deletions cloudbuild.yaml
Original file line number Diff line number Diff line change
@@ -0,0 +1,98 @@
steps:

# --- STEP 1: Build and Push Docker Image ---
# The image receives two tags: an immutable per-build tag ($BUILD_ID), which the
# deploy step references so it always rolls out exactly what this build
# produced, and the moving `latest` tag, kept for anything that still pulls it.
- name: 'gcr.io/cloud-builders/docker'
  args:
  - 'build'
  - '-t'
  - 'us-central1-docker.pkg.dev/$PROJECT_ID/toolbox-evals/cloud-sql-postgresql:$BUILD_ID'
  - '-t'
  - 'us-central1-docker.pkg.dev/$PROJECT_ID/toolbox-evals/cloud-sql-postgresql:latest'
  - '.'

# Push every tag created above for this repository in one call.
- name: 'gcr.io/cloud-builders/docker'
  args:
  - 'push'
  - '--all-tags'
  - 'us-central1-docker.pkg.dev/$PROJECT_ID/toolbox-evals/cloud-sql-postgresql'

# --- STEP 2: Deploy to Cloud Run ---
# NOTE(review): the database password is passed as a plain environment variable
# and the service is deployed with --allow-unauthenticated. Consider sourcing the
# password from Secret Manager (`--set-secrets`) and restricting who may invoke
# the service.
- name: 'gcr.io/google.com/cloudsdktool/cloud-sdk'
  entrypoint: gcloud
  args:
  - 'run'
  - 'deploy'
  - 'cloud-sql-postgresql-server'
  - '--image=us-central1-docker.pkg.dev/$PROJECT_ID/toolbox-evals/cloud-sql-postgresql:$BUILD_ID'
  - '--region=us-central1'
  - '--allow-unauthenticated'
  - '--port=8080'
  - '--timeout=300'
  - '--set-env-vars=CLOUD_SQL_POSTGRES_PROJECT=$PROJECT_ID,CLOUD_SQL_POSTGRES_INSTANCE=omkar-demo-postgres-1,CLOUD_SQL_POSTGRES_REGION=us-central1,CLOUD_SQL_POSTGRES_DATABASE=postgres,CLOUD_SQL_POSTGRES_USER=postgres,CLOUD_SQL_POSTGRES_PASSWORD=[PASSWORD],CLOUD_SQL_POSTGRES_IP_TYPE=PUBLIC'

# --- STEP 3: Fully Integrated Evaluation to Persist Results ---
# Runs inside the eval_server image: patches the evalbench sources in place,
# starts the evaluation server in the background, waits until port 50051 accepts
# connections, then runs the evaluation client against it.
# Cloud Build substitutes `$PROJECT_ID` before bash runs; `$$` is the escape for
# a literal dollar sign, so `$$!` reaches bash as `$!`.
- name: 'us-central1-docker.pkg.dev/$PROJECT_ID/toolbox-evals/eval_server:latest'
  entrypoint: 'bash'
  args:
  - '-c'
  - |
    set -e
    cd /evalbench

    export EVAL_GCP_PROJECT_ID=$PROJECT_ID
    export EVAL_GCP_PROJECT_REGION=us-central1

    echo "Compiling protobuf files..."
    python3 -m grpc_tools.protoc --proto_path=evalbench/evalproto --python_out=evalbench/evalproto --grpc_python_out=evalbench/evalproto evalbench/evalproto/*.proto

    # NOTE(review): several sed replacements below insert Python statements
    # after a "\n". The whitespace following each "\n" must match the
    # indentation used in the patched file; verify against the evalbench sources.
    echo "Patching client to use insecure credentials..."
    # sed -i 's/"localhost:50051"/"127.0.0.1:50051"/g' evalbench/client/eval_client.py
    sed -i 's/grpc.alts_channel_credentials()/None/g' evalbench/client/eval_client.py
    sed -i 's/grpc.aio.secure_channel(address, channel_creds)/grpc.aio.insecure_channel(address)/g' evalbench/client/eval_client.py

    echo "Patching server to listen on all IPv4 interfaces (0.0.0.0)..."
    sed -i 's/"\[::\]:%s"/"0.0.0.0:%s"/g' /evalbench/evalbench/eval_server.py
    echo "Checking bind success in server (writing to stderr)..."
    sed -i 's|server.add_insecure_port("0.0.0.0:%s" % PORT)|bound_port = server.add_insecure_port("0.0.0.0:%s" % PORT)\n import sys\n sys.stderr.write(f"BOUND_PORT: {bound_port}\\n")\n if bound_port == 0: raise RuntimeError("Failed to bind to port!")|' /evalbench/evalbench/eval_server.py

    echo "Patching eval_service.py to fix TypeError in get_reporters..."
    sed -i 's|reporters = get_reporters(config.get("reporting"), job_id, run_time)|reporters = get_reporters(config.get("reporting") or {}, job_id, run_time)|' /evalbench/evalbench/eval_service.py

    echo "Patching util/session.py to make ADK import lazy..."
    sed -i 's|from google.adk.sessions import VertexAiSessionService||' /evalbench/evalbench/util/session.py
    sed -i 's| def __init__(self, config):| def __init__(self, config):\n from google.adk.sessions import VertexAiSessionService|' /evalbench/evalbench/util/session.py
    echo "Patching databases/util.py to make SecretManagerClient lazy..."
    # sed exits 0 even when nothing matched, so these fallbacks only report
    # hard errors such as a missing file.
    sed -i 's|CLIENT = secretmanager_v1.SecretManagerServiceClient()|CLIENT = None\ndef get_client():\n global CLIENT\n if CLIENT is None:\n CLIENT = secretmanager_v1.SecretManagerServiceClient()\n return CLIENT|' /evalbench/evalbench/databases/util.py || echo "Failed to patch databases/util.py"
    sed -i 's|CLIENT.access_secret_version|get_client().access_secret_version|' /evalbench/evalbench/databases/util.py || echo "Failed to patch databases/util.py usage"
    cd evalbench
    export PROTOCOL_BUFFERS_PYTHON_IMPLEMENTATION=python
    export PYTHONPATH=./evalproto:.
    export CLOUD_RUN=True
    export PORT=50051

    echo "Starting Evaluation Server in background..."
    # stdin is /dev/null so the server can never block waiting for input.
    # stdout and stderr are captured in server.log, the file both failure
    # handlers below print; without this redirect that file would not exist.
    python3 -u ./eval_server.py --localhost </dev/null >/evalbench/evalbench/server.log 2>&1 &
    SERVER_PID=$$!
    # Stop the background server whenever this script exits, on success or failure.
    trap 'kill "$$SERVER_PID" 2>/dev/null || true' EXIT

    echo "Waiting for port 50051 to open..."
    python3 -c "
    import socket
    import sys
    import time
    for i in range(20):
        try:
            # The context manager closes the probe socket on every attempt.
            with socket.create_connection(('127.0.0.1', 50051), timeout=1):
                print('Port is open!')
                sys.exit(0)
        except Exception as e:
            print(f'Port not open yet: {e}')
            time.sleep(1)
    print('Port failed to open')
    sys.exit(1)
    " || { cat /evalbench/evalbench/server.log || true; echo "Server failed to bind port. Check logs above."; exit 1; }

    echo "Server is running. Launching Evaluation Client..."
    cd /evalbench
    export PYTHONPATH=./evalbench:./evalbench/evalproto

    python3 evalbench/client/eval_client.py --experiment=/workspace/evals/run_config.yaml --endpoint=local || { echo "Client failed! Server logs:"; cat /evalbench/evalbench/server.log || true; exit 1; }
14 changes: 14 additions & 0 deletions evals/dataset.json
Original file line number Diff line number Diff line change
@@ -0,0 +1,14 @@
{
  "scenarios": [
    {
      "id": "cloud-sql-debug-01",
      "starting_prompt": "I need to debug the database.",
      "conversation_plan": "Ask the agent to list instances in project omkar-playground. Once listed, ask it to check the CPU usage of the first instance. Finally, ask if that usage is considered high.",
      "expected_trajectory": ["list_instances"],
      "kind": "tool",
      "max_turns": 15
    }
  ]
}
4 changes: 4 additions & 0 deletions evals/gemini_2.5_pro_model.yaml
Original file line number Diff line number Diff line change
@@ -0,0 +1,4 @@
# Gemini model settings. evals/run_config.yaml references this file for both the
# simulated user and the goal_completion scorer.
generator: "gcp_vertex_gemini"
vertex_model: "gemini-2.5-pro"
base_prompt: ''
# NOTE(review): presumably a per-minute cap on model calls; confirm against evalbench.
execs_per_minute: 5
18 changes: 18 additions & 0 deletions evals/model_config.yaml
Original file line number Diff line number Diff line change
@@ -0,0 +1,18 @@
# Configuration for the gemini_cli generator that evals/run_config.yaml selects
# as its model_config.
# NOTE(review): nesting below is reconstructed from key order; confirm it matches
# the schema evalbench expects.
gemini_cli_version: '@google/gemini-cli@0.26.0'
generator: 'gemini_cli'

# Environment variables for the Gemini CLI.
env:
  GOOGLE_CLOUD_PROJECT: 'omkar-playground'
  GOOGLE_CLOUD_LOCATION: 'us-central1'
  GOOGLE_GENAI_USE_VERTEXAI: 'true'
  GEMINI_API_MODEL: 'gemini-2.5-pro'

# Extension under evaluation and the settings handed to it.
setup:
  extensions:
    'https://github.com/gemini-cli-extensions/cloud-sql-postgresql':
      settings:
        CLOUD_SQL_POSTGRES_PROJECT: 'omkar-playground'
        CLOUD_SQL_POSTGRES_INSTANCE: 'omkar-demo-postgres-1'
        CLOUD_SQL_POSTGRES_REGION: 'us-central1'
        CLOUD_SQL_POSTGRES_DATABASE: 'postgres'
        CLOUD_SQL_POSTGRES_USER: 'postgres'
        # Placeholder, not a literal password; presumably expanded from the
        # environment by the consumer of this file (confirm).
        CLOUD_SQL_POSTGRES_PASSWORD: '${CLOUD_SQL_POSTGRES_PASSWORD}'
        CLOUD_SQL_POSTGRES_IP_TYPE: 'PUBLIC'
15 changes: 15 additions & 0 deletions evals/run_config.yaml
Original file line number Diff line number Diff line change
@@ -0,0 +1,15 @@
# Experiment definition passed to the evaluation client via --experiment.
dataset_config: "/workspace/evals/dataset.json"
dataset_format: "gemini-cli-format"

orchestrator: "geminicli"
model_config: "/workspace/evals/model_config.yaml"
# Model configuration used for the simulated user; default simulated user models
# provided by the evalbench repo can be referenced here instead.
simulated_user_model_config: "/workspace/evals/gemini_2.5_pro_model.yaml"

# Scorers applied to each run.
scorers:
  trajectory_matcher: {}
  goal_completion:
    model_config: "/workspace/evals/gemini_2.5_pro_model.yaml"

# Result sinks; an empty mapping leaves the BigQuery reporter on its defaults.
reporting:
  bigquery: {}
Loading